| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171 | #include "captions-mssapi.hpp"#define do_log(type, format, ...) \	blog(type, "[Captions] " format, ##__VA_ARGS__)#define error(format, ...) do_log(LOG_ERROR, format, ##__VA_ARGS__)#define debug(format, ...) do_log(LOG_DEBUG, format, ##__VA_ARGS__)mssapi_captions::mssapi_captions(captions_cb callback, const std::string &lang)try : captions_handler(callback, AUDIO_FORMAT_16BIT, 16000) {	HRESULT hr;	std::wstring wlang;	wlang.resize(lang.size());	for (size_t i = 0; i < lang.size(); i++)		wlang[i] = (wchar_t)lang[i];	LCID lang_id = LocaleNameToLCID(wlang.c_str(), 0);	wchar_t lang_str[32];	_snwprintf(lang_str, 31, L"language=%x", (int)lang_id);	stop = CreateEvent(nullptr, false, false, nullptr);	if (!stop.Valid())		throw "Failed to create event";	hr = SpFindBestToken(SPCAT_RECOGNIZERS, lang_str, nullptr, &token);	if (FAILED(hr))		throw HRError("SpFindBestToken failed", hr);	hr = CoCreateInstance(CLSID_SpInprocRecognizer, nullptr, CLSCTX_ALL,			      __uuidof(ISpRecognizer), (void **)&recognizer);	if (FAILED(hr))		throw HRError("CoCreateInstance for recognizer failed", hr);	hr = recognizer->SetRecognizer(token);	if (FAILED(hr))		throw HRError("SetRecognizer failed", hr);	hr = recognizer->SetRecoState(SPRST_INACTIVE);	if (FAILED(hr))		throw HRError("SetRecoState(SPRST_INACTIVE) failed", hr);	hr = recognizer->CreateRecoContext(&context);	if (FAILED(hr))		throw HRError("CreateRecoContext failed", hr);	ULONGLONG interest = SPFEI(SPEI_RECOGNITION) |			     SPFEI(SPEI_END_SR_STREAM);	hr = context->SetInterest(interest, interest);	if (FAILED(hr))		throw HRError("SetInterest failed", hr);	hr = context->SetNotifyWin32Event();	if (FAILED(hr))		throw HRError("SetNotifyWin32Event", hr);	notify = context->GetNotifyEventHandle();	if (notify == INVALID_HANDLE_VALUE)		throw HRError("GetNotifyEventHandle failed", E_NOINTERFACE);	size_t sample_rate = audio_output_get_sample_rate(obs_get_audio());	audio = new CaptionStream((DWORD)sample_rate, this);	audio->Release();	hr = recognizer->SetInput(audio, false);	if (FAILED(hr))		throw HRError("SetInput failed", hr);	hr = context->CreateGrammar(1, &grammar);	if (FAILED(hr))		throw HRError("CreateGrammar failed", hr);	hr = grammar->LoadDictation(nullptr, SPLO_STATIC);	if (FAILED(hr))		throw HRError("LoadDictation failed", hr);	try {		t = std::thread([this]() { main_thread(); });	} catch (...) {		throw "Failed to create thread";	}} catch (const char *err) {	blog(LOG_WARNING, "%s: %s", __FUNCTION__, err);	throw CAPTIONS_ERROR_GENERIC_FAIL;} catch (HRError err) {	blog(LOG_WARNING, "%s: %s (%lX)", __FUNCTION__, err.str, err.hr);	throw CAPTIONS_ERROR_GENERIC_FAIL;}mssapi_captions::~mssapi_captions(){	if (t.joinable()) {		SetEvent(stop);		t.join();	}}void mssapi_captions::main_thread()try {	HRESULT hr;	os_set_thread_name(__FUNCTION__);	hr = grammar->SetDictationState(SPRS_ACTIVE);	if (FAILED(hr))		throw HRError("SetDictationState failed", hr);	hr = recognizer->SetRecoState(SPRST_ACTIVE);	if (FAILED(hr))		throw HRError("SetRecoState(SPRST_ACTIVE) failed", hr);	HANDLE events[] = {notify, stop};	started = true;	for (;;) {		DWORD ret = WaitForMultipleObjects(2, events, false, INFINITE);		if (ret != WAIT_OBJECT_0)			break;		CSpEvent event;		bool exit = false;		while (event.GetFrom(context) == S_OK) {			if (event.eEventId == SPEI_RECOGNITION) {				ISpRecoResult *result = event.RecoResult();				CoTaskMemPtr<wchar_t> text;				hr = result->GetText((ULONG)-1, (ULONG)-1, true,						     &text, nullptr);				if (FAILED(hr))					continue;				char text_utf8[512];				os_wcs_to_utf8(text, 0, text_utf8, 512);				callback(text_utf8);				blog(LOG_DEBUG, "\"%s\"", text_utf8);			} else if (event.eEventId == SPEI_END_SR_STREAM) {				exit = true;				break;			}		}		if (exit)			break;	}	audio->Stop();} catch (HRError err) {	blog(LOG_WARNING, "%s failed: %s (%lX)", __FUNCTION__, err.str, err.hr);}void mssapi_captions::pcm_data(const void *data, size_t frames){	if (started)		audio->PushAudio(data, frames);}captions_handler_info mssapi_info = {	[]() -> std::string { return "Microsoft Speech-to-Text"; },	[](captions_cb cb, const std::string &lang) -> captions_handler * {		return new mssapi_captions(cb, lang);	}};
 |