Merge branch 'master' into fix-sapi5-continuous-reading

nvaccess · Jan 10, 2025 · 23fae4e · 23fae4e
2 parents 65387a8 + 631156c
commit 23fae4e
Show file tree

Hide file tree

Showing 3 changed files with 166 additions and 132 deletions.
diff --git a/source/synthDrivers/mssp.py b/source/synthDrivers/mssp.py
@@ -9,6 +9,7 @@
 
 class SynthDriver(SynthDriver):
 	COM_CLASS = "speech.SPVoice"
+	CUSTOMSTREAM_COM_CLASS = "speech.SpCustomStream"
 
 	name = "mssp"
 	description = "Microsoft Speech Platform"
diff --git a/source/synthDrivers/sapi5.py b/source/synthDrivers/sapi5.py
@@ -4,14 +4,17 @@
 # This file is covered by the GNU General Public License.
 # See the file COPYING for more details.
 
-from typing import Optional
+from ctypes import POINTER, c_ubyte, c_wchar_p, cast, windll, _Pointer
 from enum import IntEnum
 import locale
 from collections import OrderedDict, deque
+from typing import TYPE_CHECKING
+from comInterfaces.SpeechLib import ISpEventSource, ISpNotifySource, ISpNotifySink
 import comtypes.client
-from comtypes import COMError
+from comtypes import COMError, COMObject, IUnknown, hresult, ReturnHRESULT
 import winreg
-import audioDucking
+import nvwave
+from objidl import _LARGE_INTEGER, _ULARGE_INTEGER, IStream
 from synthDriverHandler import SynthDriver, VoiceInfo, synthIndexReached, synthDoneSpeaking
 import config
 from logHandler import log
@@ -31,14 +34,6 @@
 )
 
 
-class SPAudioState(IntEnum):
-	# https://docs.microsoft.com/en-us/previous-versions/windows/desktop/ms720596(v=vs.85)
-	CLOSED = 0
-	STOP = 1
-	PAUSE = 2
-	RUN = 3
-
-
 class SpeechVoiceSpeakFlags(IntEnum):
 	# https://docs.microsoft.com/en-us/previous-versions/windows/desktop/ms720892(v=vs.85)
 	Async = 1
@@ -53,56 +48,148 @@ class SpeechVoiceEvents(IntEnum):
 	Bookmark = 16
 
 
-class SapiSink(object):
-	"""Handles SAPI event notifications.
-	See https://msdn.microsoft.com/en-us/library/ms723587(v=vs.85).aspx
+if TYPE_CHECKING:
+	LP_c_ubyte = _Pointer[c_ubyte]
+else:
+	LP_c_ubyte = POINTER(c_ubyte)
+
+
+class SynthDriverAudioStream(COMObject):
 	"""
+	Implements IStream to receive streamed-in audio data.
+	Should be wrapped in an SpCustomStream
+	(which also provides the wave format information),
+	then set as the AudioOutputStream.
+	"""
+
+	_com_interfaces_ = [IStream]
 
 	def __init__(self, synthRef: weakref.ReferenceType):
 		self.synthRef = synthRef
+		self._writtenBytes = 0
 
-	def StartStream(self, streamNum, pos):
+	def ISequentialStream_RemoteWrite(self, pv: LP_c_ubyte, cb: int) -> int:
+		"""This is called when SAPI wants to write (output) a wave data chunk.
+		:param pv: A pointer to the first wave data byte.
+		:param cb: The number of bytes to write.
+		:returns: The number of bytes written.
+		"""
 		synth = self.synthRef()
 		if synth is None:
-			log.debugWarning("Called StartStream method on SapiSink while driver is dead")
+			log.debugWarning("Called Write method on AudioStream while driver is dead")
+			return 0
+		if not synth.isSpeaking:
+			return 0
+		synth.player.feed(pv, cb)
+		self._writtenBytes += cb
+		return cb
+
+	def IStream_RemoteSeek(self, dlibMove: _LARGE_INTEGER, dwOrigin: int) -> _ULARGE_INTEGER:
+		"""This is called when SAPI wants to get the current stream position.
+		Seeking to another position is not supported.
+		:param dlibMove: The displacement to be added to the location indicated by the dwOrigin parameter.
+			Only 0 is supported.
+		:param dwOrigin: The origin for the displacement specified in dlibMove.
+			Only 1 (STREAM_SEEK_CUR) is supported.
+		:returns: The current stream position.
+		"""
+		if dwOrigin == 1 and dlibMove.QuadPart == 0:
+			# SAPI is querying the current position.
+			return _ULARGE_INTEGER(self._writtenBytes)
+		# Return E_NOTIMPL without logging an error.
+		raise ReturnHRESULT(hresult.E_NOTIMPL, None)
+
+	def IStream_Commit(self, grfCommitFlags: int):
+		"""This is called when MSSP wants to flush the written data.
+		Does nothing."""
+		pass
+
+
+class SapiSink(COMObject):
+	"""
+	Implements ISpNotifySink to handle SAPI event notifications.
+	Should be passed to ISpNotifySource::SetNotifySink().
+	Notifications will be sent on the original thread,
+	instead of being routed to the main thread.
+	"""
+
+	_com_interfaces_ = [ISpNotifySink]
+
+	def __init__(self, synthRef: weakref.ReferenceType):
+		self.synthRef = synthRef
+
+	def ISpNotifySink_Notify(self):
+		"""This is called when there's a new event notification.
+		Queued events will be retrieved."""
+		synth = self.synthRef()
+		if synth is None:
+			log.debugWarning("Called Notify method on SapiSink while driver is dead")
 			return
+		# Get all queued events
+		eventSource = synth.tts.QueryInterface(ISpEventSource)
+		while True:
+			# returned tuple: (event, numFetched)
+			eventTuple = eventSource.GetEvents(1)  # Get one event
+			if eventTuple[1] != 1:
+				break
+			event = eventTuple[0]
+			if event.eEventId == 1:  # SPEI_START_INPUT_STREAM
+				self.StartStream(event.ulStreamNum, event.ullAudioStreamOffset)
+			elif event.eEventId == 2:  # SPEI_END_INPUT_STREAM
+				self.EndStream(event.ulStreamNum, event.ullAudioStreamOffset)
+			elif event.eEventId == 4:  # SPEI_TTS_BOOKMARK
+				self.Bookmark(
+					event.ulStreamNum,
+					event.ullAudioStreamOffset,
+					cast(event.lParam, c_wchar_p).value,
+					event.wParam,
+				)
+			# free lParam
+			if event.elParamType == 1 or event.elParamType == 2:  # token or object
+				pUnk = cast(event.lParam, POINTER(IUnknown))
+				del pUnk
+			elif event.elParamType == 3 or event.elParamType == 4:  # pointer or string
+				windll.ole32.CoTaskMemFree(event.lParam)
+
+	def StartStream(self, streamNum: int, pos: int):
+		synth = self.synthRef()
 		# The stream has been started. Move the bookmark list to _streamBookmarks.
 		if streamNum in synth._streamBookmarksNew:
 			synth._streamBookmarks[streamNum] = synth._streamBookmarksNew[streamNum]
 			del synth._streamBookmarksNew[streamNum]
-		if synth._audioDucker:
-			if audioDucking._isDebug():
-				log.debug("Enabling audio ducking due to starting speech stream")
-			synth._audioDucker.enable()
+		synth.isSpeaking = True
 
-	def Bookmark(self, streamNum, pos, bookmark, bookmarkId):
+	def Bookmark(self, streamNum: int, pos: int, bookmark: str, bookmarkId: int):
 		synth = self.synthRef()
-		if synth is None:
-			log.debugWarning("Called Bookmark method on SapiSink while driver is dead")
+		if not synth.isSpeaking:
 			return
-		synthIndexReached.notify(synth=synth, index=bookmarkId)
-		# remove already triggered bookmarks
-		if streamNum in synth._streamBookmarks:
-			bookmarks = synth._streamBookmarks[streamNum]
-			while bookmarks:
-				if bookmarks.popleft() == bookmarkId:
-					break
+		# The Bookmark event is raised before the audio that comes after the bookmark.
+		# Queue an IndexReached callback with the player at this point.
+		synth.player.feed(None, 0, lambda: self.onIndexReached(streamNum, bookmarkId))
 
-	def EndStream(self, streamNum, pos):
+	def EndStream(self, streamNum: int, pos: int):
 		synth = self.synthRef()
-		if synth is None:
-			log.debugWarning("Called Bookmark method on EndStream while driver is dead")
-			return
 		# trigger all untriggered bookmarks
 		if streamNum in synth._streamBookmarks:
 			for bookmark in synth._streamBookmarks[streamNum]:
 				synthIndexReached.notify(synth=synth, index=bookmark)
 			del synth._streamBookmarks[streamNum]
+		synth.isSpeaking = False
+		synth.player.idle()
 		synthDoneSpeaking.notify(synth=synth)
-		if synth._audioDucker:
-			if audioDucking._isDebug():
-				log.debug("Disabling audio ducking due to speech stream end")
-			synth._audioDucker.disable()
+
+	def onIndexReached(self, streamNum: int, index: int):
+		synth = self.synthRef()
+		if synth is None:
+			log.debugWarning("Called onIndexReached method on SapiSink while driver is dead")
+			return
+		synthIndexReached.notify(synth=synth, index=index)
+		# remove already triggered bookmarks
+		if streamNum in synth._streamBookmarks:
+			bookmarks = synth._streamBookmarks[streamNum]
+			while bookmarks:
+				if bookmarks.popleft() == index:
+					break
 
 
 class SynthDriver(SynthDriver):
@@ -125,6 +212,7 @@ class SynthDriver(SynthDriver):
 	supportedNotifications = {synthIndexReached, synthDoneSpeaking}
 
 	COM_CLASS = "SAPI.SPVoice"
+	CUSTOMSTREAM_COM_CLASS = "SAPI.SpCustomStream"
 
 	name = "sapi5"
 	description = "Microsoft Speech API version 5"
@@ -138,27 +226,24 @@ def check(cls):
 		except:  # noqa: E722
 			return False
 
-	ttsAudioStream = (
-		None  #: Holds the ISPAudio interface for the current voice, to aid in stopping and pausing audio
-	)
-	_audioDucker: Optional[audioDucking.AudioDucker] = None
-
 	def __init__(self, _defaultVoiceToken=None):
 		"""
 		@param _defaultVoiceToken: an optional sapi voice token which should be used as the default voice (only useful for subclasses)
 		@type _defaultVoiceToken: ISpeechObjectToken
 		"""
-		if audioDucking.isAudioDuckingSupported():
-			self._audioDucker = audioDucking.AudioDucker()
 		self._pitch = 50
+		self.player = None
+		self.isSpeaking = False
 		self._initTts(_defaultVoiceToken)
 		# key = stream num, value = deque of bookmarks
 		self._streamBookmarks = dict()  # bookmarks in currently speaking streams
 		self._streamBookmarksNew = dict()  # bookmarks for streams that haven't been started
 
 	def terminate(self):
-		self._eventsConnection = None
 		self.tts = None
+		if self.player:
+			self.player.close()
+			self.player = None
 
 	def _getAvailableVoices(self):
 		voices = OrderedDict()
@@ -222,27 +307,31 @@ def _initTts(self, voice=None):
 			# Therefore, set the voice before setting the audio output.
 			# Otherwise, we will get poor speech quality in some cases.
 			self.tts.voice = voice
-		# SAPI5 automatically selects the system default audio device, so there's no use doing work if the user has selected to use the system default.
-		# Besides, our default value is not a valid endpoint ID.
-		if (outputDevice := config.conf["audio"]["outputDevice"]) != config.conf.getConfigValidation(
-			("audio", "outputDevice"),
-		).default:
-			for audioOutput in self.tts.GetAudioOutputs():
-				# SAPI's audio output IDs are registry keys. It seems that the final path segment is the endpoint ID.
-				if audioOutput.Id.endswith(outputDevice):
-					self.tts.audioOutput = audioOutput
-					break
-		self._eventsConnection = comtypes.client.GetEvents(self.tts, SapiSink(weakref.ref(self)))
+
+		self.tts.AudioOutput = self.tts.AudioOutput  # Reset the audio and its format parameters
+		fmt = self.tts.AudioOutputStream.Format
+		wfx = fmt.GetWaveFormatEx()
+		if self.player:
+			self.player.close()
+		self.player = nvwave.WavePlayer(
+			channels=wfx.Channels,
+			samplesPerSec=wfx.SamplesPerSec,
+			bitsPerSample=wfx.BitsPerSample,
+			outputDevice=config.conf["audio"]["outputDevice"],
+		)
+		audioStream = SynthDriverAudioStream(weakref.ref(self))
+		# Use SpCustomStream to wrap our IStream implementation and the correct wave format
+		customStream = comtypes.client.CreateObject(self.CUSTOMSTREAM_COM_CLASS)
+		customStream.BaseStream = audioStream
+		customStream.Format = fmt
+		self.tts.AudioOutputStream = customStream
+
+		# Set event notify sink
 		self.tts.EventInterests = (
 			SpeechVoiceEvents.StartInputStream | SpeechVoiceEvents.Bookmark | SpeechVoiceEvents.EndInputStream
 		)
-		from comInterfaces.SpeechLib import ISpAudio
-
-		try:
-			self.ttsAudioStream = self.tts.audioOutputStream.QueryInterface(ISpAudio)
-		except COMError:
-			log.debugWarning("SAPI5 voice does not support ISPAudio")
-			self.ttsAudioStream = None
+		notifySource = self.tts.QueryInterface(ISpNotifySource)
+		notifySource.SetNotifySink(SapiSink(weakref.ref(self)))
 
 	def _set_voice(self, value):
 		tokens = self._getVoiceTokens()
@@ -390,77 +479,17 @@ def outputTags():
 
 		text = "".join(textList)
 		flags = SpeechVoiceSpeakFlags.IsXML | SpeechVoiceSpeakFlags.Async
-		# Ducking should be complete before the synth starts producing audio.
-		# For this to happen, the speech method must block until ducking is complete.
-		# Ducking should be disabled when the synth is finished producing audio.
-		# Note that there may be calls to speak with a string that results in no audio,
-		# it is important that in this case the audio does not get stuck ducked.
-		# When there is no audio produced the startStream and endStream handlers are not called.
-		# To prevent audio getting stuck ducked, it is unducked at the end of speech.
-		# There are some known issues:
-		# - When there is no audio produced by the synth, a user may notice volume lowering (ducking) temporarily.
-		# - If the call to startStream handler is delayed significantly, users may notice a variation in volume
-		# (as ducking is disabled at the end of speak, and re-enabled when the startStream handler is called)
-
-		# A note on the synchronicity of components of this approach:
-		# SAPISink.StartStream event handler (callback):
-		# the synth speech is not blocked by this event callback.
-		# SAPISink.EndStream event handler (callback):
-		# assumed also to be async but not confirmed. Synchronicity is irrelevant to the current approach.
-		# AudioDucker.disable returns before the audio is completely unducked.
-		# AudioDucker.enable() ducking will complete before the function returns.
-		# It is not possible to "double duck the audio", calling twice yields the same result as calling once.
-		# AudioDucker class instances count the number of enables/disables,
-		# in order to unduck there must be no remaining enabled audio ducker instances.
-		# Due to this a temporary audio ducker is used around the call to speak.
-		# SAPISink.StartStream: Ducking here may allow the early speech to start before ducking is completed.
-		if audioDucking.isAudioDuckingSupported():
-			tempAudioDucker = audioDucking.AudioDucker()
-		else:
-			tempAudioDucker = None
-		if tempAudioDucker:
-			if audioDucking._isDebug():
-				log.debug("Enabling audio ducking due to speak call")
-			tempAudioDucker.enable()
-		try:
-			streamNum = self.tts.Speak(text, flags)
-			# When Speak returns, the previous stream may not have been ended.
-			# So the bookmark list is stored in another dict until this stream starts.
-			self._streamBookmarksNew[streamNum] = bookmarks
-		finally:
-			if tempAudioDucker:
-				if audioDucking._isDebug():
-					log.debug("Disabling audio ducking  after speak call")
-				tempAudioDucker.disable()
+		streamNum = self.tts.Speak(text, flags)
+		# When Speak returns, the previous stream may not have been ended.
+		# So the bookmark list is stored in another dict until this stream starts.
+		self._streamBookmarksNew[streamNum] = bookmarks
 
 	def cancel(self):
 		# SAPI5's default means of stopping speech can sometimes lag at end of speech, especially with Win8 / Win 10 Microsoft Voices.
-		# Therefore  instruct the underlying audio interface to stop first, before interupting and purging any remaining speech.
-		if self.ttsAudioStream:
-			self.ttsAudioStream.setState(SPAudioState.STOP, 0)
+		# Therefore instruct the audio player to stop first, before interrupting and purging any remaining speech.
+		self.isSpeaking = False
+		self.player.stop()
 		self.tts.Speak(None, SpeechVoiceSpeakFlags.Async | SpeechVoiceSpeakFlags.PurgeBeforeSpeak)
-		if self._audioDucker:
-			if audioDucking._isDebug():
-				log.debug("Disabling audio ducking due to setting output audio state to stop")
-			self._audioDucker.disable()
 
 	def pause(self, switch: bool):
-		# SAPI5's default means of pausing in most cases is either extremely slow
-		# (e.g. takes more than half a second) or does not work at all.
-		# Therefore instruct the underlying audio interface to pause instead.
-		if self.ttsAudioStream:
-			oldState = self.ttsAudioStream.GetStatus().State
-			if switch and oldState == SPAudioState.RUN:
-				# pausing
-				if self._audioDucker:
-					if audioDucking._isDebug():
-						log.debug("Disabling audio ducking due to setting output audio state to pause")
-					self._audioDucker.disable()
-				self.ttsAudioStream.setState(SPAudioState.PAUSE, 0)
-			elif not switch and oldState == SPAudioState.PAUSE:
-				# unpausing
-				if self._audioDucker:
-					if audioDucking._isDebug():
-						log.debug("Enabling audio ducking due to setting output audio state to run")
-					self._audioDucker.enable()
-				self.ttsAudioStream.setState(SPAudioState.RUN, 0)
+		self.player.pause(switch)