Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make SAPI5 & MSSP voices use WavePlayer (WASAPI) #17592

Merged
merged 11 commits into from
Jan 10, 2025
1 change: 1 addition & 0 deletions source/synthDrivers/mssp.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

class SynthDriver(SynthDriver):
COM_CLASS = "speech.SPVoice"
CUSTOMSTREAM_COM_CLASS = "speech.SpCustomStream"

name = "mssp"
description = "Microsoft Speech Platform"
242 changes: 130 additions & 112 deletions source/synthDrivers/sapi5.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,16 @@
# This file is covered by the GNU General Public License.
# See the file COPYING for more details.

from typing import Optional
from ctypes import POINTER, c_wchar_p, cast, windll
from enum import IntEnum
import locale
from collections import OrderedDict
from comInterfaces.SpeechLib import ISpEventSource, ISpNotifySource, ISpNotifySink
import comtypes.client
from comtypes import COMError
from comtypes import COMError, COMObject, IUnknown, hresult, ReturnHRESULT
import winreg
import audioDucking
import nvwave
from objidl import _ULARGE_INTEGER, IStream
from synthDriverHandler import SynthDriver, VoiceInfo, synthIndexReached, synthDoneSpeaking
import config
from logHandler import log
Expand Down Expand Up @@ -53,41 +55,115 @@ class SpeechVoiceEvents(IntEnum):
Bookmark = 16


class SapiSink(object):
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
"""Handles SAPI event notifications.
See https://msdn.microsoft.com/en-us/library/ms723587(v=vs.85).aspx
class SynthDriverAudioStream(COMObject):
"""
Implements IStream to receive streamed-in audio data.
Should be wrapped in an SpCustomStream
(which also provides the wave format information),
then set as the AudioOutputStream.
"""

_com_interfaces_ = [IStream]

def __init__(self, synthRef: weakref.ReferenceType):
self.synthRef = synthRef
self._writtenBytes = 0

def StartStream(self, streamNum, pos):
def ISequentialStream_RemoteWrite(self, pv, cb):
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
# out: pcbWritten
gexgd0419 marked this conversation as resolved.
Show resolved Hide resolved
synth = self.synthRef()
if synth is None:
log.debugWarning("Called StartStream method on SapiSink while driver is dead")
log.debugWarning("Called Write method on AudioStream while driver is dead")
return 0
if not synth.isSpeaking:
return 0
synth.player.feed(pv, cb)
self._writtenBytes += cb
return cb

def IStream_RemoteSeek(self, dlibMove, dwOrigin):
gexgd0419 marked this conversation as resolved.
Show resolved Hide resolved
# out: plibNewPosition
gexgd0419 marked this conversation as resolved.
Show resolved Hide resolved
if dwOrigin == 1 and dlibMove.QuadPart == 0:
# SAPI is querying the current position.
return _ULARGE_INTEGER(self._writtenBytes)
# Return E_NOTIMPL without logging an error.
raise ReturnHRESULT(hresult.E_NOTIMPL, None)

def IStream_Commit(self, grfCommitFlags):
gexgd0419 marked this conversation as resolved.
Show resolved Hide resolved
# SAPI5 voices don't need this method, but MSSP voices do,
# which use this method to "flush" written data.
# Here we do nothing.
pass


class SapiSink(COMObject):
"""
Implements ISpNotifySink to handle SAPI event notifications.
Should be passed to ISpNotifySource::SetNotifySink().
Notifications will be sent on the original thread,
instead of being routed to the main thread.
"""

_com_interfaces_ = [ISpNotifySink]

def __init__(self, synthRef: weakref.ReferenceType):
self.synthRef = synthRef

def ISpNotifySink_Notify(self):
synth = self.synthRef()
if synth is None:
log.debugWarning("Called Notify method on SapiSink while driver is dead")
return
if synth._audioDucker:
if audioDucking._isDebug():
log.debug("Enabling audio ducking due to starting speech stream")
synth._audioDucker.enable()
# Get all queued events
eventSource = synth.tts.QueryInterface(ISpEventSource)
while True:
# returned tuple: (event, numFetched)
eventTuple = eventSource.GetEvents(1) # Get one event
if eventTuple[1] != 1:
break
event = eventTuple[0]
if event.eEventId == 1: # SPEI_START_INPUT_STREAM
self.StartStream(event.ulStreamNum, event.ullAudioStreamOffset)
elif event.eEventId == 2: # SPEI_END_INPUT_STREAM
self.EndStream(event.ulStreamNum, event.ullAudioStreamOffset)
elif event.eEventId == 4: # SPEI_TTS_BOOKMARK
self.Bookmark(
event.ulStreamNum,
event.ullAudioStreamOffset,
cast(event.lParam, c_wchar_p).value,
event.wParam,
)
# free lParam
if event.elParamType == 1 or event.elParamType == 2: # token or object
pUnk = cast(event.lParam, POINTER(IUnknown))
del pUnk
elif event.elParamType == 3 or event.elParamType == 4: # pointer or string
windll.ole32.CoTaskMemFree(event.lParam)

def StartStream(self, streamNum, pos):
gexgd0419 marked this conversation as resolved.
Show resolved Hide resolved
synth = self.synthRef()
synth.isSpeaking = True

def Bookmark(self, streamNum, pos, bookmark, bookmarkId):
gexgd0419 marked this conversation as resolved.
Show resolved Hide resolved
synth = self.synthRef()
if synth is None:
log.debugWarning("Called Bookmark method on SapiSink while driver is dead")
if not synth.isSpeaking:
return
synthIndexReached.notify(synth=synth, index=bookmarkId)
# Bookmark event is raised before the audio after that point.
# Queue an IndexReached event at this point.
synth.player.feed(None, 0, lambda: self.onIndexReached(bookmarkId))

def EndStream(self, streamNum, pos):
gexgd0419 marked this conversation as resolved.
Show resolved Hide resolved
synth = self.synthRef()
synth.isSpeaking = False
synth.player.idle()
synthDoneSpeaking.notify(synth=synth)

def onIndexReached(self, index):
gexgd0419 marked this conversation as resolved.
Show resolved Hide resolved
synth = self.synthRef()
if synth is None:
log.debugWarning("Called Bookmark method on EndStream while driver is dead")
log.debugWarning("Called onIndexReached method on SapiSink while driver is dead")
return
synthDoneSpeaking.notify(synth=synth)
if synth._audioDucker:
if audioDucking._isDebug():
log.debug("Disabling audio ducking due to speech stream end")
synth._audioDucker.disable()
synthIndexReached.notify(synth=synth, index=index)


class SynthDriver(SynthDriver):
Expand All @@ -110,6 +186,7 @@ class SynthDriver(SynthDriver):
supportedNotifications = {synthIndexReached, synthDoneSpeaking}

COM_CLASS = "SAPI.SPVoice"
CUSTOMSTREAM_COM_CLASS = "SAPI.SpCustomStream"

name = "sapi5"
description = "Microsoft Speech API version 5"
Expand All @@ -123,24 +200,21 @@ def check(cls):
except: # noqa: E722
return False

ttsAudioStream = (
gexgd0419 marked this conversation as resolved.
Show resolved Hide resolved
None #: Holds the ISPAudio interface for the current voice, to aid in stopping and pausing audio
)
_audioDucker: Optional[audioDucking.AudioDucker] = None

def __init__(self, _defaultVoiceToken=None):
"""
@param _defaultVoiceToken: an optional sapi voice token which should be used as the default voice (only useful for subclasses)
@type _defaultVoiceToken: ISpeechObjectToken
"""
if audioDucking.isAudioDuckingSupported():
self._audioDucker = audioDucking.AudioDucker()
self._pitch = 50
self.player = None
self.isSpeaking = False
self._initTts(_defaultVoiceToken)

def terminate(self):
self._eventsConnection = None
self.tts = None
if self.player:
self.player.close()
self.player = None

def _getAvailableVoices(self):
voices = OrderedDict()
Expand Down Expand Up @@ -204,27 +278,31 @@ def _initTts(self, voice=None):
# Therefore, set the voice before setting the audio output.
# Otherwise, we will get poor speech quality in some cases.
self.tts.voice = voice
# SAPI5 automatically selects the system default audio device, so there's no use doing work if the user has selected to use the system default.
# Besides, our default value is not a valid endpoint ID.
if (outputDevice := config.conf["audio"]["outputDevice"]) != config.conf.getConfigValidation(
("audio", "outputDevice"),
).default:
for audioOutput in self.tts.GetAudioOutputs():
# SAPI's audio output IDs are registry keys. It seems that the final path segment is the endpoint ID.
if audioOutput.Id.endswith(outputDevice):
self.tts.audioOutput = audioOutput
break
self._eventsConnection = comtypes.client.GetEvents(self.tts, SapiSink(weakref.ref(self)))

self.tts.AudioOutput = self.tts.AudioOutput # Reset the audio and its format parameters
fmt = self.tts.AudioOutputStream.Format
wfx = fmt.GetWaveFormatEx()
if self.player:
self.player.close()
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
self.player = nvwave.WavePlayer(
channels=wfx.Channels,
samplesPerSec=wfx.SamplesPerSec,
bitsPerSample=wfx.BitsPerSample,
outputDevice=config.conf["audio"]["outputDevice"],
)
audioStream = SynthDriverAudioStream(weakref.ref(self))
# Use SpCustomStream to wrap our IStream implementation and the correct wave format
customStream = comtypes.client.CreateObject(self.CUSTOMSTREAM_COM_CLASS)
customStream.BaseStream = audioStream
customStream.Format = fmt
self.tts.AudioOutputStream = customStream

# Set event notify sink
self.tts.EventInterests = (
SpeechVoiceEvents.StartInputStream | SpeechVoiceEvents.Bookmark | SpeechVoiceEvents.EndInputStream
)
from comInterfaces.SpeechLib import ISpAudio

try:
self.ttsAudioStream = self.tts.audioOutputStream.QueryInterface(ISpAudio)
except COMError:
log.debugWarning("SAPI5 voice does not support ISPAudio")
self.ttsAudioStream = None
notifySource = self.tts.QueryInterface(ISpNotifySource)
notifySource.SetNotifySink(SapiSink(weakref.ref(self)))

def _set_voice(self, value):
tokens = self._getVoiceTokens()
Expand Down Expand Up @@ -370,74 +448,14 @@ def outputTags():

text = "".join(textList)
flags = SpeechVoiceSpeakFlags.IsXML | SpeechVoiceSpeakFlags.Async
# Ducking should be complete before the synth starts producing audio.
# For this to happen, the speech method must block until ducking is complete.
# Ducking should be disabled when the synth is finished producing audio.
# Note that there may be calls to speak with a string that results in no audio,
# it is important that in this case the audio does not get stuck ducked.
# When there is no audio produced the startStream and endStream handlers are not called.
# To prevent audio getting stuck ducked, it is unducked at the end of speech.
# There are some known issues:
# - When there is no audio produced by the synth, a user may notice volume lowering (ducking) temporarily.
# - If the call to startStream handler is delayed significantly, users may notice a variation in volume
# (as ducking is disabled at the end of speak, and re-enabled when the startStream handler is called)

# A note on the synchronicity of components of this approach:
# SAPISink.StartStream event handler (callback):
# the synth speech is not blocked by this event callback.
# SAPISink.EndStream event handler (callback):
# assumed also to be async but not confirmed. Synchronicity is irrelevant to the current approach.
# AudioDucker.disable returns before the audio is completely unducked.
# AudioDucker.enable() ducking will complete before the function returns.
# It is not possible to "double duck the audio", calling twice yields the same result as calling once.
# AudioDucker class instances count the number of enables/disables,
# in order to unduck there must be no remaining enabled audio ducker instances.
# Due to this a temporary audio ducker is used around the call to speak.
# SAPISink.StartStream: Ducking here may allow the early speech to start before ducking is completed.
if audioDucking.isAudioDuckingSupported():
tempAudioDucker = audioDucking.AudioDucker()
else:
tempAudioDucker = None
if tempAudioDucker:
if audioDucking._isDebug():
log.debug("Enabling audio ducking due to speak call")
tempAudioDucker.enable()
try:
self.tts.Speak(text, flags)
finally:
if tempAudioDucker:
if audioDucking._isDebug():
log.debug("Disabling audio ducking after speak call")
tempAudioDucker.disable()
self.tts.Speak(text, flags)

def cancel(self):
# SAPI5's default means of stopping speech can sometimes lag at end of speech, especially with Win8 / Win 10 Microsoft Voices.
# Therefore instruct the underlying audio interface to stop first, before interrupting and purging any remaining speech.
if self.ttsAudioStream:
self.ttsAudioStream.setState(SPAudioState.STOP, 0)
# Therefore instruct the audio player to stop first, before interrupting and purging any remaining speech.
self.isSpeaking = False
self.player.stop()
self.tts.Speak(None, SpeechVoiceSpeakFlags.Async | SpeechVoiceSpeakFlags.PurgeBeforeSpeak)
if self._audioDucker:
if audioDucking._isDebug():
log.debug("Disabling audio ducking due to setting output audio state to stop")
self._audioDucker.disable()

def pause(self, switch: bool):
# SAPI5's default means of pausing in most cases is either extremely slow
# (e.g. takes more than half a second) or does not work at all.
# Therefore instruct the underlying audio interface to pause instead.
if self.ttsAudioStream:
oldState = self.ttsAudioStream.GetStatus().State
if switch and oldState == SPAudioState.RUN:
gexgd0419 marked this conversation as resolved.
Show resolved Hide resolved
# pausing
if self._audioDucker:
if audioDucking._isDebug():
log.debug("Disabling audio ducking due to setting output audio state to pause")
self._audioDucker.disable()
self.ttsAudioStream.setState(SPAudioState.PAUSE, 0)
elif not switch and oldState == SPAudioState.PAUSE:
# unpausing
if self._audioDucker:
if audioDucking._isDebug():
log.debug("Enabling audio ducking due to setting output audio state to run")
self._audioDucker.enable()
self.ttsAudioStream.setState(SPAudioState.RUN, 0)
self.player.pause(switch)
1 change: 1 addition & 0 deletions user_docs/en/changes.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ To use this feature, "allow NVDA to control the volume of other applications" mu
* Updated CLDR to version 46.0. (#17484, @OzancanKaratas)
* Short versions of the most commonly used command line options have been added: `-d` for `--disable-addons` and `-n` for `--lang`.
Prefix matching on command line flags, e.g. using `--di` for `--disable-addons` is no longer supported. (#11644, @CyrilleB79)
* Microsoft Speech API version 5 and Microsoft Speech Platform voices now use WASAPI for audio output, which may improve the responsiveness of those voices. (#13284, @gexgd0419)

### Bug Fixes

Expand Down