Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make SAPI5 & MSSP voices use WavePlayer (WASAPI) #17592

Merged
merged 11 commits into from
Jan 10, 2025
1 change: 1 addition & 0 deletions source/synthDrivers/mssp.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

class SynthDriver(SynthDriver):
COM_CLASS = "speech.SPVoice"
CUSTOMSTREAM_COM_CLASS = "speech.SpCustomStream"

name = "mssp"
description = "Microsoft Speech Platform"
242 changes: 130 additions & 112 deletions source/synthDrivers/sapi5.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,16 @@
# This file is covered by the GNU General Public License.
# See the file COPYING for more details.

from typing import Optional
from ctypes import POINTER, c_wchar_p, cast, windll
from enum import IntEnum
import locale
from collections import OrderedDict
from comInterfaces.SpeechLib import ISpEventSource, ISpNotifySource, ISpNotifySink
import comtypes.client
from comtypes import COMError
from comtypes import COMError, COMObject, IUnknown, hresult, ReturnHRESULT
import winreg
import audioDucking
import nvwave
from objidl import _ULARGE_INTEGER, IStream
from synthDriverHandler import SynthDriver, VoiceInfo, synthIndexReached, synthDoneSpeaking
import config
from logHandler import log
Expand Down Expand Up @@ -53,41 +55,115 @@ class SpeechVoiceEvents(IntEnum):
Bookmark = 16


class SapiSink(object):
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
"""Handles SAPI event notifications.
See https://msdn.microsoft.com/en-us/library/ms723587(v=vs.85).aspx
class SynthDriverAudioStream(COMObject):
"""
Implements IStream to receive streamed-in audio data.
Should be wrapped in an SpCustomStream
(which also provides the wave format information),
then set as the AudioOutputStream.
"""

_com_interfaces_ = [IStream]

def __init__(self, synthRef: weakref.ReferenceType):
self.synthRef = synthRef
self._writtenBytes = 0

def StartStream(self, streamNum, pos):
def ISequentialStream_RemoteWrite(self, pv, cb):
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
# out: pcbWritten
gexgd0419 marked this conversation as resolved.
Show resolved Hide resolved
synth = self.synthRef()
if synth is None:
log.debugWarning("Called StartStream method on SapiSink while driver is dead")
log.debugWarning("Called Write method on AudioStream while driver is dead")
return 0
if not synth.isSpeaking:
return 0
synth.player.feed(pv, cb)
self._writtenBytes += cb
return cb

def IStream_RemoteSeek(self, dlibMove, dwOrigin):
gexgd0419 marked this conversation as resolved.
Show resolved Hide resolved
# out: plibNewPosition
gexgd0419 marked this conversation as resolved.
Show resolved Hide resolved
if dwOrigin == 1 and dlibMove.QuadPart == 0:
# SAPI is querying the current position.
return _ULARGE_INTEGER(self._writtenBytes)
# Return E_NOTIMPL without logging an error.
raise ReturnHRESULT(hresult.E_NOTIMPL, None)

def IStream_Commit(self, grfCommitFlags):
gexgd0419 marked this conversation as resolved.
Show resolved Hide resolved
# SAPI5 voices don't need this method, but MSSP voices do,
# which use this method to "flush" written data.
# Here we do nothing.
pass


class SapiSink(COMObject):
"""
Implements ISpNotifySink to handle SAPI event notifications.
Should be passed to ISpNotifySource::SetNotifySink().
Notifications will be sent on the original thread,
instead of being routed to the main thread.
"""

_com_interfaces_ = [ISpNotifySink]

def __init__(self, synthRef: weakref.ReferenceType):
self.synthRef = synthRef

def ISpNotifySink_Notify(self):
synth = self.synthRef()
if synth is None:
log.debugWarning("Called Notify method on SapiSink while driver is dead")
return
if synth._audioDucker:
if audioDucking._isDebug():
log.debug("Enabling audio ducking due to starting speech stream")
synth._audioDucker.enable()
# Get all queued events
eventSource = synth.tts.QueryInterface(ISpEventSource)
while True:
# returned tuple: (event, numFetched)
eventTuple = eventSource.GetEvents(1) # Get one event
if eventTuple[1] != 1:
break
event = eventTuple[0]
if event.eEventId == 1: # SPEI_START_INPUT_STREAM
self.StartStream(event.ulStreamNum, event.ullAudioStreamOffset)
elif event.eEventId == 2: # SPEI_END_INPUT_STREAM
self.EndStream(event.ulStreamNum, event.ullAudioStreamOffset)
elif event.eEventId == 4: # SPEI_TTS_BOOKMARK
self.Bookmark(
event.ulStreamNum,
event.ullAudioStreamOffset,
cast(event.lParam, c_wchar_p).value,
event.wParam,
)
# free lParam
if event.elParamType == 1 or event.elParamType == 2: # token or object
pUnk = cast(event.lParam, POINTER(IUnknown))
del pUnk
elif event.elParamType == 3 or event.elParamType == 4: # pointer or string
windll.ole32.CoTaskMemFree(event.lParam)

def StartStream(self, streamNum, pos):
gexgd0419 marked this conversation as resolved.
Show resolved Hide resolved
synth = self.synthRef()
synth.isSpeaking = True

def Bookmark(self, streamNum, pos, bookmark, bookmarkId):
gexgd0419 marked this conversation as resolved.
Show resolved Hide resolved
synth = self.synthRef()
if synth is None:
log.debugWarning("Called Bookmark method on SapiSink while driver is dead")
if not synth.isSpeaking:
return
synthIndexReached.notify(synth=synth, index=bookmarkId)
# Bookmark event is raised before the audio after that point.
# Queue an IndexReached event at this point.
synth.player.feed(None, 0, lambda: self.onIndexReached(bookmarkId))

def EndStream(self, streamNum, pos):
gexgd0419 marked this conversation as resolved.
Show resolved Hide resolved
synth = self.synthRef()
synth.isSpeaking = False
synth.player.idle()
synthDoneSpeaking.notify(synth=synth)

def onIndexReached(self, index):
gexgd0419 marked this conversation as resolved.
Show resolved Hide resolved
synth = self.synthRef()
if synth is None:
log.debugWarning("Called Bookmark method on EndStream while driver is dead")
log.debugWarning("Called onIndexReached method on SapiSink while driver is dead")
return
synthDoneSpeaking.notify(synth=synth)
if synth._audioDucker:
if audioDucking._isDebug():
log.debug("Disabling audio ducking due to speech stream end")
synth._audioDucker.disable()
synthIndexReached.notify(synth=synth, index=index)


class SynthDriver(SynthDriver):
Expand All @@ -110,6 +186,7 @@ class SynthDriver(SynthDriver):
supportedNotifications = {synthIndexReached, synthDoneSpeaking}

COM_CLASS = "SAPI.SPVoice"
CUSTOMSTREAM_COM_CLASS = "SAPI.SpCustomStream"

name = "sapi5"
description = "Microsoft Speech API version 5"
Expand All @@ -123,24 +200,21 @@ def check(cls):
except: # noqa: E722
return False

ttsAudioStream = (
gexgd0419 marked this conversation as resolved.
Show resolved Hide resolved
None #: Holds the ISPAudio interface for the current voice, to aid in stopping and pausing audio
)
_audioDucker: Optional[audioDucking.AudioDucker] = None

def __init__(self, _defaultVoiceToken=None):
"""
@param _defaultVoiceToken: an optional sapi voice token which should be used as the default voice (only useful for subclasses)
@type _defaultVoiceToken: ISpeechObjectToken
"""
if audioDucking.isAudioDuckingSupported():
self._audioDucker = audioDucking.AudioDucker()
self._pitch = 50
self.player = None
self.isSpeaking = False
self._initTts(_defaultVoiceToken)

def terminate(self):
self._eventsConnection = None
self.tts = None
if self.player:
self.player.close()
self.player = None

def _getAvailableVoices(self):
voices = OrderedDict()
Expand Down Expand Up @@ -204,27 +278,31 @@ def _initTts(self, voice=None):
# Therefore, set the voice before setting the audio output.
# Otherwise, we will get poor speech quality in some cases.
self.tts.voice = voice
# SAPI5 automatically selects the system default audio device, so there's no use doing work if the user has selected to use the system default.
# Besides, our default value is not a valid endpoint ID.
if (outputDevice := config.conf["audio"]["outputDevice"]) != config.conf.getConfigValidation(
("audio", "outputDevice"),
).default:
for audioOutput in self.tts.GetAudioOutputs():
# SAPI's audio output IDs are registry keys. It seems that the final path segment is the endpoint ID.
if audioOutput.Id.endswith(outputDevice):
self.tts.audioOutput = audioOutput
break
self._eventsConnection = comtypes.client.GetEvents(self.tts, SapiSink(weakref.ref(self)))

self.tts.AudioOutput = self.tts.AudioOutput # Reset the audio and its format parameters
fmt = self.tts.AudioOutputStream.Format
wfx = fmt.GetWaveFormatEx()
if self.player:
self.player.close()
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
self.player = nvwave.WavePlayer(
channels=wfx.Channels,
samplesPerSec=wfx.SamplesPerSec,
bitsPerSample=wfx.BitsPerSample,
outputDevice=config.conf["audio"]["outputDevice"],
)
audioStream = SynthDriverAudioStream(weakref.ref(self))
# Use SpCustomStream to wrap our IStream implementation and the correct wave format
customStream = comtypes.client.CreateObject(self.CUSTOMSTREAM_COM_CLASS)
customStream.BaseStream = audioStream
customStream.Format = fmt
self.tts.AudioOutputStream = customStream

# Set event notify sink
self.tts.EventInterests = (
SpeechVoiceEvents.StartInputStream | SpeechVoiceEvents.Bookmark | SpeechVoiceEvents.EndInputStream
)
from comInterfaces.SpeechLib import ISpAudio

try:
self.ttsAudioStream = self.tts.audioOutputStream.QueryInterface(ISpAudio)
except COMError:
log.debugWarning("SAPI5 voice does not support ISPAudio")
self.ttsAudioStream = None
notifySource = self.tts.QueryInterface(ISpNotifySource)
notifySource.SetNotifySink(SapiSink(weakref.ref(self)))

def _set_voice(self, value):
tokens = self._getVoiceTokens()
Expand Down Expand Up @@ -370,74 +448,14 @@ def outputTags():

text = "".join(textList)
flags = SpeechVoiceSpeakFlags.IsXML | SpeechVoiceSpeakFlags.Async
# Ducking should be complete before the synth starts producing audio.
# For this to happen, the speech method must block until ducking is complete.
# Ducking should be disabled when the synth is finished producing audio.
# Note that there may be calls to speak with a string that results in no audio,
# it is important that in this case the audio does not get stuck ducked.
# When there is no audio produced the startStream and endStream handlers are not called.
# To prevent audio getting stuck ducked, it is unducked at the end of speech.
# There are some known issues:
# - When there is no audio produced by the synth, a user may notice volume lowering (ducking) temporarily.
# - If the call to startStream handler is delayed significantly, users may notice a variation in volume
# (as ducking is disabled at the end of speak, and re-enabled when the startStream handler is called)

# A note on the synchronicity of components of this approach:
# SAPISink.StartStream event handler (callback):
# the synth speech is not blocked by this event callback.
# SAPISink.EndStream event handler (callback):
# assumed also to be async but not confirmed. Synchronicity is irrelevant to the current approach.
# AudioDucker.disable returns before the audio is completely unducked.
# AudioDucker.enable() ducking will complete before the function returns.
# It is not possible to "double duck the audio", calling twice yields the same result as calling once.
# AudioDucker class instances count the number of enables/disables,
# in order to unduck there must be no remaining enabled audio ducker instances.
# Due to this a temporary audio ducker is used around the call to speak.
# SAPISink.StartStream: Ducking here may allow the early speech to start before ducking is completed.
if audioDucking.isAudioDuckingSupported():
tempAudioDucker = audioDucking.AudioDucker()
else:
tempAudioDucker = None
if tempAudioDucker:
if audioDucking._isDebug():
log.debug("Enabling audio ducking due to speak call")
tempAudioDucker.enable()
try:
self.tts.Speak(text, flags)
finally:
if tempAudioDucker:
if audioDucking._isDebug():
log.debug("Disabling audio ducking after speak call")
tempAudioDucker.disable()
self.tts.Speak(text, flags)

def cancel(self):
# SAPI5's default means of stopping speech can sometimes lag at end of speech, especially with Win8 / Win 10 Microsoft Voices.
# Therefore instruct the underlying audio interface to stop first, before interrupting and purging any remaining speech.
if self.ttsAudioStream:
self.ttsAudioStream.setState(SPAudioState.STOP, 0)
# Therefore instruct the audio player to stop first, before interrupting and purging any remaining speech.
self.isSpeaking = False
self.player.stop()
self.tts.Speak(None, SpeechVoiceSpeakFlags.Async | SpeechVoiceSpeakFlags.PurgeBeforeSpeak)
if self._audioDucker:
if audioDucking._isDebug():
log.debug("Disabling audio ducking due to setting output audio state to stop")
self._audioDucker.disable()

def pause(self, switch: bool):
# SAPI5's default means of pausing in most cases is either extremely slow
# (e.g. takes more than half a second) or does not work at all.
# Therefore instruct the underlying audio interface to pause instead.
if self.ttsAudioStream:
oldState = self.ttsAudioStream.GetStatus().State
if switch and oldState == SPAudioState.RUN:
gexgd0419 marked this conversation as resolved.
Show resolved Hide resolved
# pausing
if self._audioDucker:
if audioDucking._isDebug():
log.debug("Disabling audio ducking due to setting output audio state to pause")
self._audioDucker.disable()
self.ttsAudioStream.setState(SPAudioState.PAUSE, 0)
elif not switch and oldState == SPAudioState.PAUSE:
# unpausing
if self._audioDucker:
if audioDucking._isDebug():
log.debug("Enabling audio ducking due to setting output audio state to run")
self._audioDucker.enable()
self.ttsAudioStream.setState(SPAudioState.RUN, 0)
self.player.pause(switch)
1 change: 1 addition & 0 deletions user_docs/en/changes.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ To use this feature, "allow NVDA to control the volume of other applications" mu
* Updated CLDR to version 46.0. (#17484, @OzancanKaratas)
* Short versions of the most commonly used command line options have been added: `-d` for `--disable-addons` and `-n` for `--lang`.
Prefix matching on command line flags, e.g. using `--di` for `--disable-addons` is no longer supported. (#11644, @CyrilleB79)
* Microsoft Speech API version 5 and Microsoft Speech Platform voices now use WASAPI for audio output, which may improve the responsiveness of those voices. (#13284, @gexgd0419)

### Bug Fixes

Expand Down