Skip to content

Commit

Permalink
Merge branch 'master' into fix-sapi5-continuous-reading
Browse files Browse the repository at this point in the history
  • Loading branch information
gexgd0419 committed Jan 10, 2025
2 parents 65387a8 + 631156c commit 23fae4e
Show file tree
Hide file tree
Showing 3 changed files with 166 additions and 132 deletions.
1 change: 1 addition & 0 deletions source/synthDrivers/mssp.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

class SynthDriver(SynthDriver):
COM_CLASS = "speech.SPVoice"
CUSTOMSTREAM_COM_CLASS = "speech.SpCustomStream"

name = "mssp"
description = "Microsoft Speech Platform"
293 changes: 161 additions & 132 deletions source/synthDrivers/sapi5.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,17 @@
# This file is covered by the GNU General Public License.
# See the file COPYING for more details.

from typing import Optional
from ctypes import POINTER, c_ubyte, c_wchar_p, cast, windll, _Pointer
from enum import IntEnum
import locale
from collections import OrderedDict, deque
from typing import TYPE_CHECKING
from comInterfaces.SpeechLib import ISpEventSource, ISpNotifySource, ISpNotifySink
import comtypes.client
from comtypes import COMError
from comtypes import COMError, COMObject, IUnknown, hresult, ReturnHRESULT
import winreg
import audioDucking
import nvwave
from objidl import _LARGE_INTEGER, _ULARGE_INTEGER, IStream
from synthDriverHandler import SynthDriver, VoiceInfo, synthIndexReached, synthDoneSpeaking
import config
from logHandler import log
Expand All @@ -31,14 +34,6 @@
)


class SPAudioState(IntEnum):
# https://docs.microsoft.com/en-us/previous-versions/windows/desktop/ms720596(v=vs.85)
CLOSED = 0
STOP = 1
PAUSE = 2
RUN = 3


class SpeechVoiceSpeakFlags(IntEnum):
# https://docs.microsoft.com/en-us/previous-versions/windows/desktop/ms720892(v=vs.85)
Async = 1
Expand All @@ -53,56 +48,148 @@ class SpeechVoiceEvents(IntEnum):
Bookmark = 16


class SapiSink(object):
"""Handles SAPI event notifications.
See https://msdn.microsoft.com/en-us/library/ms723587(v=vs.85).aspx
# Pointer-to-byte type used to annotate the buffer argument of stream writes.
# The subscripted _Pointer form is only evaluated for static type checking;
# at runtime the concrete pointer type is built with POINTER().
if TYPE_CHECKING:
LP_c_ubyte = _Pointer[c_ubyte]
else:
LP_c_ubyte = POINTER(c_ubyte)


class SynthDriverAudioStream(COMObject):
"""
Implements IStream to receive streamed-in audio data.
Should be wrapped in an SpCustomStream
(which also provides the wave format information),
then set as the AudioOutputStream.
"""

_com_interfaces_ = [IStream]

def __init__(self, synthRef: weakref.ReferenceType):
self.synthRef = synthRef
self._writtenBytes = 0

def StartStream(self, streamNum, pos):
def ISequentialStream_RemoteWrite(self, pv: LP_c_ubyte, cb: int) -> int:
"""This is called when SAPI wants to write (output) a wave data chunk.
:param pv: A pointer to the first wave data byte.
:param cb: The number of bytes to write.
:returns: The number of bytes written.
"""
synth = self.synthRef()
if synth is None:
log.debugWarning("Called StartStream method on SapiSink while driver is dead")
log.debugWarning("Called Write method on AudioStream while driver is dead")
return 0
if not synth.isSpeaking:
return 0
synth.player.feed(pv, cb)
self._writtenBytes += cb
return cb

def IStream_RemoteSeek(self, dlibMove: _LARGE_INTEGER, dwOrigin: int) -> _ULARGE_INTEGER:
"""This is called when SAPI wants to get the current stream position.
Seeking to another position is not supported.
:param dlibMove: The displacement to be added to the location indicated by the dwOrigin parameter.
Only 0 is supported.
:param dwOrigin: The origin for the displacement specified in dlibMove.
Only 1 (STREAM_SEEK_CUR) is supported.
:returns: The current stream position, taken from the running byte count kept in ``_writtenBytes``.
:raises ReturnHRESULT: With E_NOTIMPL for any other origin or displacement.
"""
# A zero displacement from the current position does not move anything.
if dwOrigin == 1 and dlibMove.QuadPart == 0:
# SAPI is querying the current position.
return _ULARGE_INTEGER(self._writtenBytes)
# Return E_NOTIMPL without logging an error.
raise ReturnHRESULT(hresult.E_NOTIMPL, None)

def IStream_Commit(self, grfCommitFlags: int):
"""This is called when MSSP wants to flush the written data.
Does nothing.
:param grfCommitFlags: Commit flags supplied by the caller; ignored.
"""
pass


class SapiSink(COMObject):
"""
Implements ISpNotifySink to handle SAPI event notifications.
Should be passed to ISpNotifySource::SetNotifySink().
Notifications will be sent on the original thread,
instead of being routed to the main thread.
"""

_com_interfaces_ = [ISpNotifySink]

def __init__(self, synthRef: weakref.ReferenceType):
self.synthRef = synthRef

def ISpNotifySink_Notify(self):
"""This is called when there's a new event notification.
Queued events will be retrieved one at a time and dispatched to
StartStream, EndStream or Bookmark according to their event ID.
If the weak reference to the synth driver is dead, a debug warning is logged and nothing else happens."""
synth = self.synthRef()
if synth is None:
log.debugWarning("Called Notify method on SapiSink while driver is dead")
return
# Get all queued events
eventSource = synth.tts.QueryInterface(ISpEventSource)
while True:
# returned tuple: (event, numFetched)
eventTuple = eventSource.GetEvents(1) # Get one event
# Anything other than exactly one fetched event ends the loop.
if eventTuple[1] != 1:
break
event = eventTuple[0]
# Event IDs not listed here are not dispatched; their lParam is still released below.
if event.eEventId == 1: # SPEI_START_INPUT_STREAM
self.StartStream(event.ulStreamNum, event.ullAudioStreamOffset)
elif event.eEventId == 2: # SPEI_END_INPUT_STREAM
self.EndStream(event.ulStreamNum, event.ullAudioStreamOffset)
elif event.eEventId == 4: # SPEI_TTS_BOOKMARK
# lParam is read as a wide string (the bookmark text) and wParam is passed as the bookmark ID.
self.Bookmark(
event.ulStreamNum,
event.ullAudioStreamOffset,
cast(event.lParam, c_wchar_p).value,
event.wParam,
)
# free lParam
# NOTE(review): presumably the caller of GetEvents is responsible for releasing lParam - confirm against the SAPI docs.
if event.elParamType == 1 or event.elParamType == 2: # token or object
# NOTE(review): presumably dropping this comtypes pointer wrapper calls Release() on the object - confirm.
pUnk = cast(event.lParam, POINTER(IUnknown))
del pUnk
elif event.elParamType == 3 or event.elParamType == 4: # pointer or string
windll.ole32.CoTaskMemFree(event.lParam)

def StartStream(self, streamNum: int, pos: int):
synth = self.synthRef()
# The stream has been started. Move the bookmark list to _streamBookmarks.
if streamNum in synth._streamBookmarksNew:
synth._streamBookmarks[streamNum] = synth._streamBookmarksNew[streamNum]
del synth._streamBookmarksNew[streamNum]
if synth._audioDucker:
if audioDucking._isDebug():
log.debug("Enabling audio ducking due to starting speech stream")
synth._audioDucker.enable()
synth.isSpeaking = True

def Bookmark(self, streamNum, pos, bookmark, bookmarkId):
def Bookmark(self, streamNum: int, pos: int, bookmark: str, bookmarkId: int):
synth = self.synthRef()
if synth is None:
log.debugWarning("Called Bookmark method on SapiSink while driver is dead")
if not synth.isSpeaking:
return
synthIndexReached.notify(synth=synth, index=bookmarkId)
# remove already triggered bookmarks
if streamNum in synth._streamBookmarks:
bookmarks = synth._streamBookmarks[streamNum]
while bookmarks:
if bookmarks.popleft() == bookmarkId:
break
# Bookmark event is raised before the audio after that point.
# Queue an IndexReached event at this point.
synth.player.feed(None, 0, lambda: self.onIndexReached(streamNum, bookmarkId))

def EndStream(self, streamNum, pos):
def EndStream(self, streamNum: int, pos: int):
synth = self.synthRef()
if synth is None:
log.debugWarning("Called Bookmark method on EndStream while driver is dead")
return
# trigger all untriggered bookmarks
if streamNum in synth._streamBookmarks:
for bookmark in synth._streamBookmarks[streamNum]:
synthIndexReached.notify(synth=synth, index=bookmark)
del synth._streamBookmarks[streamNum]
synth.isSpeaking = False
synth.player.idle()
synthDoneSpeaking.notify(synth=synth)
if synth._audioDucker:
if audioDucking._isDebug():
log.debug("Disabling audio ducking due to speech stream end")
synth._audioDucker.disable()

def onIndexReached(self, streamNum: int, index: int):
"""Notify listeners that a bookmark index was reached, then drop it from the stream's pending bookmarks.
If the weak reference to the synth driver is dead, a debug warning is logged and nothing else happens.
:param streamNum: The number of the stream the bookmark belongs to.
:param index: The bookmark ID that was reached.
"""
synth = self.synthRef()
if synth is None:
log.debugWarning("Called onIndexReached method on SapiSink while driver is dead")
return
synthIndexReached.notify(synth=synth, index=index)
# remove already triggered bookmarks
if streamNum in synth._streamBookmarks:
bookmarks = synth._streamBookmarks[streamNum]
# Pop from the left until the reached index itself has been removed;
# this also discards any earlier bookmarks still queued ahead of it.
while bookmarks:
if bookmarks.popleft() == index:
break


class SynthDriver(SynthDriver):
Expand All @@ -125,6 +212,7 @@ class SynthDriver(SynthDriver):
supportedNotifications = {synthIndexReached, synthDoneSpeaking}

COM_CLASS = "SAPI.SPVoice"
CUSTOMSTREAM_COM_CLASS = "SAPI.SpCustomStream"

name = "sapi5"
description = "Microsoft Speech API version 5"
Expand All @@ -138,27 +226,24 @@ def check(cls):
except: # noqa: E722
return False

ttsAudioStream = (
None #: Holds the ISPAudio interface for the current voice, to aid in stopping and pausing audio
)
_audioDucker: Optional[audioDucking.AudioDucker] = None

def __init__(self, _defaultVoiceToken=None):
"""
@param _defaultVoiceToken: an optional sapi voice token which should be used as the default voice (only useful for subclasses)
@type _defaultVoiceToken: ISpeechObjectToken
"""
if audioDucking.isAudioDuckingSupported():
self._audioDucker = audioDucking.AudioDucker()
self._pitch = 50
self.player = None
self.isSpeaking = False
self._initTts(_defaultVoiceToken)
# key = stream num, value = deque of bookmarks
self._streamBookmarks = dict() # bookmarks in currently speaking streams
self._streamBookmarksNew = dict() # bookmarks for streams that haven't been started

def terminate(self):
self._eventsConnection = None
self.tts = None
if self.player:
self.player.close()
self.player = None

def _getAvailableVoices(self):
voices = OrderedDict()
Expand Down Expand Up @@ -222,27 +307,31 @@ def _initTts(self, voice=None):
# Therefore, set the voice before setting the audio output.
# Otherwise, we will get poor speech quality in some cases.
self.tts.voice = voice
# SAPI5 automatically selects the system default audio device, so there's no use doing work if the user has selected to use the system default.
# Besides, our default value is not a valid endpoint ID.
if (outputDevice := config.conf["audio"]["outputDevice"]) != config.conf.getConfigValidation(
("audio", "outputDevice"),
).default:
for audioOutput in self.tts.GetAudioOutputs():
# SAPI's audio output IDs are registry keys. It seems that the final path segment is the endpoint ID.
if audioOutput.Id.endswith(outputDevice):
self.tts.audioOutput = audioOutput
break
self._eventsConnection = comtypes.client.GetEvents(self.tts, SapiSink(weakref.ref(self)))

self.tts.AudioOutput = self.tts.AudioOutput # Reset the audio and its format parameters
fmt = self.tts.AudioOutputStream.Format
wfx = fmt.GetWaveFormatEx()
if self.player:
self.player.close()
self.player = nvwave.WavePlayer(
channels=wfx.Channels,
samplesPerSec=wfx.SamplesPerSec,
bitsPerSample=wfx.BitsPerSample,
outputDevice=config.conf["audio"]["outputDevice"],
)
audioStream = SynthDriverAudioStream(weakref.ref(self))
# Use SpCustomStream to wrap our IStream implementation and the correct wave format
customStream = comtypes.client.CreateObject(self.CUSTOMSTREAM_COM_CLASS)
customStream.BaseStream = audioStream
customStream.Format = fmt
self.tts.AudioOutputStream = customStream

# Set event notify sink
self.tts.EventInterests = (
SpeechVoiceEvents.StartInputStream | SpeechVoiceEvents.Bookmark | SpeechVoiceEvents.EndInputStream
)
from comInterfaces.SpeechLib import ISpAudio

try:
self.ttsAudioStream = self.tts.audioOutputStream.QueryInterface(ISpAudio)
except COMError:
log.debugWarning("SAPI5 voice does not support ISPAudio")
self.ttsAudioStream = None
notifySource = self.tts.QueryInterface(ISpNotifySource)
notifySource.SetNotifySink(SapiSink(weakref.ref(self)))

def _set_voice(self, value):
tokens = self._getVoiceTokens()
Expand Down Expand Up @@ -390,77 +479,17 @@ def outputTags():

text = "".join(textList)
flags = SpeechVoiceSpeakFlags.IsXML | SpeechVoiceSpeakFlags.Async
# Ducking should be complete before the synth starts producing audio.
# For this to happen, the speech method must block until ducking is complete.
# Ducking should be disabled when the synth is finished producing audio.
# Note that there may be calls to speak with a string that results in no audio,
# it is important that in this case the audio does not get stuck ducked.
# When there is no audio produced the startStream and endStream handlers are not called.
# To prevent audio getting stuck ducked, it is unducked at the end of speech.
# There are some known issues:
# - When there is no audio produced by the synth, a user may notice volume lowering (ducking) temporarily.
# - If the call to startStream handler is delayed significantly, users may notice a variation in volume
# (as ducking is disabled at the end of speak, and re-enabled when the startStream handler is called)

# A note on the synchronicity of components of this approach:
# SAPISink.StartStream event handler (callback):
# the synth speech is not blocked by this event callback.
# SAPISink.EndStream event handler (callback):
# assumed also to be async but not confirmed. Synchronicity is irrelevant to the current approach.
# AudioDucker.disable returns before the audio is completely unducked.
# AudioDucker.enable() ducking will complete before the function returns.
# It is not possible to "double duck the audio", calling twice yields the same result as calling once.
# AudioDucker class instances count the number of enables/disables,
# in order to unduck there must be no remaining enabled audio ducker instances.
# Due to this a temporary audio ducker is used around the call to speak.
# SAPISink.StartStream: Ducking here may allow the early speech to start before ducking is completed.
if audioDucking.isAudioDuckingSupported():
tempAudioDucker = audioDucking.AudioDucker()
else:
tempAudioDucker = None
if tempAudioDucker:
if audioDucking._isDebug():
log.debug("Enabling audio ducking due to speak call")
tempAudioDucker.enable()
try:
streamNum = self.tts.Speak(text, flags)
# When Speak returns, the previous stream may not have been ended.
# So the bookmark list is stored in another dict until this stream starts.
self._streamBookmarksNew[streamNum] = bookmarks
finally:
if tempAudioDucker:
if audioDucking._isDebug():
log.debug("Disabling audio ducking after speak call")
tempAudioDucker.disable()
streamNum = self.tts.Speak(text, flags)
# When Speak returns, the previous stream may not have been ended.
# So the bookmark list is stored in another dict until this stream starts.
self._streamBookmarksNew[streamNum] = bookmarks

def cancel(self):
# SAPI5's default means of stopping speech can sometimes lag at end of speech, especially with Win8 / Win 10 Microsoft Voices.
# Therefore instruct the underlying audio interface to stop first, before interrupting and purging any remaining speech.
if self.ttsAudioStream:
self.ttsAudioStream.setState(SPAudioState.STOP, 0)
# Therefore instruct the audio player to stop first, before interrupting and purging any remaining speech.
self.isSpeaking = False
self.player.stop()
self.tts.Speak(None, SpeechVoiceSpeakFlags.Async | SpeechVoiceSpeakFlags.PurgeBeforeSpeak)
if self._audioDucker:
if audioDucking._isDebug():
log.debug("Disabling audio ducking due to setting output audio state to stop")
self._audioDucker.disable()

def pause(self, switch: bool):
# SAPI5's default means of pausing in most cases is either extremely slow
# (e.g. takes more than half a second) or does not work at all.
# Therefore instruct the underlying audio interface to pause instead.
if self.ttsAudioStream:
oldState = self.ttsAudioStream.GetStatus().State
if switch and oldState == SPAudioState.RUN:
# pausing
if self._audioDucker:
if audioDucking._isDebug():
log.debug("Disabling audio ducking due to setting output audio state to pause")
self._audioDucker.disable()
self.ttsAudioStream.setState(SPAudioState.PAUSE, 0)
elif not switch and oldState == SPAudioState.PAUSE:
# unpausing
if self._audioDucker:
if audioDucking._isDebug():
log.debug("Enabling audio ducking due to setting output audio state to run")
self._audioDucker.enable()
self.ttsAudioStream.setState(SPAudioState.RUN, 0)
self.player.pause(switch)
Loading

0 comments on commit 23fae4e

Please sign in to comment.