-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdictate.py
145 lines (131 loc) · 4.41 KB
/
dictate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import whisper
import sounddevice as sd
import numpy as np
import threading
import queue
import os
import time
import argparse
import sys
import Quartz
import AppKit # PyObjC library
import subprocess
# Command-line argument parsing
def parse_arguments():
    """Return parsed command-line options for the dictation tool.

    Currently the only option is --model, selecting the Whisper model size
    (default: 'base').
    """
    model_sizes = ['tiny', 'base', 'small', 'medium', 'large']
    parser = argparse.ArgumentParser(description="Whisper Dictation Tool")
    parser.add_argument('--model', type=str, default='base',
                        choices=model_sizes,
                        help='Choose the Whisper model size.')
    return parser.parse_args()
# Global variables shared between the audio callback, the hotkey handler,
# and the transcription worker thread.
audio_queue = queue.Queue()   # raw microphone chunks pushed by audio_callback
result_queue = queue.Queue()  # transcribed text awaiting paste into the active app
recording = False             # True while audio_callback should capture audio
transcribing = False          # True while a transcribe_audio thread is running
# Function to toggle recording
def toggle_recording():
    """Hotkey handler: start capture, stop-and-transcribe, or report busy.

    Called from the Quartz event tap when F1 is pressed. Transcription runs
    on a worker thread so the event tap stays responsive.
    """
    global recording, transcribing
    if recording:
        # Second press: stop capturing and hand the buffered audio off.
        print("Recording stopped.")
        recording = False
        show_notification("Dictation", "Recording stopped")
        threading.Thread(target=transcribe_audio).start()
    elif transcribing:
        # A previous transcription is still running; ignore the press.
        print("Transcription in progress, please wait...")
    else:
        # Idle: begin capturing microphone audio.
        print("Recording started...")
        recording = True
        show_notification("Dictation", "Recording started")
# Function to show macOS notification
def show_notification(title, message):
    """Display a macOS banner notification via osascript.

    The original implementation interpolated *title* and *message* into a
    shell string passed to os.system(); any double quote or backslash in the
    transcribed text broke the command (and arbitrary text reaching a shell
    is an injection risk). Passing the AppleScript source as a subprocess
    argument list avoids the shell entirely, and the values are escaped for
    AppleScript's double-quoted string syntax.
    """
    def _applescript_escape(s):
        # AppleScript string literals escape backslash and double quote.
        return s.replace('\\', '\\\\').replace('"', '\\"')

    script = (f'display notification "{_applescript_escape(message)}" '
              f'with title "{_applescript_escape(title)}"')
    # Best-effort: a failed notification should never crash dictation.
    subprocess.run(['osascript', '-e', script], check=False)
# Audio callback function
def audio_callback(indata, frames, time_info, status):
    """sounddevice stream callback: buffer microphone frames while recording."""
    if not recording:
        return
    # Copy the buffer — sounddevice reuses `indata` between callbacks.
    audio_queue.put(indata.copy())
# Transcription function
def transcribe_audio():
    """Drain the audio queue, transcribe it with Whisper, and paste the text.

    Runs on a worker thread started by toggle_recording(). Sets the global
    `transcribing` flag so new recordings cannot start mid-transcription.
    """
    global transcribing
    transcribing = True

    # Collect every chunk captured since recording started.
    chunks = []
    while not audio_queue.empty():
        chunks.append(audio_queue.get())

    if not chunks:
        print("No audio data to transcribe.")
        transcribing = False
        return

    samples = np.concatenate(chunks, axis=0).flatten()
    # Peak-normalize so quiet recordings still transcribe well.
    peak = np.max(np.abs(samples))
    if peak != 0:
        samples = samples / peak

    # Transcribe with Whisper (fp16 disabled for CPU compatibility).
    print("Transcribing...")
    result_queue.put(model.transcribe(samples, fp16=False)['text'])
    transcribing = False

    # Deliver every queued result to the frontmost application.
    while not result_queue.empty():
        text = result_queue.get()
        print("Transcribed Text:", text)
        send_text_to_active_app(text)
# Function to send text to active application
def send_text_to_active_app(text):
    """Place *text* on the clipboard, then simulate Cmd+V to paste it."""
    # pbcopy reads its payload from stdin, so no shell quoting is needed.
    pbcopy = subprocess.Popen(['pbcopy'], stdin=subprocess.PIPE)
    pbcopy.communicate(input=text.encode('utf-8'))
    # Ask System Events to press Command+V in the frontmost application.
    os.system('osascript -e \'tell application "System Events" to keystroke "v" using {command down}\'')
# Event tap callback function
def tap_callback(proxy, type_, event, refcon):
    """Quartz event-tap callback: intercept F1 key-downs to toggle dictation.

    Returning the event passes it through unchanged; returning None swallows
    it so the system does not beep on the hotkey.
    """
    keycode = Quartz.CGEventGetIntegerValueField(event, Quartz.kCGKeyboardEventKeycode)
    if keycode != 122:  # macOS virtual keycode 122 == F1
        return event
    toggle_recording()
    return None  # suppress F1 to prevent the system beep
# Run the event tap in a separate thread
def run_event_tap():
    """Install a session-level Quartz keyboard tap and run its CFRunLoop.

    Blocks forever in CFRunLoopRun(), so this must run on its own (daemon)
    thread. Requires macOS accessibility permissions; exits the process if
    the tap cannot be created.
    """
    # Listen for key-down events only.
    event_mask = Quartz.CGEventMaskBit(Quartz.kCGEventKeyDown)
    tap = Quartz.CGEventTapCreate(
        Quartz.kCGSessionEventTap,        # tap events for the current login session
        Quartz.kCGHeadInsertEventTap,     # insert ahead of other taps
        Quartz.kCGEventTapOptionDefault,  # active tap: callback may suppress events
        event_mask,
        tap_callback,
        None
    )
    if not tap:
        # Typically means the app lacks Accessibility permission in System Settings.
        print("Failed to create event tap. Please ensure the script has accessibility permissions.")
        sys.exit(1)
    # Wire the tap's mach port into this thread's run loop, enable it, and block.
    run_loop_source = Quartz.CFMachPortCreateRunLoopSource(None, tap, 0)
    Quartz.CFRunLoopAddSource(Quartz.CFRunLoopGetCurrent(), run_loop_source, Quartz.kCFRunLoopCommonModes)
    Quartz.CGEventTapEnable(tap, True)
    Quartz.CFRunLoopRun()  # blocks; events are delivered to tap_callback
# Main execution
if __name__ == "__main__":
    cli_args = parse_arguments()

    # Load the requested Whisper model. `model` must stay module-global
    # because transcribe_audio() reads it.
    print(f"Loading Whisper model '{cli_args.model}'...")
    model = whisper.load_model(cli_args.model)

    # Open a mono 16 kHz microphone stream feeding audio_callback.
    mic_stream = sd.InputStream(callback=audio_callback, channels=1, samplerate=16000)
    mic_stream.start()

    # The Quartz event tap blocks in its own run loop; run it on a daemon thread.
    threading.Thread(target=run_event_tap, daemon=True).start()

    # Park the main thread; all real work happens in callbacks and workers.
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print("Exiting...")
        mic_stream.stop()
        mic_stream.close()
        sys.exit(0)