-
Notifications
You must be signed in to change notification settings - Fork 0
/
jobbie_gpt_CLIENT_05-10-2024.py
264 lines (211 loc) · 9.32 KB
/
jobbie_gpt_CLIENT_05-10-2024.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
import pyaudio
import wave
import webrtcvad
import collections
import datetime
import audioop
import threading
import subprocess
import os
from colorama import Fore, Style, init
import json
import curses
import socket
import sys
# Lock acquired by transcribe_and_log() so only one whisper-ctranslate2 run happens at a time.
transcription_lock = threading.Lock()
# Event intended as a "pause recording" flag; it is not referenced elsewhere in the visible code.
is_paused = threading.Event()
is_paused.clear()  # Start un-paused (flag cleared); is_paused.set() would raise the flag.
# Initialize colorama; autoreset=True resets the colour/style after each print.
init(autoreset=True)
# Build the voice-activity detector
def create_vad(aggressiveness=1):
    """Return a new ``webrtcvad.Vad`` constructed with *aggressiveness*.

    The argument is forwarded unchanged to ``webrtcvad.Vad``.
    """
    return webrtcvad.Vad(aggressiveness)
# Loudness of one audio chunk
def rms_level(data):
    """Return the root-mean-square amplitude of *data*.

    *data* is interpreted as 2-byte samples, which matches audio captured
    with ``format=pyaudio.paInt16``.
    """
    bytes_per_sample = 2  # paInt16 -> two bytes per sample
    return audioop.rms(data, bytes_per_sample)
def log_transcription(json_data, filename):
    """Write *json_data* to *filename* as JSON, replacing any existing content."""
    with open(filename, mode='w') as log_handle:
        json.dump(json_data, fp=log_handle)
def display_transcription_as_html(json_data):
    """Render transcription entries as a minimal HTML document.

    Args:
        json_data: Iterable of mappings, each providing ``'timestamp'`` and
            ``'text'`` keys.

    Returns:
        A string of the form ``<html><body>...</body></html>`` containing one
        ``<p>timestamp - text</p>`` element per entry, in input order.
        ``&``, ``<`` and ``>`` inside the values are escaped so transcribed
        text is always shown literally and can never be parsed as markup.

    Raises:
        KeyError: If an entry lacks ``'timestamp'`` or ``'text'``.
    """
    import html  # local import: the file's import header does not provide it

    paragraphs = [
        "<p>{} - {}</p>".format(
            # quote=False: values only ever land in text nodes, so quotes
            # (common in speech, e.g. "don't") can stay readable.
            html.escape(str(entry['timestamp']), quote=False),
            html.escape(str(entry['text']), quote=False),
        )
        for entry in json_data
    ]
    # Join once instead of growing the string with += for every entry.
    return '<html><body>' + ''.join(paragraphs) + '</body></html>'
def transcribe_and_log(filename, start_timestamp=None):
    """Transcribe an audio file with whisper-ctranslate2 and log the result.

    The external ``whisper-ctranslate2`` command is run on *filename* with its
    output directory set to the folder containing the audio file. If the
    command succeeds and a ``.txt`` file with the same base name exists there,
    its content is appended to ``transcriptions.log`` in that folder and
    printed to the console.

    The whole operation runs under ``transcription_lock`` so only one
    transcription (and one log append) happens at a time.

    Args:
        filename: Path of the audio file to transcribe.
        start_timestamp: Optional timestamp of the recording. Accepted so
            callers can pass it along; it is currently unused, and the log
            entry is stamped with the time the transcription finished.

    Returns:
        None. Failures (command error, missing executable, missing transcript)
        are reported on stdout and end the call early instead of raising.
    """
    # Resolve to an absolute path so --output_dir is never an empty string
    # when the caller passes a bare file name.
    audio_path = os.path.abspath(filename)
    output_dir = os.path.dirname(audio_path)

    # Argument list + no shell: paths containing spaces, quotes or shell
    # metacharacters are passed through verbatim.
    cmd = [
        "whisper-ctranslate2",
        "--vad_filter", "True",
        "--model", "medium.en",
        "--language", "en",
        "--output_dir", output_dir,
        "--device", "cpu",
        audio_path,
    ]

    with transcription_lock:
        try:
            result = subprocess.run(
                cmd,
                check=True,
                text=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
        except subprocess.CalledProcessError as e:
            # Print error if the command fails
            print("Error:", e.stderr)
            return
        except OSError as e:
            # Without a shell, a missing/unlaunchable executable raises here
            # rather than producing a non-zero exit status.
            print("Error:", e)
            return
        print("Output:", result.stdout)

        # The tool is expected to write <audio base name>.txt into output_dir.
        txt_filename = os.path.splitext(audio_path)[0] + ".txt"
        try:
            with open(txt_filename, 'r', encoding="utf-8") as file:
                transcription = file.read()
        except FileNotFoundError:
            print("Error: transcript not found:", txt_filename)
            return

        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        log_entry = f"{timestamp} - {transcription}\n"

        # Append to log file
        log_file_path = os.path.join(output_dir, "transcriptions.log")
        with open(log_file_path, "a", encoding="utf-8") as log_file:
            log_file.write(log_entry)

        print(Fore.CYAN + f"Transcription ({timestamp}): {transcription}")
# Record continuously, cutting a clip whenever the silence countdown expires
def continuous_record(device_id, format=pyaudio.paInt16, channels=1, rate=16000, chunk_duration_ms=30, max_silence_ms=5000):
    """Record from an input device forever, saving clips that contain speech.

    Audio is read in frames of *chunk_duration_ms* and every frame is buffered
    and checked with the VAD from ``create_vad()``. Once speech has been
    detected without interruption for at least 0.5 s, the buffered clip is
    marked as containing voice and the countdown is reset. When the countdown
    reaches zero the clip is either saved with ``save_audio()`` and handed to
    ``transcribe_and_log()`` on a background thread (voice was detected) or
    discarded (no voice); then a new clip starts.

    Args:
        device_id: PyAudio input device index.
        format: PyAudio sample format of the stream.
        channels: Number of input channels.
        rate: Sample rate in Hz.
        chunk_duration_ms: Length of one frame in milliseconds; it determines
            the number of frames read per iteration.
        max_silence_ms: Countdown length in milliseconds, i.e. how long the
            recorder waits without sustained speech before closing a clip.

    The loop only ends through an exception (for example KeyboardInterrupt);
    the stream and the PyAudio instance are released in that case.
    """
    chunk_size = int(rate * chunk_duration_ms / 1000)
    silence_limit_s = max_silence_ms / 1000.0  # countdown length in seconds
    min_voice_s = 0.5  # uninterrupted speech needed before a clip counts as "voice"

    vad = create_vad()
    p = pyaudio.PyAudio()
    stream = p.open(format=format, channels=channels, rate=rate, input=True, input_device_index=device_id, frames_per_buffer=chunk_size)
    print("Recording started. Speak into the microphone.")

    audio_data = collections.deque()
    countdown_timer = silence_limit_s
    voice_detected_for = 0.0  # seconds of uninterrupted speech so far
    last_time_checked = datetime.datetime.now()
    v_det = False  # True once the current clip contains sustained speech

    try:
        while True:
            frame = stream.read(chunk_size)
            is_speech = vad.is_speech(frame, rate)
            now = datetime.datetime.now()
            elapsed = (now - last_time_checked).total_seconds()
            last_time_checked = now
            audio_data.append(frame)

            # NOTE(review): this prints the literal words RED/GREEN, not a
            # colorama colour code - confirm whether colouring was intended.
            level = rms_level(frame)
            level_color = 'RED' if level < 100 else 'GREEN'
            print(f"{level_color} RMS Level: {level}", end=' ')

            if is_speech:
                voice_detected_for += elapsed
                print("Voice detected", end=' \r')
                if voice_detected_for >= min_voice_s:
                    v_det = True
                    countdown_timer = silence_limit_s
            else:
                voice_detected_for = 0.0

            # Reduce countdown timer
            countdown_timer -= elapsed

            if countdown_timer > 0:
                print(f"Timer: {countdown_timer:.2f}s remaining", end=' \r')
                continue

            if v_det:
                print(f"\nSAVED: Timer expired at: {now.strftime('%Y-%m-%d %H:%M:%S')}")
                # save_audio returns (path, timestamp); unpack it so the
                # worker thread gets both arguments instead of one tuple.
                filename, start_timestamp = save_audio(list(audio_data), p, format, channels, rate)
                threading.Thread(target=transcribe_and_log, args=(filename, start_timestamp)).start()
                v_det = False
            else:
                print(f"\nRecording discarded - no voice detected. Timer expired at: {now.strftime('%Y-%m-%d %H:%M:%S')}")

            # Start a fresh clip in both cases.
            audio_data.clear()
            countdown_timer = silence_limit_s
    finally:
        stream.stop_stream()
        stream.close()
        p.terminate()
        print("\nRecording stopped.")
# Save the audio data to a timestamped WAV file inside `directory`
def save_audio(audio_frames, pyaudio_instance, format, channels, rate,
               directory=r"Y:\__MEDIA\__Transcribing and Recording\2024\Dad Auto Transcriber"):
    """Write recorded frames to ``<directory>/<timestamp>.wav``.

    Args:
        audio_frames: Iterable of ``bytes`` chunks making up the recording.
        pyaudio_instance: Object providing ``get_sample_size(format)``; used
            to obtain the sample width in bytes.
        format: Sample format passed to ``get_sample_size``.
        channels: Number of channels written to the WAV header.
        rate: Sample rate in Hz written to the WAV header.
        directory: Folder that receives the file; created if missing.
            Defaults to the transcriber's recording folder.

    Returns:
        Tuple ``(filename, timestamp)``: the full path of the WAV file and
        the ``YYYY-MM-DD_HH-MM-SS`` timestamp used for its name.
    """
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    os.makedirs(directory, exist_ok=True)  # Create the directory if it does not exist

    # Join with the directory: previously the folder was created but the
    # file was written to the current working directory instead.
    filename = os.path.join(directory, f"{timestamp}.wav")

    # The context manager finalizes and closes the file even if a write fails.
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(pyaudio_instance.get_sample_size(format))
        wf.setframerate(rate)
        wf.writeframes(b''.join(audio_frames))

    print(f"\nAudio saved as {filename}")
    return filename, timestamp
def send_audio(file_path, host='server_address', port=12345):
    """Send an audio file path to the transcription server and print its reply.

    Only the path string is transmitted, not the file's content.

    Args:
        file_path: Path of the audio file, sent to the server as encoded text.
        host: Server host name or IP address. The default is the original
            placeholder and has to be replaced with a real address.
        port: Server TCP port.

    Returns:
        The server's reply decoded to ``str``. At most 1024 bytes are read,
        so longer replies are truncated.

    Raises:
        OSError: If the connection cannot be established or fails.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.connect((host, port))
        # Send the path of the audio file to the server
        sock.sendall(file_path.encode())
        # Wait for the server to send back the transcribed text
        data = sock.recv(1024)

    reply = data.decode()
    print("Received:", reply)
    return reply
def convert_to_flac(wav_filename):
    """Convert an audio file to FLAC with ffmpeg and return the FLAC path.

    Args:
        wav_filename: Path of the source audio file.

    Returns:
        The path of the FLAC file: *wav_filename* with its extension replaced
        by ``.flac``. ffmpeg's exit status is not checked, so the path is
        returned even if the conversion failed.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    # Swap only the final extension; str.replace would also rewrite any
    # other '.wav' occurring earlier in the path.
    flac_filename = os.path.splitext(wav_filename)[0] + '.flac'
    subprocess.run(['ffmpeg', '-i', wav_filename, '-c:a', 'flac', '-compression_level', '12', flac_filename])
    return flac_filename
# Module-level PyAudio instance, presumably meant for system-audio capture;
# pSys is not referenced elsewhere in the visible code - TODO confirm it is needed.
pSys = pyaudio.PyAudio()
# Module-level PyAudio instance that setup_and_run() opens its streams on and
# terminates when it finishes. main() creates a separate instance of its own.
p = pyaudio.PyAudio()
# Watch the system-audio stream and raise the stop flag once it gets loud
def capture_system_audio(stream, stop_event):
    """Read from *stream* until *stop_event* is set.

    Each iteration reads 1024 frames and measures their RMS level as 16-bit
    samples. The first chunk whose level exceeds 100 prints a notice and sets
    *stop_event*, which also ends this loop.
    """
    frames_per_read = 1024
    significant_level = 100  # Threshold for 'significant audio'
    while True:
        if stop_event.is_set():
            break
        chunk = stream.read(frames_per_read)
        if audioop.rms(chunk, 2) <= significant_level:
            continue
        print("Significant system audio detected")
        stop_event.set()  # Signal to pause microphone recording
# Drain the microphone stream until the stop flag is raised
def capture_microphone_audio(stream, stop_event):
    """Read 1024-frame chunks from *stream* until *stop_event* is set.

    The chunks are currently discarded; a status line is printed per read.
    """
    while True:
        if stop_event.is_set():
            return
        stream.read(1024)  # audio is read but not processed yet
        print("Recording microphone...")
        # Here you would add your processing logic
# Run system-audio and microphone capture side by side until loud system audio is heard
def setup_and_run(system_device_index=None, mic_device_index=None):
    """Capture system audio and microphone audio on two threads.

    Two input streams are opened on the module-level PyAudio instance ``p``:
    a stereo loopback stream for system audio and a mono microphone stream,
    both at 44100 Hz. ``capture_system_audio`` and ``capture_microphone_audio``
    each run on their own thread and share one stop event; the call returns
    after both threads have finished.

    Args:
        system_device_index: PyAudio device index for the system-audio
            stream; ``None`` is passed through to ``p.open`` unchanged.
        mic_device_index: PyAudio device index for the microphone stream;
            ``None`` is passed through to ``p.open`` unchanged.

    The streams are closed and ``p`` is terminated on every exit path,
    including failures while opening a stream or waiting for the threads.
    """
    stop_event = threading.Event()
    opened_streams = []
    started_threads = []
    try:
        # NOTE(review): as_loopback is presumably provided by a PyAudio build
        # with WASAPI loopback support - confirm the installed package accepts it.
        system_stream = p.open(format=pyaudio.paInt16, channels=2, rate=44100, input=True,
                               input_device_index=system_device_index,
                               frames_per_buffer=1024, as_loopback=True)
        opened_streams.append(system_stream)

        mic_stream = p.open(format=pyaudio.paInt16, channels=1, rate=44100, input=True,
                            input_device_index=mic_device_index,
                            frames_per_buffer=1024)
        opened_streams.append(mic_stream)

        workers = [
            threading.Thread(target=capture_system_audio, args=(system_stream, stop_event)),
            threading.Thread(target=capture_microphone_audio, args=(mic_stream, stop_event)),
        ]
        for worker in workers:
            worker.start()
            started_threads.append(worker)
        for worker in workers:
            worker.join()
    finally:
        # Ask any still-running worker to stop and wait for it, so no thread
        # is reading from a stream while it is being closed.
        stop_event.set()
        for worker in started_threads:
            worker.join()
        for stream in opened_streams:
            stream.stop_stream()
            stream.close()
        p.terminate()
# Earlier entry point, left commented out:
#if __name__ == "__main__":
# setup_and_run()
# Client entry point: when a command-line argument is given it is passed to
# send_audio(); otherwise a usage line is printed. setup_and_run() is also
# invoked from this section.
# NOTE(review): this guard runs before main() is defined, and a second
# `__main__` guard at the end of the file calls main() as well - confirm
# which entry point is intended.
if __name__ == "__main__":
if len(sys.argv) > 1:
audio_file_path = sys.argv[1]
send_audio(audio_file_path)
else:
print("Usage: python client.py <audio_file_path>")
setup_and_run()
# Entry point for microphone recording
def main():
    """List every input-capable audio device, then record from device 0."""
    print("Available recording devices:")
    audio = pyaudio.PyAudio()
    device_total = audio.get_device_count()
    for index in range(device_total):
        info = audio.get_device_info_by_index(index)
        if info['maxInputChannels'] <= 0:
            continue  # output-only device
        print(f"Device ID {index}: {info['name']}")
    audio.terminate()

    # TODO: let the user pick the device ID instead of always using device 0.
    continuous_record(0)
# Run main() only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()