This commit is contained in:
HuangHai
2026-01-31 16:31:22 +08:00
parent b20d15c03e
commit 0597d03a6a
10 changed files with 1360 additions and 8 deletions

View File

@@ -27,19 +27,29 @@ DEFAULT_AUDIO_MIME_TYPE = "audio/mpeg"
class MyCallback(QwenTtsRealtimeCallback):
"""
自定义 TTS 流式回调
自定义 TTS 流式回调,增加了播放缓冲区以解决语音断续问题
"""
def __init__(self):
def __init__(self, buffer_seconds=0.5):
self.complete_event = threading.Event()
self.playback_start_event = threading.Event() # 新增:播放开始事件
self._player = pyaudio.PyAudio()
self._stream = self._player.open(
format=pyaudio.paInt16, channels=1, rate=24000, output=True
format=pyaudio.paInt16, channels=1, rate=24000, output=True,
frames_per_buffer=1024
)
self.audio_queue = [] # 缓冲区
self.buffer_threshold = int(24000 * 2 * buffer_seconds) # 字节数 (16bit=2bytes)
self.is_playing = False
self.playback_thread = None
self.all_data_received = False
def on_open(self) -> None:
print('[TTS] 连接已建立')
def on_close(self, close_status_code, close_msg) -> None:
self.all_data_received = True
if self.playback_thread:
self.playback_thread.join()
if self._stream:
self._stream.stop_stream()
self._stream.close()
@@ -47,6 +57,22 @@ class MyCallback(QwenTtsRealtimeCallback):
self._player.terminate()
print(f'[TTS] 连接关闭 code={close_status_code}, msg={close_msg}')
def _playback_worker(self):
"""播放线程,从缓冲区读取并播放"""
print("[TTS] 缓冲区达到阈值,开始播放...")
self.playback_start_event.set() # 触发播放开始事件
chunk_count = 0
while not self.all_data_received or self.audio_queue:
if self.audio_queue:
data = self.audio_queue.pop(0)
self._stream.write(data)
chunk_count += 1
else:
if self.all_data_received:
break
time.sleep(0.01) # 等待数据
print(f"[TTS] 播放线程结束,共播放 {chunk_count} 个数据块")
def on_event(self, response: dict) -> None:
try:
event_type = response.get('type', '')
@@ -54,17 +80,32 @@ class MyCallback(QwenTtsRealtimeCallback):
print(f'[TTS] 会话开始: {response["session"]["id"]}')
elif event_type == 'response.audio.delta':
audio_data = base64.b64decode(response['delta'])
self._stream.write(audio_data)
self.audio_queue.append(audio_data)
# 计算当前已缓冲的总字节数
current_buffer_size = sum(len(d) for d in self.audio_queue)
# 如果还没开始播放且达到阈值,启动播放线程
if not self.is_playing and current_buffer_size >= self.buffer_threshold:
self.is_playing = True
self.playback_thread = threading.Thread(target=self._playback_worker)
self.playback_thread.start()
elif event_type == 'response.done':
print(f'[TTS] 响应完成')
total_size = sum(len(d) for d in self.audio_queue)
print(f'[TTS] 响应完成,总接收音频大小: {total_size} 字节')
elif event_type == 'session.finished':
print('[TTS] 会话结束')
self.all_data_received = True
self.complete_event.set()
except Exception as e:
print(f'[Error] 处理回调事件异常: {e}')
def wait_for_finished(self):
self.complete_event.wait()
# 确保播放线程也执行完毕
if self.playback_thread and self.playback_thread.is_alive():
self.playback_thread.join()
class QwenTTSManager:
"""
@@ -109,11 +150,11 @@ class QwenTTSManager:
return resp.json()["output"]["voice"]
def start_synthesis(self, voice_id, text_list):
def start_synthesis(self, voice_id, text_list, wait_finished=True, buffer_seconds=0.5):
"""
开始实时语音合成并播放
"""
self.callback = MyCallback()
self.callback = MyCallback(buffer_seconds=buffer_seconds)
self.qwen_tts_realtime = QwenTtsRealtime(
model=self.model,
callback=self.callback,
@@ -133,4 +174,13 @@ class QwenTTSManager:
time.sleep(0.1)
self.qwen_tts_realtime.finish()
self.callback.wait_for_finished()
if wait_finished:
self.callback.wait_for_finished()
return self.callback
def wait_for_playback_start(self):
"""等待播放实际开始"""
if self.callback:
self.callback.playback_start_event.wait()