Files
aiData/Util/ASRClient.py
HuangHai 55e88777d9 'commit'
2026-01-20 21:43:54 +08:00

201 lines
7.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import asyncio
import logging
import os
import shutil
import subprocess
import tempfile
import uuid
from http import HTTPStatus
from typing import Optional

import dashscope
from dashscope.audio.asr import Recognition

from Config import Config
from Config.Config import OBS_TMP_PREFIX, OBS_BUCKET
from Util.ObsUtil import ObsUploader
# Initialize the module-level logger (named after this module).
logger = logging.getLogger(__name__)
# Set the DashScope API key from the project configuration at import time.
# If the environment variable is not configured, replace the line below with
# dashscope.api_key = "sk-xxx", using your Bailian API key.
dashscope.api_key = Config.ALY_LLM_API_KEY
class ASRClient:
    """Client for Alibaba Cloud (DashScope) speech recognition.

    Transcribes local audio files through the DashScope ``Recognition`` API.
    Files larger than ``LARGE_FILE_BYTES`` are first split into short MP3
    segments with ffmpeg and then transcribed segment by segment.
    """

    # Parameters passed to every DashScope Recognition request.
    MODEL = "paraformer-realtime-v1"
    AUDIO_FORMAT = "mp3"
    SAMPLE_RATE = 16000
    # Files strictly larger than this many bytes are split before transcription.
    LARGE_FILE_BYTES = 5 * 1024 * 1024
    # Length of each ffmpeg segment, in seconds.
    SEGMENT_SECONDS = 60

    def __init__(self, api_key: Optional[str] = None) -> None:
        """Initialize the ASR client.

        Args:
            api_key: DashScope API key. Falls back to
                ``Config.ALY_LLM_API_KEY`` when omitted or empty.

        Note:
            The key is also written to the process-wide ``dashscope.api_key``.
        """
        logger.info("开始初始化ASR客户端")
        try:
            self.api_key = api_key or Config.ALY_LLM_API_KEY
            dashscope.api_key = self.api_key
            logger.info("ASR客户端初始化完成")
        except Exception as e:
            logger.error(f"初始化ASR客户端失败: {str(e)}", exc_info=True)
            raise

    @staticmethod
    def _join_sentences(output) -> str:
        """Concatenate the ``text`` of every sentence in a recognition output.

        Args:
            output: The ``output`` mapping of a recognition result, or ``None``.

        Returns:
            The joined text; an empty string when there is no sentence data.
        """
        if not output or "sentence" not in output:
            return ""
        sentences = output["sentence"] or []
        # The SDK documents the sentence payload as either one dict or a list
        # of dicts; normalize so a single sentence is not iterated key by key.
        if isinstance(sentences, dict):
            sentences = [sentences]
        return "".join(s.get("text") or "" for s in sentences)

    @classmethod
    def _build_split_command(cls, file_path: str, chunk_dir: str) -> list:
        """Build the ffmpeg command that splits ``file_path`` into MP3 chunks.

        Every chunk is re-encoded to mono MP3 at ``SAMPLE_RATE`` so that all
        segments match the format sent to the recognition API.
        """
        return [
            "ffmpeg", "-y", "-i", file_path,
            "-f", "segment", "-segment_time", str(cls.SEGMENT_SECONDS),
            "-acodec", "libmp3lame", "-ar", str(cls.SAMPLE_RATE),
            "-ac", "1", "-q:a", "2",
            # Six digits keep lexicographic order equal to chronological
            # order even for inputs that produce more than 999 chunks.
            os.path.join(chunk_dir, "out%06d.mp3"),
        ]

    def _transcribe_segment(self, file_path: str) -> Optional[str]:
        """Transcribe one short audio file with a single Recognition call.

        Args:
            file_path: Path of the audio file to send.

        Returns:
            The recognized text (possibly empty), or ``None`` if the request
            failed or raised.
        """
        try:
            # NOTE(review): the format is fixed to MP3 even though small files
            # are sent here without re-encoding - confirm callers only pass MP3.
            recognition = Recognition(
                model=self.MODEL,
                format=self.AUDIO_FORMAT,
                sample_rate=self.SAMPLE_RATE,
                callback=None,
            )
            result = recognition.call(file_path)
            if result.status_code != HTTPStatus.OK:
                logger.error(f"Segment transcription failed: {result.code} - {result.message}")
                return None
            return self._join_sentences(result.output)
        except Exception as e:
            logger.error(f"Segment transcription error: {str(e)}", exc_info=True)
            return None

    def _transcribe_large_file(self, file_path: str) -> Optional[str]:
        """Split a large file with ffmpeg and transcribe it chunk by chunk.

        Returns:
            The concatenated text of all chunks that were transcribed, or
            ``None`` if splitting failed or no chunk could be transcribed.
        """
        chunk_dir = None
        try:
            # A unique directory per call: concurrent transcriptions of files
            # in the same folder must not share (or delete) each other's chunks.
            parent_dir = os.path.dirname(os.path.abspath(file_path))
            chunk_dir = tempfile.mkdtemp(prefix="temp_chunks_", dir=parent_dir)

            subprocess.run(
                self._build_split_command(file_path, chunk_dir),
                check=True,
                # Keep ffmpeg from reading the parent's stdin.
                stdin=subprocess.DEVNULL,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.PIPE,
            )

            chunks = sorted(
                os.path.join(chunk_dir, name)
                for name in os.listdir(chunk_dir)
                if name.endswith(".mp3")
            )
            logger.info(f"Created {len(chunks)} chunks.")

            texts = []
            for index, chunk in enumerate(chunks, start=1):
                logger.info(f"Processing chunk {index}/{len(chunks)}")
                text = self._transcribe_segment(chunk)
                # Only None signals failure; an empty string is a silent chunk.
                if text is None:
                    logger.warning(f"Chunk {index} failed to transcribe")
                else:
                    texts.append(text)

            if not texts:
                logger.error("Large file transcription failed: no chunk could be transcribed")
                return None

            logger.info("Large file transcription completed")
            return "".join(texts)
        except subprocess.CalledProcessError as e:
            # errors="replace": ffmpeg output is not guaranteed to be UTF-8, and
            # a decode error here would escape this handler.
            detail = e.stderr.decode(errors="replace") if e.stderr else str(e)
            logger.error(f"FFmpeg splitting failed: {detail}")
            return None
        except Exception as e:
            logger.error(f"Error during large file processing: {str(e)}", exc_info=True)
            return None
        finally:
            if chunk_dir is not None:
                shutil.rmtree(chunk_dir, ignore_errors=True)

    def transcribe_file_sync(self, file_path: str) -> Optional[str]:
        """Transcribe a local audio file (synchronous).

        Files larger than ``LARGE_FILE_BYTES`` are split into chunks first.

        Args:
            file_path: Path of the local audio file.

        Returns:
            The transcribed text, or ``None`` on failure.
        """
        logger.info(f"开始转写文件(Sync): {file_path}")
        if not os.path.exists(file_path):
            logger.error(f"File not found: {file_path}")
            return None

        file_size = os.path.getsize(file_path)
        if file_size <= self.LARGE_FILE_BYTES:
            return self._transcribe_segment(file_path)

        logger.info(f"File is large ({file_size} bytes), splitting into chunks...")
        return self._transcribe_large_file(file_path)

    async def transcribe_file(self, file_path: str) -> Optional[str]:
        """Transcribe a local audio file without blocking the event loop.

        Runs ``transcribe_file_sync`` in the running loop's default executor.

        Args:
            file_path: Path of the local audio file.

        Returns:
            The transcribed text, or ``None`` on failure.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.transcribe_file_sync, file_path)

    def _upload_to_obs(self, file_path: str) -> bool:
        """Upload ``file_path`` to the OBS temporary prefix (best effort).

        Returns:
            ``True`` if the upload was reported successful, ``False`` otherwise.
            Never raises; failures are logged.
        """
        ext = os.path.splitext(file_path)[1] or ".mp3"
        obs_key = f"{OBS_TMP_PREFIX}/{uuid.uuid4()}{ext}"
        logger.info(f"Uploading {file_path} to OBS: {obs_key}")
        try:
            success, res = ObsUploader().upload_file(obs_key, file_path, OBS_BUCKET)
        except Exception as e:
            logger.error(f"Failed to upload file to OBS: {e}", exc_info=True)
            return False
        if not success:
            logger.error(f"Failed to upload file to OBS: {res}")
            return False
        logger.info(f"Upload successful: {obs_key}")
        return True

    def upload_and_transcribe_sync(self, file_path: str) -> Optional[str]:
        """Upload a file to the OBS temporary prefix, then transcribe it.

        The upload is best effort: a failed upload is logged and transcription
        still proceeds from the local file.

        Args:
            file_path: Path of the local audio file.

        Returns:
            The transcribed text, or ``None`` on failure.
        """
        try:
            if not os.path.exists(file_path):
                # Nothing to upload or transcribe; skip the upload attempt.
                logger.error(f"File not found: {file_path}")
                return None
            self._upload_to_obs(file_path)
            # Transcribe the local copy so the chunking logic applies.
            return self.transcribe_file_sync(file_path)
        except Exception as e:
            logger.error(f"Error in upload_and_transcribe: {e}", exc_info=True)
            return None