'commit'
This commit is contained in:
@@ -3,7 +3,13 @@ from http import HTTPStatus
|
||||
from dashscope.audio.asr import Recognition
|
||||
import dashscope
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import uuid
|
||||
from Config import Config
|
||||
from Config.Config import OBS_TMP_PREFIX, OBS_BUCKET
|
||||
from Util.ObsUtil import ObsUploader
|
||||
|
||||
# 初始化日志记录器
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -34,18 +40,10 @@ class ASRClient:
|
||||
logger.error(f"初始化ASR客户端失败: {str(e)}", exc_info=True)
|
||||
raise
|
||||
|
||||
def transcribe_file_sync(self, file_path):
|
||||
def _transcribe_segment(self, file_path):
|
||||
"""
|
||||
转写本地音频文件 (同步版本)
|
||||
|
||||
Args:
|
||||
file_path: 本地音频文件路径
|
||||
|
||||
Returns:
|
||||
str: 转写后的文本,如果失败返回None
|
||||
Internal method to transcribe a short audio segment
|
||||
"""
|
||||
logger.info(f"开始转写文件(Sync): {file_path}")
|
||||
|
||||
try:
|
||||
recognition = Recognition(
|
||||
model='paraformer-realtime-v1',
|
||||
@@ -62,16 +60,94 @@ class ASRClient:
|
||||
for s in result.output['sentence']:
|
||||
sentences.append(s['text'])
|
||||
text = "".join(sentences)
|
||||
logger.info("转写成功")
|
||||
return text
|
||||
else:
|
||||
logger.error(f"转写失败: {result.code} - {result.message}")
|
||||
logger.error(f"Segment transcription failed: {result.code} - {result.message}")
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"转写过程出错: {str(e)}", exc_info=True)
|
||||
logger.error(f"Segment transcription error: {str(e)}", exc_info=True)
|
||||
return None
|
||||
|
||||
def transcribe_file_sync(self, file_path):
    """
    Transcribe a local audio file synchronously, splitting large files.

    Files of at most 5 MiB are handed straight to ``_transcribe_segment``.
    Larger files are re-encoded with ffmpeg into 60-second, mono, 16 kHz MP3
    chunks inside a private temporary directory created next to the input
    file. Each chunk is transcribed in order and the texts are concatenated.

    Args:
        file_path: Path of the local audio file.

    Returns:
        str: The transcribed text, or None if the file does not exist,
        splitting fails, or no chunk could be transcribed.
    """
    # Imported locally so this method does not rely on a file-level import.
    import tempfile

    logger.info(f"开始转写文件(Sync): {file_path}")

    if not os.path.exists(file_path):
        logger.error(f"File not found: {file_path}")
        return None

    # Small files go to the recognizer directly; only files above 5 MiB are split.
    file_size = os.path.getsize(file_path)
    if file_size <= 5 * 1024 * 1024:
        return self._transcribe_segment(file_path)

    logger.info(f"File is large ({file_size} bytes), splitting into chunks...")

    chunk_dir = None
    try:
        # A unique directory per call: a fixed, shared directory name would let
        # concurrent transcriptions delete or overwrite each other's chunks.
        chunk_dir = tempfile.mkdtemp(
            prefix="temp_chunks_",
            dir=os.path.dirname(os.path.abspath(file_path)),
        )

        # Split into 60s segments, re-encoding to a consistent MP3 format.
        # The zero-padded index keeps lexicographic order equal to numeric
        # order even for recordings with more than 999 chunks.
        cmd = [
            "ffmpeg", "-y", "-i", file_path,
            "-f", "segment", "-segment_time", "60",
            "-acodec", "libmp3lame", "-ar", "16000", "-ac", "1", "-q:a", "2",
            os.path.join(chunk_dir, "out%05d.mp3"),
        ]

        # Discard stdout; keep stderr so it can be logged if ffmpeg fails.
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)

        chunks = sorted(
            os.path.join(chunk_dir, name)
            for name in os.listdir(chunk_dir)
            if name.endswith(".mp3")
        )
        logger.info(f"Created {len(chunks)} chunks.")

        full_text = []
        succeeded = 0
        for i, chunk in enumerate(chunks):
            logger.info(f"Processing chunk {i+1}/{len(chunks)}")
            text = self._transcribe_segment(chunk)
            if text is None:
                # None is the failure signal of _transcribe_segment.
                logger.warning(f"Chunk {i+1} failed to transcribe")
                continue
            succeeded += 1
            full_text.append(text)

        if succeeded == 0:
            # Honour the documented contract: total failure is None, not "".
            logger.error("Large file transcription failed: no chunk could be transcribed")
            return None

        final_text = "".join(full_text)
        logger.info("Large file transcription completed")
        return final_text

    except subprocess.CalledProcessError as e:
        # errors="replace" so undecodable ffmpeg output cannot raise from
        # inside this handler and escape the function.
        detail = e.stderr.decode(errors="replace") if e.stderr else str(e)
        logger.error(f"FFmpeg splitting failed: {detail}")
        return None
    except Exception as e:
        logger.error(f"Error during large file processing: {str(e)}", exc_info=True)
        return None
    finally:
        # Always remove the chunk directory, whether or not transcription succeeded.
        if chunk_dir is not None:
            shutil.rmtree(chunk_dir, ignore_errors=True)
|
||||
|
||||
async def transcribe_file(self, file_path):
|
||||
"""
|
||||
转写本地音频文件
|
||||
@@ -84,3 +160,41 @@ class ASRClient:
|
||||
"""
|
||||
loop = asyncio.get_running_loop()
|
||||
return await loop.run_in_executor(None, self.transcribe_file_sync, file_path)
|
||||
|
||||
def upload_and_transcribe_sync(self, file_path):
    """
    Upload a local audio file to the OBS temporary prefix, then transcribe it.

    The upload is best-effort: when it fails the error is logged and the
    local file is still transcribed. Transcription always reads the local
    file, not the uploaded object.

    Args:
        file_path: Path of the local audio file.

    Returns:
        Whatever ``transcribe_file_sync`` returns for the local file, or
        None if any step raises an exception.
    """
    try:
        # Step 1: push the file to OBS under a random object key.
        uploader = ObsUploader()
        # Fall back to ".mp3" when the path carries no extension.
        suffix = os.path.splitext(file_path)[1] or ".mp3"
        object_key = f"{OBS_TMP_PREFIX}/{uuid.uuid4()}{suffix}"
        logger.info(f"Uploading {file_path} to OBS: {object_key}")

        uploaded, upload_result = uploader.upload_file(object_key, file_path, OBS_BUCKET)
        if uploaded:
            logger.info(f"Upload successful: {object_key}")
        else:
            # Deliberately not fatal: transcription below only needs the local file.
            logger.error(f"Failed to upload file to OBS: {upload_result}")

        # Step 2: transcribe the local copy (which handles chunking of large files).
        return self.transcribe_file_sync(file_path)

    except Exception as e:
        logger.error(f"Error in upload_and_transcribe: {e}", exc_info=True)
        return None
|
||||
|
||||
163
Util/DouYinDownloader.py
Normal file
163
Util/DouYinDownloader.py
Normal file
@@ -0,0 +1,163 @@
|
||||
import os
|
||||
import re
|
||||
import logging
|
||||
import yt_dlp
|
||||
import uuid
|
||||
import requests
|
||||
from typing import Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DouYinDownloader:
    """
    Download Douyin videos from share links.

    yt-dlp is tried first; if it raises, a fallback fetches the share page
    with ``requests`` and extracts a video URL from the page content.
    """

    def __init__(self):
        # Headers sent by both yt-dlp and the requests-based fallback.
        self.mobile_headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 10; SM-G960F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.181 Mobile Safari/537.36",
            "Referer": "https://www.douyin.com/"
        }
        # Base yt-dlp options; download_video overrides 'outtmpl' on a copy.
        self.ydl_opts = {
            'format': 'best',  # Download best quality
            'outtmpl': '%(id)s.%(ext)s',
            'quiet': True,
            'no_warnings': True,
            'http_headers': self.mobile_headers,
            # 'proxy': '...',  # Add proxy if needed
        }

    def parse_share_text(self, text: str) -> Optional[str]:
        """Return the first URL found in the share text, or None if there is none."""
        urls = self.extract_urls(text)
        return urls[0] if urls else None

    def extract_urls(self, text: str) -> list[str]:
        """Return every http/https URL found in the text, in order of appearance."""
        # NOTE: inside the character class, "$-_" is a range (0x24-0x5F), so
        # punctuation such as ',', ';' or ')' directly after a URL is captured too.
        return re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)

    def extract_title_from_text(self, text: str) -> str:
        """
        Derive a title from share text by stripping URLs and boilerplate.

        Returns:
            The cleaned text, or "Unknown Title" if nothing is left.
        """
        # 1. Remove URLs
        clean_text = re.sub(r'http[s]?://\S+', '', text)

        # 2. Remove the share boilerplate prefix, i.e. everything up to and
        #    including "复制打开抖音...,看看".
        #    Example: "3.00 12/28 d@n.dN VYZ:/ 复制打开抖音极速版,看看【聚合能研的作品】..."
        clean_text = re.sub(r'.*?复制打开抖音.*?,看看', '', clean_text)

        # 3. Remove a leading 【...】 group (usually the author name)
        clean_text = re.sub(r'^\s*【.*?】', '', clean_text)

        # 4. Clean up whitespace
        clean_text = clean_text.strip()

        return clean_text if clean_text else "Unknown Title"

    def get_video_info(self, url: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Look up video metadata with yt-dlp without downloading.

        Returns:
            (title, webpage_url), or (None, None) if yt-dlp raises.
        """
        try:
            with yt_dlp.YoutubeDL(self.ydl_opts) as ydl:
                info = ydl.extract_info(url, download=False)
                return info.get('title'), info.get('webpage_url')
        except Exception as e:
            logger.error(f"Error getting video info: {e}")
            return None, None

    @staticmethod
    def _extract_playwm_url(content: str) -> Optional[str]:
        """Return the first "playwm" URL from the page's url_list arrays, or None."""
        for raw_list in re.findall(r'"url_list":\["(.*?)"\]', content):
            # One array may hold several quoted URLs; the capture then contains
            # them joined by '","', so split before inspecting each entry.
            for candidate in raw_list.split('","'):
                if "playwm" in candidate:
                    # The page escapes "/" as the literal text \u002F.
                    return candidate.replace("\\u002F", "/")
        return None

    def download_video_fallback(self, url: str, output_dir: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Fallback download using requests and the mobile headers.

        Fetches the share page (following redirects), extracts a video URL from
        the page content and streams it to ``<output_dir>/<uuid>.mp4``.

        Returns:
            (local_file_path, title), or (None, None) on any failure. The title
            comes from the page's <title> tag, defaulting to "Unknown Title".
        """
        filename = None
        try:
            logger.info(f"Attempting fallback download for {url}")

            # This method is public, so do not rely on the caller having
            # created the output directory.
            os.makedirs(output_dir, exist_ok=True)

            # Context managers release the pooled connections even on error.
            with requests.Session() as session:
                # 1. Fetch the share page (follow redirects)
                response = session.get(url, headers=self.mobile_headers, allow_redirects=True, timeout=10)
                content = response.text

                # 2. Extract video URL
                video_url = self._extract_playwm_url(content)
                if not video_url:
                    logger.error("Fallback: No video URL found in page content")
                    return None, None

                # 3. Download video
                filename = os.path.join(output_dir, f'{uuid.uuid4()}.mp4')
                logger.info(f"Fallback downloading video from {video_url}")

                with session.get(video_url, headers=self.mobile_headers, stream=True, timeout=30) as r:
                    if r.status_code != 200:
                        logger.error(f"Fallback download failed with status {r.status_code}")
                        return None, None

                    with open(filename, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=1024 * 1024):
                            if chunk:
                                f.write(chunk)

            # Try to extract title
            title = "Unknown Title"
            title_match = re.search(r'<title>(.*?)</title>', content)
            if title_match:
                title = title_match.group(1).replace(" - 抖音", "")

            return filename, title

        except Exception as e:
            logger.error(f"Fallback download error: {e}")
            # Do not leave a truncated file behind after a failed download.
            if filename and os.path.exists(filename):
                try:
                    os.remove(filename)
                except OSError:
                    pass
            return None, None

    def download_video(self, url: str, output_dir: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Download a video into output_dir under a UUID-based filename.

        yt-dlp is tried first; if it raises, or produces no file, the
        requests-based fallback is used instead.

        Returns:
            (local_file_path, video_title), or (None, None) on failure.
        """
        try:
            os.makedirs(output_dir, exist_ok=True)

            file_uuid = str(uuid.uuid4())
            # Work on a copy so the shared base options are not modified.
            opts = self.ydl_opts.copy()
            opts['outtmpl'] = os.path.join(output_dir, f'{file_uuid}.%(ext)s')

            try:
                with yt_dlp.YoutubeDL(opts) as ydl:
                    info = ydl.extract_info(url, download=True)
                    filename = ydl.prepare_filename(info)

                # The prepared name may not match the file on disk (e.g. a
                # different final extension), so look it up by UUID prefix.
                if not os.path.exists(filename):
                    filename = next(
                        (
                            os.path.join(output_dir, name)
                            for name in os.listdir(output_dir)
                            if name.startswith(file_uuid)
                        ),
                        None,
                    )
                if filename is None:
                    # Never hand the caller a path that does not exist;
                    # raising here routes to the fallback below.
                    raise FileNotFoundError(f"yt-dlp produced no file for {file_uuid}")

                return filename, info.get('title')
            except Exception as e:
                logger.warning(f"yt-dlp failed, trying fallback: {e}")
                return self.download_video_fallback(url, output_dir)

        except Exception as e:
            logger.error(f"Error downloading video: {e}")
            return None, None
|
||||
Binary file not shown.
BIN
Util/__pycache__/DouYinDownloader.cpython-310.pyc
Normal file
BIN
Util/__pycache__/DouYinDownloader.cpython-310.pyc
Normal file
Binary file not shown.
Reference in New Issue
Block a user