'commit'

2026-01-20 21:43:54 +08:00
parent 66cb0faeff
commit 55e88777d9
32 changed files with 1112 additions and 60 deletions
--- a/Util/ASRClient.py
+++ b/Util/ASRClient.py
@@ -3,7 +3,13 @@ from http import HTTPStatus
 from dashscope.audio.asr import Recognition
 import dashscope
 import logging
+import os
+import shutil
+import subprocess
+import uuid
 from Config import Config
+from Config.Config import OBS_TMP_PREFIX, OBS_BUCKET
+from Util.ObsUtil import ObsUploader

 # 初始化日志记录器
 logger = logging.getLogger(__name__)
@@ -34,18 +40,10 @@ class ASRClient:
            logger.error(f"初始化ASR客户端失败: {str(e)}", exc_info=True)
            raise
    
-    def transcribe_file_sync(self, file_path):
+    def _transcribe_segment(self, file_path):
        """
-        转写本地音频文件 (同步版本)
-        
-        Args:
-            file_path: 本地音频文件路径
-            
-        Returns:
-            str: 转写后的文本，如果失败返回None
+        Internal method to transcribe a short audio segment
        """
-        logger.info(f"开始转写文件(Sync): {file_path}")
-        
        try:
            recognition = Recognition(
                model='paraformer-realtime-v1',
@@ -62,16 +60,94 @@ class ASRClient:
                    for s in result.output['sentence']:
                        sentences.append(s['text'])
                text = "".join(sentences)
-                logger.info("转写成功")
                return text
            else:
-                logger.error(f"转写失败: {result.code} - {result.message}")
+                logger.error(f"Segment transcription failed: {result.code} - {result.message}")
                return None
                
        except Exception as e:
-            logger.error(f"转写过程出错: {str(e)}", exc_info=True)
+            logger.error(f"Segment transcription error: {str(e)}", exc_info=True)
            return None

+    def transcribe_file_sync(self, file_path):
+        """
+        Transcribe a local audio file synchronously, splitting large files.
+
+        Files of at most 5 MB are sent to the recognizer in one request.
+        Larger files are re-encoded by ffmpeg into 60-second, 16 kHz mono MP3
+        chunks that are transcribed in order and concatenated.
+
+        Args:
+            file_path: Path of the local audio file.
+
+        Returns:
+            str: The transcribed text, or None if the file is missing,
+            splitting fails, or no chunk could be transcribed.
+        """
+        logger.info(f"开始转写文件(Sync): {file_path}")
+
+        if not os.path.exists(file_path):
+            logger.error(f"File not found: {file_path}")
+            return None
+
+        # Direct recognition reportedly failed on big inputs, so files over 5 MB are split.
+        file_size = os.path.getsize(file_path)
+        if file_size <= 5 * 1024 * 1024:
+            return self._transcribe_segment(file_path)
+
+        logger.info(f"File is large ({file_size} bytes), splitting into chunks...")
+
+        # Use a per-call directory: this method is dispatched to executor
+        # threads by transcribe_file, and a shared "temp_chunks" folder would
+        # let concurrent calls delete or transcribe each other's chunks.
+        chunk_dir = os.path.join(
+            os.path.dirname(file_path), f"temp_chunks_{uuid.uuid4().hex}"
+        )
+
+        try:
+            os.makedirs(chunk_dir)
+
+            # Re-encode rather than "-c copy" so every chunk is a consistent MP3.
+            # Five-digit names keep name order equal to time order past 999 chunks.
+            cmd = [
+                "ffmpeg", "-y", "-i", file_path,
+                "-f", "segment", "-segment_time", "60",
+                "-acodec", "libmp3lame", "-ar", "16000", "-ac", "1", "-q:a", "2",
+                os.path.join(chunk_dir, "out%05d.mp3"),
+            ]
+
+            # Discard stdout; stderr is captured for the error log below.
+            subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
+
+            chunks = sorted(os.path.join(chunk_dir, f) for f in os.listdir(chunk_dir) if f.endswith(".mp3"))
+            logger.info(f"Created {len(chunks)} chunks.")
+
+            full_text = []
+            for i, chunk in enumerate(chunks, start=1):
+                logger.info(f"Processing chunk {i}/{len(chunks)}")
+                text = self._transcribe_segment(chunk)
+                # None signals failure; "" is a valid result for a silent chunk.
+                if text is None:
+                    logger.warning(f"Chunk {i} failed to transcribe")
+                else:
+                    full_text.append(text)
+
+            if not full_text:
+                logger.error("Large file transcription failed: no chunk was transcribed")
+                return None
+
+            logger.info("Large file transcription completed")
+            return "".join(full_text)
+
+        except subprocess.CalledProcessError as e:
+            logger.error(f"FFmpeg splitting failed: {e.stderr.decode(errors='replace') if e.stderr else str(e)}")
+            return None
+        except Exception as e:
+            logger.error(f"Error during large file processing: {str(e)}", exc_info=True)
+            return None
+        finally:
+            shutil.rmtree(chunk_dir, ignore_errors=True)
+
    async def transcribe_file(self, file_path):
        """
        转写本地音频文件
@@ -84,3 +160,41 @@ class ASRClient:
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.transcribe_file_sync, file_path)
+
+    def upload_and_transcribe_sync(self, file_path):
+        """
+        Upload an audio file to the OBS temporary prefix, then transcribe it.
+
+        The upload is best-effort: a failed upload is logged and the local
+        file is still transcribed.
+
+        Args:
+            file_path: Path of the local audio file.
+
+        Returns:
+            str: The transcribed text, or None if transcription fails or an
+            exception is raised along the way.
+        """
+        try:
+            # Step 1: push a copy to OBS under a random key that keeps the
+            # source extension (".mp3" when the path has none).
+            obs_client = ObsUploader()
+            suffix = os.path.splitext(file_path)[1] or ".mp3"
+            obs_key = f"{OBS_TMP_PREFIX}/{uuid.uuid4()}{suffix}"
+            logger.info(f"Uploading {file_path} to OBS: {obs_key}")
+
+            uploaded, detail = obs_client.upload_file(obs_key, file_path, OBS_BUCKET)
+            if uploaded:
+                logger.info(f"Upload successful: {obs_key}")
+            else:
+                # Deliberately not fatal: the transcription below reads the
+                # local file, not the uploaded object.
+                logger.error(f"Failed to upload file to OBS: {detail}")
+
+            # Step 2: transcribe from the local path, which goes through the
+            # chunking logic in transcribe_file_sync.
+            return self.transcribe_file_sync(file_path)
+
+        except Exception as exc:
+            logger.error(f"Error in upload_and_transcribe: {exc}", exc_info=True)
+            return None
--- a/Util/DouYinDownloader.py
+++ b/Util/DouYinDownloader.py
@@ -0,0 +1,163 @@
+import os
+import re
+import logging
+import yt_dlp
+import uuid
+import requests
+from typing import Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+class DouYinDownloader:
+    """Download Douyin videos with yt-dlp, falling back to plain HTTP requests."""
+
+    def __init__(self):
+        """Set up the mobile request headers and the base yt-dlp options."""
+        self.mobile_headers = {
+            "User-Agent": "Mozilla/5.0 (Linux; Android 10; SM-G960F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.181 Mobile Safari/537.36",
+            "Referer": "https://www.douyin.com/"
+        }
+        self.ydl_opts = {
+            'format': 'best',  # Download best quality
+            'outtmpl': '%(id)s.%(ext)s',
+            'quiet': True,
+            'no_warnings': True,
+            'http_headers': self.mobile_headers,
+        }
+
+    def parse_share_text(self, text: str) -> Optional[str]:
+        """Return the first URL found in share text, or None if there is none."""
+        urls = self.extract_urls(text)
+        return urls[0] if urls else None
+
+    def extract_urls(self, text: str) -> list[str]:
+        """Extract all URLs from text"""
+        return re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
+
+    def extract_title_from_text(self, text: str) -> str:
+        """
+        Derive a title from share text by removing URLs, the
+        "...复制打开抖音...，看看" preamble and a leading 【...】 tag.
+
+        Returns "Unknown Title" when nothing is left.
+        """
+        # 1. Remove URLs
+        clean_text = re.sub(r'http[s]?://\S+', '', text)
+
+        # 2. Remove the share preamble, e.g.
+        # "3.00 12/28 d@n.dN VYZ:/ 复制打开抖音极速版，看看【聚合能研的作品】..."
+        clean_text = re.sub(r'.*?复制打开抖音.*?，看看', '', clean_text)
+
+        # 3. Remove a leading 【...】 (usually the author name)
+        clean_text = re.sub(r'^\s*【.*?】', '', clean_text)
+
+        # 4. Trim whitespace; fall back to a placeholder when nothing is left
+        clean_text = clean_text.strip()
+        return clean_text if clean_text else "Unknown Title"
+
+    def get_video_info(self, url: str) -> Tuple[Optional[str], Optional[str]]:
+        """
+        Get video title and real URL using yt-dlp
+        Returns: (title, webpage_url), or (None, None) on error
+        """
+        try:
+            with yt_dlp.YoutubeDL(self.ydl_opts) as ydl:
+                info = ydl.extract_info(url, download=False)
+                return info.get('title'), info.get('webpage_url')
+        except Exception as e:
+            logger.error(f"Error getting video info: {e}")
+            return None, None
+
+    def download_video_fallback(self, url: str, output_dir: str) -> Tuple[Optional[str], Optional[str]]:
+        """
+        Fallback download using requests and the mobile User-Agent.
+
+        Fetches the share page, takes the first "url_list" entry containing
+        "playwm" and streams it to <output_dir>/<uuid>.mp4.
+
+        Returns: (local_file_path, video_title), or (None, None) on failure.
+        """
+        filename = None
+        try:
+            logger.info(f"Attempting fallback download for {url}")
+
+            # 1. Fetch the page behind the share link (redirects are followed)
+            with requests.Session() as session:
+                response = session.get(url, headers=self.mobile_headers, allow_redirects=True, timeout=10)
+                content = response.text
+
+            # 2. Extract video URL
+            video_url = None
+            for u in re.findall(r'"url_list":\["(.*?)"\]', content):
+                if "playwm" in u:
+                    video_url = u.replace("\\u002F", "/")
+                    break
+
+            if not video_url:
+                logger.error("Fallback: No video URL found in page content")
+                return None, None
+
+            # 3. Stream the video to disk; the with-block releases the connection
+            logger.info(f"Fallback downloading video from {video_url}")
+            with requests.get(video_url, headers=self.mobile_headers, stream=True, timeout=30) as r:
+                if r.status_code != 200:
+                    logger.error(f"Fallback download failed with status {r.status_code}")
+                    return None, None
+                os.makedirs(output_dir, exist_ok=True)
+                filename = os.path.join(output_dir, f'{uuid.uuid4()}.mp4')
+                with open(filename, 'wb') as f:
+                    for chunk in r.iter_content(chunk_size=1024*1024):
+                        if chunk:
+                            f.write(chunk)
+
+            # Try to extract title
+            title = "Unknown Title"
+            title_match = re.search(r'<title>(.*?)</title>', content)
+            if title_match:
+                title = title_match.group(1).replace(" - 抖音", "")
+            return filename, title
+
+        except Exception as e:
+            logger.error(f"Fallback download error: {e}")
+            if filename:
+                # Do not leave a truncated video behind
+                try:
+                    os.remove(filename)
+                except OSError:
+                    pass
+            return None, None
+
+    def download_video(self, url: str, output_dir: str) -> Tuple[Optional[str], Optional[str]]:
+        """
+        Download video to output_dir with a UUID filename, trying yt-dlp
+        first and download_video_fallback if yt-dlp raises.
+
+        Returns: (local_file_path, video_title), or (None, None) on failure.
+        """
+        try:
+            os.makedirs(output_dir, exist_ok=True)
+
+            file_uuid = str(uuid.uuid4())
+            opts = self.ydl_opts.copy()
+            opts['outtmpl'] = os.path.join(output_dir, f'{file_uuid}.%(ext)s')
+
+            try:
+                with yt_dlp.YoutubeDL(opts) as ydl:
+                    info = ydl.extract_info(url, download=True)
+                    filename = ydl.prepare_filename(info)
+                    # The file on disk may have a different extension than the
+                    # prepared name, so fall back to a lookup by UUID prefix.
+                    if not os.path.exists(filename):
+                        for f in os.listdir(output_dir):
+                            if f.startswith(file_uuid):
+                                filename = os.path.join(output_dir, f)
+                                break
+
+                    return filename, info.get('title')
+            except Exception as e:
+                logger.warning(f"yt-dlp failed, trying fallback: {e}")
+                return self.download_video_fallback(url, output_dir)
+
+        except Exception as e:
+            logger.error(f"Error downloading video: {e}")
+            return None, None
--- a/Util/pycache/ASRClient.cpython-310.pyc
+++ b/Util/pycache/ASRClient.cpython-310.pyc
--- a/Util/pycache/DouYinDownloader.cpython-310.pyc
+++ b/Util/pycache/DouYinDownloader.cpython-310.pyc