# aiData/Util/DouYinDownloader.py

import os
import re
import logging
import yt_dlp
import uuid
import requests
from typing import Optional, Tuple

logger = logging.getLogger(__name__)

class DouYinDownloader:
    """Download Douyin videos referenced by share text or share links.

    ``download_video`` tries yt-dlp first. If that raises (or produces no
    file), ``download_video_fallback`` fetches the share page with a mobile
    User-Agent, scrapes a video URL out of the page source and streams it to
    disk with ``requests``.
    """

    # Permissive URL matcher for share text. Inside the character class,
    # ``$-_`` is a *range* (ASCII 0x24-0x5F); that range is what allows
    # ``/``, ``:``, ``?`` and ``=`` to match, so it must not be "fixed" by
    # escaping the hyphen.
    _URL_RE = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    # Captures the raw inside of a JSON ``"url_list":["...", "..."]`` array
    # (everything between the outer quotes), which may hold several URLs.
    _URL_LIST_RE = re.compile(r'"url_list":\["(.*?)"\]')
    _PAGE_TITLE_RE = re.compile(r'<title>(.*?)</title>')

    def __init__(self):
        """Set up the request headers and the base yt-dlp options."""
        self.mobile_headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 10; SM-G960F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.181 Mobile Safari/537.36",
            "Referer": "https://www.douyin.com/"
        }
        # Base options; download_video() copies these and overrides 'outtmpl'.
        # NOTE: a 'proxy' entry can be added here if one is needed.
        self.ydl_opts = {
            'format': 'best',  # Download best quality
            'outtmpl': '%(id)s.%(ext)s',
            'quiet': True,
            'no_warnings': True,
            'http_headers': self.mobile_headers,
        }

    def parse_share_text(self, text: str) -> Optional[str]:
        """Return the first URL found in ``text``, or ``None`` if there is none."""
        urls = self.extract_urls(text)
        return urls[0] if urls else None

    def extract_urls(self, text: str) -> list[str]:
        """Return every URL found in ``text``, in order of appearance."""
        return self._URL_RE.findall(text)

    def extract_title_from_text(self, text: str) -> str:
        """Derive a title from share text.

        Strips URLs, the leading "复制打开抖音…，看看" boilerplate and a leading
        【...】 segment, then trims whitespace.

        Example input:
            "3.00 12/28 d@n.dN VYZ:/ 复制打开抖音极速版，看看【聚合能研的作品】..."

        Returns:
            The cleaned text, or ``"Unknown Title"`` when nothing is left.
        """
        # 1. Remove URLs.
        clean_text = re.sub(r'http[s]?://\S+', '', text)
        # 2. Remove everything up to and including the "copy and open Douyin
        #    ..., look at" share prefix.
        clean_text = re.sub(r'.*?复制打开抖音.*?，看看', '', clean_text)
        # 3. Remove a leading 【...】 segment (usually the author name).
        clean_text = re.sub(r'^\s*【.*?】', '', clean_text)
        # 4. Trim; the text is intentionally not truncated.
        clean_text = clean_text.strip()
        return clean_text if clean_text else "Unknown Title"

    def get_video_info(self, url: str) -> Tuple[Optional[str], Optional[str]]:
        """Look up a video's metadata with yt-dlp without downloading it.

        Returns:
            ``(title, webpage_url)`` taken from the yt-dlp info dict, or
            ``(None, None)`` if extraction raised.
        """
        try:
            with yt_dlp.YoutubeDL(self.ydl_opts) as ydl:
                info = ydl.extract_info(url, download=False)
                return info.get('title'), info.get('webpage_url')
        except Exception as e:
            logger.error(f"Error getting video info: {e}")
            return None, None

    @classmethod
    def _extract_video_url(cls, content: str) -> Optional[str]:
        """Return the first "playwm" URL found in any ``url_list`` of ``content``.

        Each regex capture is the raw inside of a JSON array and can contain
        several URLs joined by ``","``; they are split apart so that a single
        URL is returned rather than the concatenation of all of them.
        ``\\u002F`` escapes are turned back into ``/``.
        """
        for raw_list in cls._URL_LIST_RE.findall(content):
            for candidate in raw_list.split('","'):
                if "playwm" in candidate:
                    return candidate.replace("\\u002F", "/")
        return None

    @classmethod
    def _extract_page_title(cls, content: str) -> str:
        """Return the page ``<title>`` minus the " - 抖音" suffix, or "Unknown Title"."""
        match = cls._PAGE_TITLE_RE.search(content)
        if match is None:
            return "Unknown Title"
        return match.group(1).replace(" - 抖音", "")

    @staticmethod
    def _find_by_prefix(directory: str, prefix: str) -> Optional[str]:
        """Return the path of the first entry in ``directory`` starting with ``prefix``."""
        for entry in os.listdir(directory):
            if entry.startswith(prefix):
                return os.path.join(directory, entry)
        return None

    def download_video_fallback(self, url: str, output_dir: str) -> Tuple[Optional[str], Optional[str]]:
        """Download a video by scraping the share page with a mobile User-Agent.

        Args:
            url: Share link; redirects are followed.
            output_dir: Directory for the downloaded file (created if missing).

        Returns:
            ``(local_file_path, title)`` on success, ``(None, None)`` if no
            video URL is found, the media request is not HTTP 200, or any
            exception occurs. A partially written file is removed on error.
        """
        filename = None
        try:
            logger.info(f"Attempting fallback download for {url}")

            # One session for both requests so connections are reused and
            # closed deterministically.
            with requests.Session() as session:
                # 1. Fetch the share page (following redirects).
                response = session.get(url, headers=self.mobile_headers, allow_redirects=True, timeout=10)
                content = response.text

                # 2. Extract the video URL from the page source.
                video_url = self._extract_video_url(content)
                if not video_url:
                    logger.error("Fallback: No video URL found in page content")
                    return None, None

                # 3. Stream the video to a UUID-named file.
                os.makedirs(output_dir, exist_ok=True)
                filename = os.path.join(output_dir, f'{uuid.uuid4()}.mp4')

                logger.info(f"Fallback downloading video from {video_url}")

                with session.get(video_url, headers=self.mobile_headers, stream=True, timeout=30) as r:
                    if r.status_code != 200:
                        logger.error(f"Fallback download failed with status {r.status_code}")
                        return None, None
                    with open(filename, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=1024 * 1024):
                            if chunk:
                                f.write(chunk)

            return filename, self._extract_page_title(content)

        except Exception as e:
            logger.error(f"Fallback download error: {e}")
            # Do not leave a truncated file behind for callers to trip over.
            if filename and os.path.exists(filename):
                try:
                    os.remove(filename)
                except OSError as cleanup_error:
                    logger.warning(f"Could not remove partial file {filename}: {cleanup_error}")
            return None, None

    def download_video(self, url: str, output_dir: str) -> Tuple[Optional[str], Optional[str]]:
        """Download a video into ``output_dir`` under a UUID-based filename.

        yt-dlp is tried first; if it raises, or no downloaded file can be
        located afterwards, ``download_video_fallback`` is used instead.

        Returns:
            ``(local_file_path, video_title)``, or ``(None, None)`` on failure.
        """
        try:
            os.makedirs(output_dir, exist_ok=True)

            file_uuid = str(uuid.uuid4())
            # Per-download copy so the shared base options stay untouched.
            opts = self.ydl_opts.copy()
            opts['outtmpl'] = os.path.join(output_dir, f'{file_uuid}.%(ext)s')

            try:
                with yt_dlp.YoutubeDL(opts) as ydl:
                    info = ydl.extract_info(url, download=True)
                    filename = ydl.prepare_filename(info)

                # The name computed from the template may not be the file
                # actually written (e.g. a different extension), so fall back
                # to searching for the UUID prefix.
                if not os.path.exists(filename):
                    filename = self._find_by_prefix(output_dir, file_uuid)
                if filename is None:
                    raise FileNotFoundError(
                        f"no downloaded file with prefix {file_uuid} in {output_dir}"
                    )

                return filename, info.get('title')
            except Exception as e:
                logger.warning(f"yt-dlp failed, trying fallback: {e}")
                return self.download_video_fallback(url, output_dir)

        except Exception as e:
            logger.error(f"Error downloading video: {e}")
            return None, None