Files
aiData/Util/DouYinDownloader.py
HuangHai 55e88777d9 'commit'
2026-01-20 21:43:54 +08:00

164 lines
6.4 KiB
Python

import os
import re
import logging
import yt_dlp
import uuid
import requests
from typing import Optional, Tuple
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class DouYinDownloader:
    """Parse Douyin share text and download the video it points to.

    Downloads are attempted with yt-dlp first; if that raises (or produces
    no file), a plain ``requests`` based fallback scrapes the share page
    using a mobile User-Agent.
    """

    # NOTE: inside the third character class ``$-_`` is a *range*
    # (0x24-0x5F), which is what lets ``/``, ``:``, ``?`` and ``=`` match.
    # Do not "tidy" it without replacing that coverage.
    _URL_RE = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    # Captures the inside of a JSON-ish ``"url_list":["...","..."]`` array
    # (without the outermost quotes) as found in the share page source.
    _URL_LIST_RE = re.compile(r'"url_list":\["(.*?)"\]')
    _HTML_TITLE_RE = re.compile(r'<title>(.*?)</title>')

    def __init__(self):
        self.mobile_headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 10; SM-G960F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.181 Mobile Safari/537.36",
            "Referer": "https://www.douyin.com/"
        }
        # Base yt-dlp options; download_video() copies these and overrides
        # 'outtmpl' per download. A 'proxy' entry can be added here if needed.
        self.ydl_opts = {
            'format': 'best',  # Download best quality
            'outtmpl': '%(id)s.%(ext)s',
            'quiet': True,
            'no_warnings': True,
            'http_headers': self.mobile_headers,
        }

    def parse_share_text(self, text: str) -> Optional[str]:
        """Return the first URL found in *text*, or None if there is none."""
        urls = self.extract_urls(text)
        return urls[0] if urls else None

    def extract_urls(self, text: str) -> list[str]:
        """Return every http(s) URL found in *text*, in order of appearance.

        Empty or None input yields an empty list.
        """
        if not text:
            return []
        return self._URL_RE.findall(text)

    def extract_title_from_text(self, text: str) -> str:
        """Derive a title from share text.

        Strips URLs, the "复制打开抖音...看看" share prefix and a leading
        【...】 group, then trims whitespace.

        Returns:
            The cleaned text, or "Unknown Title" when nothing is left
            (including for empty or None input).
        """
        if not text:
            return "Unknown Title"

        # 1. Remove URLs
        clean_text = re.sub(r'http[s]?://\S+', '', text)

        # 2. Remove the share prefix, e.g.
        #    "3.00 12/28 d@n.dN VYZ:/ 复制打开抖音极速版,看看【聚合能研的作品】..."
        #    Both the full-width and the ASCII comma are accepted before "看看".
        clean_text = re.sub(r'.*?复制打开抖音.*?[,,]看看', '', clean_text)

        # 3. Remove a leading 【...】 group (usually the author name)
        clean_text = re.sub(r'^\s*【.*?】', '', clean_text)

        # 4. Clean up whitespace; the text is deliberately not truncated.
        clean_text = clean_text.strip()

        return clean_text if clean_text else "Unknown Title"

    def get_video_info(self, url: str) -> Tuple[Optional[str], Optional[str]]:
        """Query yt-dlp for metadata without downloading.

        Returns:
            (title, webpage_url) from the yt-dlp info dict, or (None, None)
            if anything goes wrong; the error is logged.
        """
        try:
            with yt_dlp.YoutubeDL(self.ydl_opts) as ydl:
                info = ydl.extract_info(url, download=False)
                return info.get('title'), info.get('webpage_url')
        except Exception as e:
            logger.error("Error getting video info: %s", e)
            return None, None

    @staticmethod
    def _pick_video_url(content: str) -> Optional[str]:
        """Return the first "playwm" URL listed in any url_list of *content*.

        A url_list may hold several comma-separated quoted entries; each
        entry is inspected on its own so the result is always a single URL.
        Escaped slashes (the literal text ``\\u002F``) are converted to ``/``.
        """
        for raw_list in DouYinDownloader._URL_LIST_RE.findall(content):
            for candidate in raw_list.split('","'):
                if "playwm" in candidate:
                    return candidate.replace("\\u002F", "/")
        return None

    def download_video_fallback(self, url: str, output_dir: str) -> Tuple[Optional[str], Optional[str]]:
        """Fallback download using requests and the mobile User-Agent.

        Fetches the share page (following redirects), looks for a "playwm"
        video URL in its source and streams that URL to
        ``<output_dir>/<uuid>.mp4``.

        Returns:
            (local_file_path, title) on success, (None, None) on any
            failure. The title comes from the page's <title> tag with
            " - 抖音" removed, or "Unknown Title" if there is no such tag.
        """
        filename = None
        try:
            logger.info("Attempting fallback download for %s", url)
            os.makedirs(output_dir, exist_ok=True)

            # 1. Fetch the share page (redirects are followed)
            with requests.Session() as session:
                response = session.get(
                    url,
                    headers=self.mobile_headers,
                    allow_redirects=True,
                    timeout=10,
                )
                content = response.text

            # 2. Extract video URL
            video_url = self._pick_video_url(content)
            if not video_url:
                logger.error("Fallback: No video URL found in page content")
                return None, None

            # 3. Download video
            filename = os.path.join(output_dir, f'{uuid.uuid4()}.mp4')
            logger.info("Fallback downloading video from %s", video_url)

            # Stream the body; the context manager releases the connection
            # even if writing fails part-way through.
            with requests.get(
                video_url,
                headers=self.mobile_headers,
                stream=True,
                timeout=30,
            ) as r:
                if r.status_code != 200:
                    logger.error("Fallback download failed with status %s", r.status_code)
                    return None, None
                with open(filename, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)

            # Try to extract title
            title = "Unknown Title"
            title_match = self._HTML_TITLE_RE.search(content)
            if title_match:
                title = title_match.group(1).replace(" - 抖音", "")
            return filename, title
        except Exception as e:
            logger.error("Fallback download error: %s", e)
            # Best effort: do not leave a truncated file behind.
            if filename is not None:
                try:
                    os.remove(filename)
                except OSError:
                    pass
            return None, None

    def download_video(self, url: str, output_dir: str) -> Tuple[Optional[str], Optional[str]]:
        """Download the video at *url* into *output_dir* under a UUID name.

        yt-dlp is tried first. If it raises, or finishes without leaving a
        file whose name starts with the generated UUID, the result of
        download_video_fallback() is returned instead.

        Returns:
            (local_file_path, video_title), or (None, None) on failure.
        """
        try:
            os.makedirs(output_dir, exist_ok=True)

            file_uuid = str(uuid.uuid4())

            # Per-download options: only the output template changes.
            opts = self.ydl_opts.copy()
            opts['outtmpl'] = os.path.join(output_dir, f'{file_uuid}.%(ext)s')

            try:
                with yt_dlp.YoutubeDL(opts) as ydl:
                    info = ydl.extract_info(url, download=True)
                    filename = ydl.prepare_filename(info)

                # prepare_filename() may not match the file actually written
                # (e.g. a different final extension), so look it up by UUID.
                if not os.path.exists(filename):
                    filename = next(
                        (
                            os.path.join(output_dir, entry)
                            for entry in os.listdir(output_dir)
                            if entry.startswith(file_uuid)
                        ),
                        None,
                    )
                if filename is None:
                    raise FileNotFoundError(
                        f"no downloaded file starting with {file_uuid} in {output_dir}"
                    )

                return filename, info.get('title')
            except Exception as e:
                logger.warning("yt-dlp failed, trying fallback: %s", e)
                return self.download_video_fallback(url, output_dir)
        except Exception as e:
            logger.error("Error downloading video: %s", e)
            return None, None