164 lines
6.4 KiB
Python
164 lines
6.4 KiB
Python
|
|
import os
|
||
|
|
import re
|
||
|
|
import logging
|
||
|
|
import yt_dlp
|
||
|
|
import uuid
|
||
|
|
import requests
|
||
|
|
from typing import Optional, Tuple
|
||
|
|
|
||
|
|
# Module-level logger named after this module; used by DouYinDownloader below.
logger = logging.getLogger(__name__)
|
||
|
|
|
||
|
|
class DouYinDownloader:
    """Download Douyin videos from share links.

    The primary path uses yt-dlp; when that raises, a fallback fetches the
    share page with a mobile User-Agent and pulls the play URL out of the
    page content with regular expressions.
    """

    def __init__(self):
        """Set up the HTTP headers and base yt-dlp options shared by all calls."""
        # Headers sent with every request made by this class (and handed to yt-dlp).
        self.mobile_headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 10; SM-G960F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.181 Mobile Safari/537.36",
            "Referer": "https://www.douyin.com/"
        }
        # Base yt-dlp options; download_video() overrides 'outtmpl' per call.
        self.ydl_opts = {
            'format': 'best',  # Download best quality
            'outtmpl': '%(id)s.%(ext)s',
            'quiet': True,
            'no_warnings': True,
            'http_headers': self.mobile_headers,
            # 'proxy': '...',  # Add proxy if needed
        }

    def parse_share_text(self, text: str) -> Optional[str]:
        """Return the first URL found in ``text``, or None when there is none."""
        urls = self.extract_urls(text)
        return urls[0] if urls else None

    def extract_urls(self, text: str) -> list[str]:
        """Return every http/https URL found in ``text``, in order of appearance."""
        return re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)

    def extract_title_from_text(self, text: str) -> str:
        """Derive a title from share text.

        URLs, the leading "copy and open Douyin ..., take a look" phrase and a
        leading bracketed 【...】 segment are removed, then surrounding
        whitespace is stripped.

        Returns:
            The cleaned text, or "Unknown Title" when nothing is left.
        """
        # 1. Remove URLs
        clean_text = re.sub(r'http[s]?://\S+', '', text)

        # 2. Remove everything up to and including the "copy and open Douyin
        #    ..., take a look" phrase (the Chinese literals in the pattern).
        clean_text = re.sub(r'.*?复制打开抖音.*?,看看', '', clean_text)

        # 3. Remove 【...】 if it's at the start (usually author name)
        clean_text = re.sub(r'^\s*【.*?】', '', clean_text)

        # 4. Clean up whitespace; the text is deliberately not truncated.
        clean_text = clean_text.strip()

        return clean_text if clean_text else "Unknown Title"

    def get_video_info(self, url: str) -> Tuple[Optional[str], Optional[str]]:
        """Look up a video's metadata with yt-dlp without downloading it.

        Returns:
            ``(title, webpage_url)`` as reported by yt-dlp, or ``(None, None)``
            if extraction raises (the error is logged).
        """
        try:
            with yt_dlp.YoutubeDL(self.ydl_opts) as ydl:
                info = ydl.extract_info(url, download=False)
                return info.get('title'), info.get('webpage_url')
        except Exception as e:
            logger.error(f"Error getting video info: {e}")
            return None, None

    @staticmethod
    def _extract_play_url(content: str) -> Optional[str]:
        """Return the first "playwm" URL found in a ``"url_list":[...]`` entry.

        ``\\u002F`` escapes in the URL are converted to ``/``. Returns None
        when no entry contains "playwm".
        """
        for raw_list in re.findall(r'"url_list":\["(.*?)"\]', content):
            # The capture spans the whole list body, so a list with several
            # quoted entries still contains the `","` separators; split it
            # back into individual URLs instead of treating it as one.
            for candidate in raw_list.split('","'):
                if "playwm" in candidate:
                    return candidate.replace("\\u002F", "/")
        return None

    def download_video_fallback(self, url: str, output_dir: str) -> Tuple[Optional[str], Optional[str]]:
        """Download a video with plain HTTP requests and a mobile User-Agent.

        The share page is fetched (following redirects), a play URL is
        extracted from its content, and the video is streamed to
        ``<output_dir>/<uuid>.mp4``. ``output_dir`` is created if missing.

        Returns:
            ``(local_file_path, title)`` on success, where the title comes
            from the page's ``<title>`` tag or is "Unknown Title";
            ``(None, None)`` on any failure (the error is logged).
        """
        try:
            logger.info(f"Attempting fallback download for {url}")

            # 1. Fetch the share page, following redirects. The session is
            #    closed as soon as the page body has been read.
            with requests.Session() as session:
                response = session.get(url, headers=self.mobile_headers, allow_redirects=True, timeout=10)
                content = response.text

            # 2. Extract video URL
            video_url = self._extract_play_url(content)
            if not video_url:
                logger.error("Fallback: No video URL found in page content")
                return None, None

            # 3. Download video
            os.makedirs(output_dir, exist_ok=True)
            filename = os.path.join(output_dir, f'{uuid.uuid4()}.mp4')

            logger.info(f"Fallback downloading video from {video_url}")

            # Stream the body; the context manager releases the connection
            # even when the status check or the write fails.
            with requests.get(video_url, headers=self.mobile_headers, stream=True, timeout=30) as r:
                if r.status_code != 200:
                    logger.error(f"Fallback download failed with status {r.status_code}")
                    return None, None

                try:
                    with open(filename, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=1024 * 1024):
                            if chunk:
                                f.write(chunk)
                except Exception:
                    # Do not leave a truncated file behind for callers to find.
                    if os.path.exists(filename):
                        os.remove(filename)
                    raise

            # Try to extract title
            title = "Unknown Title"
            title_match = re.search(r'<title>(.*?)</title>', content)
            if title_match:
                title = title_match.group(1).replace(" - 抖音", "")

            return filename, title

        except Exception as e:
            logger.error(f"Fallback download error: {e}")
            return None, None

    def download_video(self, url: str, output_dir: str) -> Tuple[Optional[str], Optional[str]]:
        """Download a video into ``output_dir`` under a UUID-based filename.

        yt-dlp is tried first; if it raises, download_video_fallback() is
        used instead. ``output_dir`` is created if missing.

        Returns:
            ``(local_file_path, video_title)``, or ``(None, None)`` on failure.
        """
        try:
            # exist_ok avoids the check-then-create race of a separate exists() test.
            os.makedirs(output_dir, exist_ok=True)

            file_uuid = str(uuid.uuid4())
            # Per-download copy of the options so the shared defaults stay untouched.
            opts = self.ydl_opts.copy()
            opts['outtmpl'] = os.path.join(output_dir, f'{file_uuid}.%(ext)s')

            try:
                with yt_dlp.YoutubeDL(opts) as ydl:
                    info = ydl.extract_info(url, download=True)
                    filename = ydl.prepare_filename(info)
                    # The prepared name may not match the file on disk (e.g.
                    # a different final extension), so fall back to looking
                    # for any file that starts with this download's UUID.
                    if not os.path.exists(filename):
                        for entry in os.listdir(output_dir):
                            if entry.startswith(file_uuid):
                                filename = os.path.join(output_dir, entry)
                                break

                    return filename, info.get('title')
            except Exception as e:
                logger.warning(f"yt-dlp failed, trying fallback: {e}")
                return self.download_video_fallback(url, output_dir)

        except Exception as e:
            logger.error(f"Error downloading video: {e}")
            return None, None
|