Files
aiData/DouYin/Test/T2_BigV.py

204 lines
8.2 KiB
Python
Raw Normal View History

2026-02-27 14:57:13 +08:00
"""
Douyin User Profile & Video Crawler (T2)
Visits the page named in this docstring and fetches its content: the user's profile information and most recent works.
URL: https://www.douyin.com/user/MS4wLjABAAAA2P7MeZl0VUsDmCzKbZeLlVGVTDRAuXmvr_zcC6XNqd-6R4n9ssCguSgA-gnBfjUO
"""
import sys
import os
import re
import json
import yaml
import time
import io
from datetime import datetime
# Force UTF-8 console output on Windows so Chinese text and emoji are not
# garbled by the console's default encoding.
if sys.platform.startswith('win'):
    for _stream_name in ('stdout', 'stderr'):
        _stream = getattr(sys, _stream_name, None)
        if hasattr(_stream, 'reconfigure'):
            # Switch the encoding in place: the stream object itself is kept,
            # so settings that are not passed here stay as they were.
            _stream.reconfigure(encoding='utf-8')
        elif hasattr(_stream, 'buffer'):
            # Fallback for text streams without reconfigure(): re-wrap the
            # underlying binary buffer.
            setattr(sys, _stream_name,
                    io.TextIOWrapper(_stream.buffer, encoding='utf-8'))
        # Streams that are None or expose neither API (e.g. a replaced
        # stdout in some hosts) are left untouched instead of crashing.
# =================================================================
# 1. 环境配置与路径初始化
# =================================================================
# Resolve where this script lives: its own directory is the Test folder and
# the directory above that is treated as the DouYin root.
current_file_path = os.path.abspath(__file__)
current_dir = os.path.split(current_file_path)[0]
douyin_root = os.path.split(current_dir)[0]

# Make the DouYin root importable; skip the append when it is already listed.
if sys.path.count(douyin_root) == 0:
    sys.path.append(douyin_root)
# Import the core crawler modules. Nothing below can run without them, so an
# import failure is reported and the script exits with status 1.
try:
    from apiproxy.douyin.douyin import Douyin
    from apiproxy.douyin import douyin_headers
except ImportError as import_error:
    print(f"[ Error ]: Failed to import core modules. Error: {import_error}")
    raise SystemExit(1)
# =================================================================
# 2. 工具函数定义
# =================================================================
def extract_url_from_file(file_path):
    """Return the first Douyin user-profile URL found in a text file.

    The main entry point passes this script's own path, so the URL written
    in the module docstring becomes the crawl target.

    Args:
        file_path: Path of the UTF-8 text file to scan.

    Returns:
        The first matching URL as a string, or ``None`` when the file holds
        no such URL or cannot be read or decoded.
    """
    url_pattern = r'https?://(?:www\.)?douyin\.com/user/[a-zA-Z0-9\-_]+'
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, ValueError) as e:
        # OSError: missing or unreadable file.
        # ValueError: bytes that are not valid UTF-8 (UnicodeDecodeError).
        print(f"[ Error ]: Failed to read script file: {e}")
        return None

    # Only the first occurrence is needed, so stop at the first match.
    match = re.search(url_pattern, content)
    return match.group(0) if match else None
def load_config():
    """Load ``config_douyin.yml`` from the Test directory or the DouYin root.

    The copy next to this script (``current_dir``) takes precedence; the one
    in ``douyin_root`` is used only when the first does not exist.

    Returns:
        dict: The parsed configuration. An empty dict is returned when the
        file is missing, cannot be read or parsed, is empty, or its top
        level is not a mapping, so callers can always use ``.get()``.
    """
    config_name = "config_douyin.yml"
    test_config_path = os.path.join(current_dir, config_name)
    root_config_path = os.path.join(douyin_root, config_name)
    config_path = test_config_path if os.path.exists(test_config_path) else root_config_path

    if not os.path.exists(config_path):
        print(f"[ Warning ]: config_douyin.yml not found.")
        return {}

    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            print(f"[ Info ]: Loading config from: {os.path.abspath(config_path)}")
            loaded = yaml.safe_load(f)
    except Exception as e:
        # Best effort: a broken config must not stop the script.
        print(f"[ Warning ]: Failed to parse config: {e}")
        return {}

    # safe_load yields None for an empty document and a list/scalar for a
    # non-mapping root; neither supports the dict access callers rely on.
    if isinstance(loaded, dict):
        return loaded
    print("[ Warning ]: Config is empty or not a mapping; ignoring it.")
    return {}
def setup_cookies(config):
    """Install the request Cookie into the shared ``douyin_headers`` mapping.

    A raw ``cookie`` string in the config takes precedence. If the
    ``cookies`` mapping also carries an ``msToken`` that the raw string
    lacks, it is prepended. Without a raw string, the ``cookies`` mapping is
    joined into ``key=value; key=value`` form.

    Args:
        config: Parsed configuration mapping. ``None``, or null values for
            ``cookie`` / ``cookies``, are treated as "not configured".

    Returns:
        None. The result is written to ``douyin_headers["Cookie"]``; a
        warning is printed when no cookie ends up being set.
    """
    config = config or {}
    # A YAML key present with no value parses to None, so fall back
    # explicitly rather than relying on the .get() default. Surrounding
    # whitespace/newlines (e.g. from a block scalar) are stripped before the
    # value is stored as the Cookie header.
    cookie_str = str(config.get("cookie") or "").strip()
    cookies_dict = config.get("cookies") or {}

    if cookie_str:
        ms_token = cookies_dict.get("msToken")
        if ms_token and "msToken=" not in cookie_str:
            cookie_str = f"msToken={ms_token}; " + cookie_str
        douyin_headers["Cookie"] = cookie_str
        print(f"[ Info ]: Using raw cookie string (Length: {len(cookie_str)})")
    elif cookies_dict:
        cookie_str = "; ".join(f"{k}={v}" for k, v in cookies_dict.items())
        douyin_headers["Cookie"] = cookie_str
        print(f"[ Info ]: Using combined cookies from dict (Count: {len(cookies_dict)})")

    if not douyin_headers.get("Cookie"):
        print("[ Warning ]: No valid Cookie detected. Most API calls will fail.")
        print(" Please run T1_GetCookie.py first.")
# =================================================================
# 3. 核心爬取逻辑
# =================================================================
def _format_create_time(ctime):
    """Render a work's ``create_time`` value for display.

    Numeric values are treated as Unix timestamps and formatted in local
    time; anything else, or a timestamp the platform cannot convert, is
    shown via ``str()``.
    """
    if isinstance(ctime, (int, float)):
        try:
            return datetime.fromtimestamp(ctime).strftime('%Y-%m-%d %H:%M')
        except (OverflowError, OSError, ValueError):
            # Out-of-range timestamp: fall through to the raw value.
            return str(ctime)
    return str(ctime)


def _print_user_summary(user_data):
    """Print the blogger's headline profile fields as a framed summary."""
    print("\n" + "="*60)
    print(f"Blogger: {user_data.get('nickname', 'Unknown')}")
    print("-" * 60)
    print(f"ID: {user_data.get('unique_id') or user_data.get('short_id', 'Unknown')}")
    print(f"Bio: {user_data.get('signature', 'N/A')}")
    print(f"Followers:{user_data.get('m_follower_count') or user_data.get('follower_count', 0)}")
    print(f"Likes: {user_data.get('total_favorited', 0)}")
    print(f"Following:{user_data.get('following_count', 0)}")
    print("="*60 + "\n")


def _print_recent_videos(aweme_list):
    """Print a numbered entry (time, title, counters, link) for each work."""
    print(f"Successfully fetched {len(aweme_list)} videos:\n")
    for i, aweme in enumerate(aweme_list, 1):
        ctime_str = _format_create_time(aweme.get('create_time'))
        # `or` also covers a key that is present but null/empty, which the
        # .get() default alone would not.
        desc = aweme.get('desc') or 'No Title'
        if len(desc) > 50:
            desc = desc[:47] + "..."
        stats = aweme.get('statistics') or {}
        likes = stats.get('digg_count') or 0
        comments = stats.get('comment_count') or 0
        collects = stats.get('collect_count') or 0
        print(f"{i:02d}. [{ctime_str}] {desc}")
        print(f" ❤️ {likes:<8} 💬 {comments:<8} ⭐ {collects}")
        print(f" 🔗 https://www.douyin.com/video/{aweme.get('aweme_id')}\n")


def crawl_user_profile(url):
    """Fetch a Douyin user's profile and recent works, print and save them.

    Steps:
        1. Resolve ``url`` to a ``sec_uid`` and fetch the profile details.
        2. Fetch the recent works (requested with ``count=10``).
        3. Save profile, works and crawl time as
           ``user_data_<first 8 chars of sec_uid>.json`` next to this script.

    Every failure is reported on stdout. A failure in step 1 aborts the
    crawl; a failure in step 2 still saves the profile with an empty work
    list.

    Args:
        url: Douyin user-profile URL.

    Returns:
        None.
    """
    dy = Douyin()
    # Raise the request timeout; presumably read by the Douyin client's
    # request logic, which is not visible in this file.
    dy.timeout = 30
    print(f"[ Start ]: Target URL: {url}")

    # Step 1: resolve the URL to a sec_uid and fetch the profile.
    print("[ Step 1/2 ]: Fetching blogger profile...")
    try:
        key_type, sec_uid = dy.getKey(url)
        if key_type != "user" or not sec_uid:
            print(f"[ Failed ]: Invalid URL or failed to parse sec_uid. Type: {key_type}")
            return
        print(f" (sec_uid: {sec_uid[:15]}...)")

        user_detail = dy.getUserDetailInfo(sec_uid)
        if not user_detail or user_detail.get("status_code") != 0:
            print(f"[ Failed ]: Failed to fetch profile. Status: {user_detail.get('status_code') if user_detail else 'None'}")
            if user_detail:
                print(f" Msg: {user_detail.get('status_msg', 'Unknown error')}")
            print(" Hint: Please check if Cookie is expired or network is restricted.")
            return
    except Exception as e:
        print(f"[ Error ]: An unexpected error occurred: {e}")
        return

    # A null "user" field must not break the summary below.
    user_data = user_detail.get("user") or {}
    _print_user_summary(user_data)

    # Step 2: fetch the recent works. A failure here is reported like step 1
    # but does not discard the profile that was already fetched.
    print(f"[ Step 2/2 ]: Fetching recent works...")
    try:
        aweme_list = dy.getUserInfo(sec_uid, count=10)
    except Exception as e:
        print(f"[ Error ]: Failed to fetch recent works: {e}")
        aweme_list = None

    result_data = {
        "user_info": user_data,
        "recent_videos": aweme_list or [],
        "crawl_time": datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }

    if aweme_list:
        _print_recent_videos(aweme_list)
    else:
        print("[ Info ]: No public videos found. Account might be private or API limited.")

    # Step 3: save. Serialise before opening the file so a serialisation
    # error cannot truncate a previously saved result.
    output_file = os.path.join(current_dir, f"user_data_{sec_uid[:8]}.json")
    try:
        payload = json.dumps(result_data, ensure_ascii=False, indent=4)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(payload)
        print(f"[ Success ]: Data saved to: {os.path.basename(output_file)}")
    except Exception as e:
        print(f"[ Warning ]: Failed to save data: {e}")
# =================================================================
# 4. 主入口
# =================================================================
if __name__ == "__main__":
    # The crawl target is the profile URL written in this script's own text.
    target_url = extract_url_from_file(current_file_path)
    if not target_url:
        print("[ Error ]: No valid Douyin URL found in file comments.")
    else:
        # Install cookies from the config before any API request is made.
        config = load_config()
        setup_cookies(config)
        # Run the crawl itself.
        crawl_user_profile(target_url)