Files
aiData/DouYin/Test/T2_BigV.py
HuangHai f665e38bc0 'commit'
2026-02-27 14:57:13 +08:00

204 lines
8.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Douyin User Profile & Video Crawler (T2)
访问注释中的页面,获取页面中的内容信息(用户信息及最近作品)
URL: https://www.douyin.com/user/MS4wLjABAAAA2P7MeZl0VUsDmCzKbZeLlVGVTDRAuXmvr_zcC6XNqd-6R4n9ssCguSgA-gnBfjUO
"""
import sys
import os
import re
import json
import yaml
import time
import io
from datetime import datetime
# Force UTF-8 console output on Windows so non-ASCII text (Chinese, emoji)
# is not garbled or rejected by the default console code page.
if sys.platform.startswith('win'):
    for _stream_name in ('stdout', 'stderr'):
        _stream = getattr(sys, _stream_name, None)
        if hasattr(_stream, 'reconfigure'):
            # Re-encode in place: keeps the existing stream object and its
            # buffering behaviour.
            _stream.reconfigure(encoding='utf-8')
        elif hasattr(_stream, 'buffer'):
            # Fallback for stream objects without reconfigure(): wrap the
            # underlying binary buffer, keeping output line-buffered.
            setattr(
                sys,
                _stream_name,
                io.TextIOWrapper(_stream.buffer, encoding='utf-8', line_buffering=True),
            )
        # Streams that are None or expose no binary buffer are left untouched
        # instead of raising AttributeError at import time.
# =================================================================
# 1. 环境配置与路径初始化
# =================================================================
# Locate this script on disk; the DouYin root is the parent of the directory
# that contains it.
current_file_path = os.path.abspath(__file__)
current_dir = os.path.split(current_file_path)[0]
douyin_root = os.path.split(current_dir)[0]

# Make the DouYin root importable (only added once).
if douyin_root not in sys.path:
    sys.path.append(douyin_root)

# The crawler core lives under the DouYin root; without it nothing below can
# run, so report the failure and exit with a non-zero status.
try:
    from apiproxy.douyin.douyin import Douyin
    from apiproxy.douyin import douyin_headers
except ImportError as import_error:
    print(f"[ Error ]: Failed to import core modules. Error: {import_error}")
    sys.exit(1)
# =================================================================
# 2. 工具函数定义
# =================================================================
def extract_url_from_file(file_path):
    """Return the first Douyin user-profile URL found in a text file.

    Args:
        file_path: Path of the file to scan; it is read as UTF-8 text.

    Returns:
        The first substring matching ``http(s)://[www.]douyin.com/user/<id>``,
        or ``None`` when the file contains no such URL or cannot be read.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file; ValueError: covers invalid UTF-8
        # (UnicodeDecodeError) and malformed path strings.
        print(f"[ Error ]: Failed to read script file: {e}")
        return None

    # Only the first occurrence is needed, so stop at the first match instead
    # of collecting every URL in the file.
    match = re.search(r'https?://(?:www\.)?douyin\.com/user/[a-zA-Z0-9\-_]+', content)
    return match.group(0) if match else None
def load_config():
    """Load ``config_douyin.yml`` from the Test directory or the DouYin root.

    The copy next to this script (``current_dir``) takes precedence; the one
    in ``douyin_root`` is used only when the former does not exist.

    Returns:
        The parsed configuration as a dict. An empty dict is returned when no
        config file exists, when it cannot be read or parsed, or when its
        top-level content is not a mapping (e.g. an empty file), so callers
        can always use ``.get`` on the result.
    """
    candidates = (
        os.path.join(current_dir, "config_douyin.yml"),   # Test directory first
        os.path.join(douyin_root, "config_douyin.yml"),   # then the DouYin root
    )
    config_path = next((path for path in candidates if os.path.exists(path)), None)
    if config_path is None:
        print("[ Warning ]: config_douyin.yml not found.")
        return {}

    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            print(f"[ Info ]: Loading config from: {os.path.abspath(config_path)}")
            loaded = yaml.safe_load(f)
    except (OSError, ValueError, yaml.YAMLError) as e:
        # Unreadable file, invalid UTF-8, or malformed YAML: continue without config.
        print(f"[ Warning ]: Failed to parse config: {e}")
        return {}

    if isinstance(loaded, dict):
        return loaded
    # safe_load yields None for an empty document and may yield a list/scalar
    # for other content; neither is usable as a config mapping.
    if loaded is not None:
        print(f"[ Warning ]: Config root is not a mapping (got {type(loaded).__name__}); ignoring it.")
    return {}
def setup_cookies(config):
    """Install the request Cookie into the shared ``douyin_headers`` mapping.

    Args:
        config: Parsed configuration. Two keys are read:
            ``cookie``  - a raw cookie header string (preferred), and
            ``cookies`` - a mapping of cookie name to value.
            ``None`` or a non-dict value is treated as an empty config.

    Side effects:
        Sets ``douyin_headers["Cookie"]`` when a cookie can be built, and
        prints which source was used. Prints a warning when the headers end
        up without any Cookie.
    """
    if not isinstance(config, dict):
        config = {}

    # ``or`` also covers keys that are present but null in the YAML file.
    cookie_str = config.get("cookie") or ""
    cookies_dict = config.get("cookies") or {}
    if not isinstance(cookies_dict, dict):
        cookies_dict = {}

    if cookie_str:
        # Prefer the raw cookie string; if it lacks msToken but the dict has
        # one, prepend it so the token is still sent.
        if cookies_dict.get("msToken") and "msToken=" not in cookie_str:
            cookie_str = f"msToken={cookies_dict['msToken']}; " + cookie_str
        douyin_headers["Cookie"] = cookie_str
        print(f"[ Info ]: Using raw cookie string (Length: {len(cookie_str)})")
    elif cookies_dict:
        # Skip null entries so they are not serialized as the literal "None".
        pairs = [f"{k}={v}" for k, v in cookies_dict.items() if v is not None]
        if pairs:
            douyin_headers["Cookie"] = "; ".join(pairs)
            print(f"[ Info ]: Using combined cookies from dict (Count: {len(pairs)})")

    if not douyin_headers.get("Cookie"):
        print("[ Warning ]: No valid Cookie detected. Most API calls will fail.")
        print(" Please run T1_GetCookie.py first.")
# =================================================================
# 3. 核心爬取逻辑
# =================================================================
def _format_create_time(ctime):
    """Format a numeric ``create_time`` as ``YYYY-MM-DD HH:MM``; otherwise return ``str(ctime)``."""
    if isinstance(ctime, (int, float)):
        try:
            return datetime.fromtimestamp(ctime).strftime('%Y-%m-%d %H:%M')
        except (OverflowError, OSError, ValueError):
            # Value is numeric but not a representable timestamp; show it raw.
            pass
    return str(ctime)


def _print_profile(user_data):
    """Print the blogger summary block (name, id, bio and counters) from the ``user`` dict."""
    nickname = user_data.get('nickname', 'Unknown')
    print("\n" + "=" * 60)
    print(f"Blogger: {nickname}")
    print("-" * 60)
    print(f"ID: {user_data.get('unique_id') or user_data.get('short_id', 'Unknown')}")
    print(f"Bio: {user_data.get('signature', 'N/A')}")
    print(f"Followers:{user_data.get('m_follower_count') or user_data.get('follower_count', 0)}")
    print(f"Likes: {user_data.get('total_favorited', 0)}")
    print(f"Following:{user_data.get('following_count', 0)}")
    print("=" * 60 + "\n")


def _print_videos(aweme_list):
    """Print one numbered entry (time, title, counters, link) per fetched work."""
    print(f"Successfully fetched {len(aweme_list)} videos:\n")
    for i, aweme in enumerate(aweme_list, 1):
        ctime_str = _format_create_time(aweme.get('create_time'))
        # ``or`` also covers a null/empty description, which would otherwise
        # break len() or print a blank title.
        desc = str(aweme.get('desc') or 'No Title')
        desc = (desc[:47] + "...") if len(desc) > 50 else desc
        stats = aweme.get('statistics') or {}
        print(f"{i:02d}. [{ctime_str}] {desc}")
        # The collect count now has its own label so it is not mistaken for
        # part of the comment count.
        print(f" ❤️ {stats.get('digg_count', 0):<8} 💬 {stats.get('comment_count', 0):<8} ⭐ {stats.get('collect_count', 0)}")
        print(f" 🔗 https://www.douyin.com/video/{aweme.get('aweme_id')}\n")


def crawl_user_profile(url):
    """Fetch a Douyin user's profile and recent works, print them, and save them as JSON.

    Args:
        url: Douyin user-profile URL; ``Douyin.getKey`` must resolve it to a
            ``("user", sec_uid)`` pair.

    Returns:
        None. Progress and results are printed. On success a file named
        ``user_data_<first 8 chars of sec_uid>.json`` is written into
        ``current_dir`` with the keys ``user_info``, ``recent_videos`` and
        ``crawl_time``. The function returns early (without writing a file)
        when the URL cannot be resolved or the profile request fails.
    """
    dy = Douyin()
    # Raise the request timeout; presumably read by the Douyin client for its
    # API calls - confirm against the Douyin class.
    dy.timeout = 30
    print(f"[ Start ]: Target URL: {url}")

    # Step 1: resolve the URL to a sec_uid and fetch the profile.
    print("[ Step 1/2 ]: Fetching blogger profile...")
    try:
        key_type, sec_uid = dy.getKey(url)
        if key_type != "user" or not sec_uid:
            print(f"[ Failed ]: Invalid URL or failed to parse sec_uid. Type: {key_type}")
            return
        print(f" (sec_uid: {sec_uid[:15]}...)")
        # Per the original author's note, the Douyin core class extracts and
        # appends msToken from the Cookie itself, so none is passed here.
        user_detail = dy.getUserDetailInfo(sec_uid)
        if not user_detail or user_detail.get("status_code") != 0:
            print(f"[ Failed ]: Failed to fetch profile. Status: {user_detail.get('status_code') if user_detail else 'None'}")
            if user_detail:
                print(f" Msg: {user_detail.get('status_msg', 'Unknown error')}")
            print(" Hint: Please check if Cookie is expired or network is restricted.")
            return
    except Exception as e:
        print(f"[ Error ]: An unexpected error occurred: {e}")
        return

    # A present-but-null "user" entry is treated like a missing one.
    user_data = user_detail.get("user") or {}
    _print_profile(user_data)

    # Step 2: fetch the most recent works. A failure here is reported but does
    # not discard the profile that was already retrieved.
    print("[ Step 2/2 ]: Fetching recent works...")
    try:
        aweme_list = dy.getUserInfo(sec_uid, count=10) or []
    except Exception as e:
        print(f"[ Warning ]: Failed to fetch recent works: {e}")
        aweme_list = []

    result_data = {
        "user_info": user_data,
        "recent_videos": aweme_list,
        "crawl_time": datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }

    if aweme_list:
        _print_videos(aweme_list)
    else:
        print("[ Info ]: No public videos found. Account might be private or API limited.")

    # Persist everything next to this script.
    # NOTE(review): sec_uid values appear to share their leading characters
    # (e.g. "MS4wLjAB..."), so this 8-char prefix may not be unique per user
    # and successive runs may overwrite each other - confirm before relying
    # on one file per user.
    output_file = os.path.join(current_dir, f"user_data_{sec_uid[:8]}.json")
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(result_data, f, ensure_ascii=False, indent=4)
        print(f"[ Success ]: Data saved to: {os.path.basename(output_file)}")
    except Exception as e:
        print(f"[ Warning ]: Failed to save data: {e}")
# =================================================================
# 4. 主入口
# =================================================================
if __name__ == "__main__":
    # The target profile URL is taken from this script's own text (the module
    # docstring at the top of the file carries it).
    target_url = extract_url_from_file(current_file_path)
    if not target_url:
        print("[ Error ]: No valid Douyin URL found in file comments.")
    else:
        # Cookies are installed into the request headers before crawling.
        config = load_config()
        setup_cookies(config)
        crawl_user_profile(target_url)