"""
Douyin User Profile & Video Crawler (T2)
访问注释中的页面,获取页面中的内容信息(用户信息及最近作品)

URL: https://www.douyin.com/user/MS4wLjABAAAA2P7MeZl0VUsDmCzKbZeLlVGVTDRAuXmvr_zcC6XNqd-6R4n9ssCguSgA-gnBfjUO
"""
|
|||
|
|
|
|||
|
|
import sys
|
|||
|
|
import os
|
|||
|
|
import re
|
|||
|
|
import json
|
|||
|
|
import yaml
|
|||
|
|
import time
|
|||
|
|
import io
|
|||
|
|
from datetime import datetime
|
|||
|
|
|
|||
|
|
# Force console output to UTF-8 so non-ASCII text is not garbled on Windows.
if sys.platform.startswith('win'):
    # A stream can be None (e.g. when started without a console) or may have
    # been replaced by an object without a ``buffer`` attribute; leave such
    # streams untouched instead of crashing at import time.
    # line_buffering=True makes each printed line reach the console
    # immediately; a plain TextIOWrapper would hold progress messages back
    # until its internal buffer fills or the interpreter exits.
    if getattr(sys.stdout, 'buffer', None) is not None:
        sys.stdout = io.TextIOWrapper(
            sys.stdout.buffer, encoding='utf-8', line_buffering=True
        )
    if getattr(sys.stderr, 'buffer', None) is not None:
        sys.stderr = io.TextIOWrapper(
            sys.stderr.buffer, encoding='utf-8', line_buffering=True
        )
|
|||
|
|
|
|||
|
|
# =================================================================
|
|||
|
|
# 1. 环境配置与路径初始化
|
|||
|
|
# =================================================================
|
|||
|
|
# Absolute path of this script, the directory that contains it, and that
# directory's parent (treated as the DouYin project root).
current_file_path = os.path.abspath(__file__)
current_dir = os.path.dirname(current_file_path)
douyin_root = os.path.dirname(current_dir)

# Make the DouYin root importable; skip the append when it is already listed
# so repeated imports do not grow sys.path.
if douyin_root not in sys.path:
    sys.path.append(douyin_root)
|
|||
|
|
|
|||
|
|
# Import the core crawler modules from the ``apiproxy`` package. The script
# cannot do anything without them, so report the failure and exit with
# status 1 instead of continuing.
try:
    from apiproxy.douyin.douyin import Douyin
    from apiproxy.douyin import douyin_headers
except ImportError as e:
    print(f"[ Error ]: Failed to import core modules. Error: {e}")
    sys.exit(1)
|
|||
|
|
|
|||
|
|
# =================================================================
|
|||
|
|
# 2. 工具函数定义
|
|||
|
|
# =================================================================
|
|||
|
|
|
|||
|
|
def extract_url_from_file(file_path):
    """Return the first Douyin user-profile URL found in a text file.

    Args:
        file_path: Path of the file to scan; it is read as UTF-8.

    Returns:
        The first substring matching ``http(s)://[www.]douyin.com/user/<id>``
        (``<id>`` made of letters, digits, ``-`` and ``_``), or ``None`` when
        no such URL exists or the file cannot be read.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError: undecodable bytes
        # (UnicodeDecodeError) or an invalid path string.
        print(f"[ Error ]: Failed to read script file: {e}")
        return None

    # Only the first hit is needed, so stop at it instead of collecting all.
    match = re.search(r'https?://(?:www\.)?douyin\.com/user/[a-zA-Z0-9\-_]+', content)
    return match.group(0) if match else None
|
|||
|
|
|
|||
|
|
def load_config():
    """Load ``config_douyin.yml`` and return its contents as a dict.

    The file is looked up in ``current_dir`` first and in ``douyin_root``
    second; the first path that exists is used.

    Returns:
        dict: The parsed top-level mapping. An empty dict is returned when
        no config file exists, when it cannot be read or parsed, or when its
        top level is not a mapping (an empty YAML document parses to
        ``None``), so callers can always use ``.get`` on the result.
    """
    candidates = (
        os.path.join(current_dir, "config_douyin.yml"),   # Test directory first
        os.path.join(douyin_root, "config_douyin.yml"),   # then the project root
    )
    config_path = next((path for path in candidates if os.path.exists(path)), None)
    if config_path is None:
        print("[ Warning ]: config_douyin.yml not found.")
        return {}

    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            print(f"[ Info ]: Loading config from: {os.path.abspath(config_path)}")
            loaded = yaml.safe_load(f)
    except Exception as e:
        # Deliberately broad: a broken config must not stop the script, it
        # only downgrades to "no configuration".
        print(f"[ Warning ]: Failed to parse config: {e}")
        return {}

    if isinstance(loaded, dict):
        return loaded

    # safe_load yields None for an empty file and may yield a list or scalar.
    print("[ Warning ]: Config is empty or not a mapping; ignoring it.")
    return {}
|
|||
|
|
|
|||
|
|
def setup_cookies(config):
    """Set ``douyin_headers["Cookie"]`` from the loaded configuration.

    Args:
        config: Mapping loaded from the config file. Two keys are read:
            ``cookie`` (a raw cookie header string, preferred when
            non-empty) and ``cookies`` (a mapping of cookie name to value,
            joined as ``name=value; ...``). A non-dict ``config`` and
            missing or null keys are treated as empty.

    Side effects:
        Writes ``douyin_headers["Cookie"]`` when either source yields a
        value and prints which source was used. Prints a warning when the
        header is still empty afterwards.
    """
    if not isinstance(config, dict):
        config = {}

    # ``or`` instead of a ``.get`` default: a key that is present but null in
    # the YAML comes back as None, which a default would not replace.
    cookie_str = config.get("cookie") or ""
    cookies_dict = config.get("cookies") or {}
    if not isinstance(cookies_dict, dict):
        print("[ Warning ]: 'cookies' in config is not a mapping; ignoring it.")
        cookies_dict = {}

    if cookie_str:
        # Prefer the raw cookie string, but make sure msToken is part of it
        # when the dict provides one and the string does not.
        ms_token = cookies_dict.get("msToken")
        if ms_token and "msToken=" not in cookie_str:
            cookie_str = f"msToken={ms_token}; " + cookie_str
        douyin_headers["Cookie"] = cookie_str
        print(f"[ Info ]: Using raw cookie string (Length: {len(cookie_str)})")
    elif cookies_dict:
        cookie_str = "; ".join(f"{k}={v}" for k, v in cookies_dict.items())
        douyin_headers["Cookie"] = cookie_str
        print(f"[ Info ]: Using combined cookies from dict (Count: {len(cookies_dict)})")

    if not douyin_headers.get("Cookie"):
        print("[ Warning ]: No valid Cookie detected. Most API calls will fail.")
        print(" Please run T1_GetCookie.py first.")
|
|||
|
|
|
|||
|
|
# =================================================================
|
|||
|
|
# 3. 核心爬取逻辑
|
|||
|
|
# =================================================================
|
|||
|
|
|
|||
|
|
def _format_create_time(ctime):
    """Format a numeric timestamp as ``YYYY-MM-DD HH:MM``; otherwise return ``str(ctime)``."""
    if isinstance(ctime, (int, float)):
        try:
            return datetime.fromtimestamp(ctime).strftime('%Y-%m-%d %H:%M')
        except (OverflowError, OSError, ValueError):
            # Out-of-range timestamps: fall through and show the raw value.
            pass
    return str(ctime)


def _print_profile(user_data):
    """Print the blogger summary box (name, id, bio, counters) for ``user_data``."""
    print("\n" + "="*60)
    print(f"Blogger: {user_data.get('nickname', 'Unknown')}")
    print("-" * 60)
    # Fall back to 'Unknown' also when both id fields are present but empty.
    print(f"ID: {user_data.get('unique_id') or user_data.get('short_id') or 'Unknown'}")
    print(f"Bio: {user_data.get('signature', 'N/A')}")
    print(f"Followers:{user_data.get('m_follower_count') or user_data.get('follower_count', 0)}")
    print(f"Likes: {user_data.get('total_favorited', 0)}")
    print(f"Following:{user_data.get('following_count', 0)}")
    print("="*60 + "\n")


def _print_videos(aweme_list):
    """Print a numbered summary (time, title, counters, link) for each work."""
    print(f"Successfully fetched {len(aweme_list)} videos:\n")
    for i, aweme in enumerate(aweme_list, 1):
        ctime_str = _format_create_time(aweme.get('create_time'))

        # ``or`` also covers keys that are present with a None/empty value.
        desc = aweme.get('desc') or 'No Title'
        if len(desc) > 50:
            desc = desc[:47] + "..."

        stats = aweme.get('statistics') or {}
        # A None counter cannot be formatted with ``:<8``; show 0 instead.
        digg = stats.get('digg_count') or 0
        comments = stats.get('comment_count') or 0
        collects = stats.get('collect_count') or 0

        print(f"{i:02d}. [{ctime_str}] {desc}")
        print(f" ❤️ {digg:<8} 💬 {comments:<8} ⭐ {collects}")
        print(f" 🔗 https://www.douyin.com/video/{aweme.get('aweme_id')}\n")


def crawl_user_profile(url):
    """Fetch a Douyin user's profile and recent works, print them, and save them.

    Steps: resolve ``url`` to a ``sec_uid`` via ``Douyin.getKey``, fetch the
    profile via ``getUserDetailInfo``, fetch works via
    ``getUserInfo(sec_uid, count=10)``, print a summary, then write
    ``user_data_<first 8 chars of sec_uid>.json`` into ``current_dir``.

    Args:
        url: Douyin user page URL.

    Returns:
        None. Failures are reported on stdout. The function returns early
        when the URL cannot be resolved or the profile cannot be fetched; a
        failure while fetching works is reported and treated as "no works"
        so the profile is still saved.
    """
    dy = Douyin()
    # Sets a ``timeout`` attribute on the client; presumably read by the
    # Douyin class for its requests -- TODO confirm against that class.
    dy.timeout = 30

    print(f"[ Start ]: Target URL: {url}")

    # Step 1: resolve the URL to a sec_uid and fetch the profile.
    print("[ Step 1/2 ]: Fetching blogger profile...")
    try:
        key_type, sec_uid = dy.getKey(url)
        if key_type != "user" or not sec_uid:
            print(f"[ Failed ]: Invalid URL or failed to parse sec_uid. Type: {key_type}")
            return

        print(f" (sec_uid: {sec_uid[:15]}...)")

        user_detail = dy.getUserDetailInfo(sec_uid)

        if not user_detail or user_detail.get("status_code") != 0:
            print(f"[ Failed ]: Failed to fetch profile. Status: {user_detail.get('status_code') if user_detail else 'None'}")
            if user_detail:
                print(f" Msg: {user_detail.get('status_msg', 'Unknown error')}")
            print(" Hint: Please check if Cookie is expired or network is restricted.")
            return
    except Exception as e:
        # Broad on purpose: the exception types raised by the Douyin client
        # are not visible from this file.
        print(f"[ Error ]: An unexpected error occurred: {e}")
        return

    user_data = user_detail.get("user") or {}
    _print_profile(user_data)

    # Step 2: fetch the most recent works. A failure here must not discard
    # the profile that was already fetched.
    print("[ Step 2/2 ]: Fetching recent works...")
    try:
        aweme_list = dy.getUserInfo(sec_uid, count=10)
    except Exception as e:
        print(f"[ Warning ]: Failed to fetch recent works: {e}")
        aweme_list = None

    result_data = {
        "user_info": user_data,
        "recent_videos": aweme_list or [],
        "crawl_time": datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }

    if aweme_list:
        _print_videos(aweme_list)
    else:
        print("[ Info ]: No public videos found. Account might be private or API limited.")

    # Step 3: save everything next to this script.
    # NOTE(review): only the first 8 characters of sec_uid go into the file
    # name; if different users share that prefix their files overwrite each
    # other -- confirm whether a longer slice is wanted.
    output_file = os.path.join(current_dir, f"user_data_{sec_uid[:8]}.json")
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(result_data, f, ensure_ascii=False, indent=4)
        print(f"[ Success ]: Data saved to: {os.path.basename(output_file)}")
    except (OSError, TypeError, ValueError) as e:
        # OSError: cannot write; TypeError/ValueError: data not serializable.
        print(f"[ Warning ]: Failed to save data: {e}")
|
|||
|
|
|
|||
|
|
# =================================================================
|
|||
|
|
# 4. 主入口
|
|||
|
|
# =================================================================
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # The target URL is taken from this script's own text (its header
    # docstring), so retargeting the crawler only requires editing that URL.
    target_url = extract_url_from_file(current_file_path)

    if not target_url:
        print("[ Error ]: No valid Douyin URL found in file comments.")
    else:
        # Load configuration and install cookies before any request is made.
        config = load_config()
        setup_cookies(config)

        # Run the crawl.
        crawl_user_profile(target_url)
|