Files
aiData/test_title_extraction.py
HuangHai 55e88777d9 'commit'
2026-01-20 21:43:54 +08:00

54 lines
1.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import sys
import os
import re
# Mock class since we can't import easily without full env
class DouYinDownloaderMock:
    """Minimal stand-in exposing only the share-text title extraction logic."""

    # http:// or https:// link, running up to the next whitespace character.
    _URL_RE = re.compile(r'http[s]?://\S+')
    # Share boilerplate: everything up to and including the first "看看" that
    # follows "复制打开抖音". The punctuation between the app name and "看看"
    # varies (ASCII ",", full-width ",", or nothing at all), so it is not
    # required; the lazy ".*?" absorbs whatever is there.
    _SHARE_PREFIX_RE = re.compile(r'.*?复制打开抖音.*?看看')
    # A leading 【...】 group (usually the author name).
    _LEADING_BRACKET_RE = re.compile(r'^\s*【.*?】')

    def extract_title_from_text(self, text: str) -> str:
        """Extract a title from share text by removing URLs and common prefixes.

        Args:
            text: Raw share text, e.g.
                "3.00 12/28 d@n.dN VYZ:/ 复制打开抖音极速版,看看【作者】标题 https://...".

        Returns:
            The remaining text with surrounding whitespace stripped, or
            "Unknown Title" when nothing is left after cleaning.
        """
        # 1. Remove URLs
        clean_text = self._URL_RE.sub('', text)
        # 2. Remove the "复制打开抖音...看看" prefix, with or without a comma
        clean_text = self._SHARE_PREFIX_RE.sub('', clean_text)
        # 3. Remove 【...】 if it's at the start (usually author name)
        clean_text = self._LEADING_BRACKET_RE.sub('', clean_text)
        # 4. Clean up whitespace
        clean_text = clean_text.strip()
        return clean_text if clean_text else "Unknown Title"
def test():
    """Run the title extractor over sample share texts and print the results.

    For each sample, prints the numbered input, the numbered extracted
    title, and a 20-dash divider line.
    """
    extractor = DouYinDownloaderMock()
    samples = (
        # Case 1: User example
        "3.00 12/28 d@n.dN VYZ:/ 复制打开抖音极速版看看【聚合能研的作品】2026年电力市场的 “大洗牌” 正式开始 告别... https://v.douyin.com/gHWfWVgDVRo/",
        # Case 2: Pure text
        "2026年电力市场的 “大洗牌” 正式开始 告别...",
        # Case 3: Text with URL only
        "Check this out https://v.douyin.com/abc/",
    )
    divider = "-" * 20
    for index, share_text in enumerate(samples, start=1):
        extracted = extractor.extract_title_from_text(share_text)
        print(f"Input {index}: {share_text}")
        print(f"Title {index}: {extracted}")
        print(divider)
# Run the manual checks only when executed as a script, not on import.
if __name__ == "__main__":
    test()