# aiData/test_title_extraction.py


import sys
import os
import re

# Mock class since we can't import easily without full env
class DouYinDownloaderMock:
    """Stand-in for the downloader exposing only the title-extraction logic."""

    def extract_title_from_text(self, text: str) -> str:
        """Extract a title from Douyin share text.

        The share text is cleaned in four steps: URLs are removed, the
        "复制打开抖音…，看看" share preamble (and everything before it) is
        removed, a leading 【…】 tag is removed, and surrounding whitespace
        is stripped.

        Args:
            text: Raw share text, possibly containing a preamble, an
                author tag in 【…】 and a link.

        Returns:
            The cleaned title, or "Unknown Title" when nothing remains.
        """
        # 1. Remove URLs (everything from the scheme up to the next whitespace).
        without_urls = re.sub(r'http[s]?://\S+', '', text)

        # 2. Remove the "Copy and open Douyin..." preamble.
        # Example: "3.00 12/28 d@n.dN VYZ:/ 复制打开抖音极速版，看看【聚合能研的作品】..."
        # Pattern: any chars + "复制打开抖音" + any chars + "，看看".
        # DOTALL lets "any chars" include line breaks, so a preamble that is
        # wrapped onto an earlier line is removed together with the marker
        # instead of being left in front of the title.
        without_prefix = re.sub(
            r'.*?复制打开抖音.*?，看看', '', without_urls, flags=re.DOTALL
        )

        # 3. Remove a 【...】 tag only when it leads the text (usually the
        # author name); tags appearing later are kept.
        without_tag = re.sub(r'^\s*【.*?】', '', without_prefix)

        # 4. Trim surrounding whitespace; fall back when nothing is left.
        title = without_tag.strip()
        return title if title else "Unknown Title"

def test():
    """Run the title-extraction cases, printing and checking each result.

    Each case pairs a share text with the title expected from
    ``extract_title_from_text``. The input and extracted title are printed
    before the check, so the output is visible even when a case fails.

    Raises:
        AssertionError: If an extracted title differs from the expected one.
    """
    downloader = DouYinDownloaderMock()

    cases = [
        # Case 1: full share text (preamble + author tag + title + link).
        (
            "3.00 12/28 d@n.dN VYZ:/ 复制打开抖音极速版，看看【聚合能研的作品】2026年，电力市场的 “大洗牌” 正式开始 告别... https://v.douyin.com/gHWfWVgDVRo/",
            "2026年，电力市场的 “大洗牌” 正式开始 告别...",
        ),
        # Case 2: pure title text, nothing to remove.
        (
            "2026年，电力市场的 “大洗牌” 正式开始 告别...",
            "2026年，电力市场的 “大洗牌” 正式开始 告别...",
        ),
        # Case 3: plain text followed by a link; only the link is removed.
        (
            "Check this out https://v.douyin.com/abc/",
            "Check this out",
        ),
    ]

    for index, (text, expected) in enumerate(cases, start=1):
        title = downloader.extract_title_from_text(text)
        print(f"Input {index}: {text}")
        print(f"Title {index}: {title}")
        print("-" * 20)
        assert title == expected, (
            f"case {index}: expected {expected!r}, got {title!r}"
        )

# Run the cases only when this file is executed as a script, not on import.
if __name__ == "__main__":
    test()