Files
aiData/Controller/DouYinController.py

270 lines
11 KiB
Python
Raw Normal View History

2026-01-20 21:43:54 +08:00
import os
import logging
import uuid
import shutil
import subprocess
import asyncio
from datetime import datetime
from typing import List, Optional
2026-01-21 08:41:47 +08:00
from fastapi import APIRouter, HTTPException, BackgroundTasks
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
# Import custom modules
from Config.Config import OBS_CLOUD_PREFIX, OBS_BUCKET, OBS_TMP_PREFIX, OBS_SERVER
from Util.DouYinDownloader import DouYinDownloader
from Util.ObsUtil import ObsUploader
from Util.ASRClient import ASRClient
from Util.LlmUtil import get_llm_response
from Model.DouYinModel import DouYinModel
# Module-level logger, named after this module so log records can be filtered by origin
logger = logging.getLogger(__name__)
# Router on which the endpoints declared in this module are registered
router = APIRouter()
class ParseRequest(BaseModel):
    """Request body for ``POST /api/parse``."""

    # Free-form text submitted by the client; the parse endpoint extracts
    # video URLs from it and forwards the whole text for title extraction.
    text: str
class SummaryRequest(BaseModel):
    """Request body for ``POST /api/douyin/summary``."""

    # Record ids passed to ``DouYinModel.get_transcripts(ids=...)``.
    # NOTE(review): how an empty list is interpreted (all records vs. none)
    # is decided by the model layer — confirm there.
    ids: List[str] = []
async def process_video_task(url: str, request_id: str, share_text: str = ""):
    """Download, archive, transcribe and store one video for ``request_id``.

    Pipeline, in order:
      1. mark the record ``PROCESSING``;
      2. download the video into a per-request temp directory;
      3. upload the video to OBS under ``{OBS_CLOUD_PREFIX}/DouYin/``;
      4. extract a mono 16 kHz MP3 with ffmpeg;
      5. transcribe the MP3;
      6. fall back to an LLM-generated title when no usable title exists;
      7. store title, OBS URL and transcript with status ``COMPLETED``.

    Any failure after step 1 is logged and written to the record as
    ``FAILED`` together with the error text; the temp directory is removed
    in every case.

    Args:
        url: Video URL to download.
        request_id: Identifier of the record this task updates.
        share_text: Original share text; only used to extract a title.

    Returns:
        None. The outcome is reported through the record's status.
    """
    logger.info(f"Processing task {request_id}")
    model = DouYinModel()
    # 1. Update status
    await model.update_status(request_id, "PROCESSING")
    temp_dir = os.path.abspath(f"temp_{request_id}")
    try:
        # Validate before touching the filesystem.
        if not url:
            raise RuntimeError("No valid URL found")
        os.makedirs(temp_dir, exist_ok=True)

        # 2. Parse & Download (blocking call, so it runs in a worker thread)
        downloader = DouYinDownloader()
        logger.info(f"Downloading from {url}")
        local_video_path, title = await asyncio.to_thread(
            downloader.download_video, url, temp_dir
        )
        if not local_video_path or not os.path.exists(local_video_path):
            raise RuntimeError("Download failed")

        # Title priority:
        #   1. title extracted from the share text (when present and valid)
        #   2. title reported by the download step
        #   3. title generated by the LLM (step 6, only if still unknown)
        extracted_title = downloader.extract_title_from_text(share_text)
        logger.info(f"Extracted title from text: {extracted_title}")
        if extracted_title and extracted_title != "Unknown Title":
            title = extracted_title
        elif not title:
            title = "Unknown Title"

        # 3. Upload Video to OBS (long-term storage)
        logger.info("Uploading video to OBS...")
        uploader = ObsUploader()
        video_filename = os.path.basename(local_video_path)
        obs_video_key = f"{OBS_CLOUD_PREFIX}/DouYin/{video_filename}"
        success, _ = await asyncio.to_thread(
            uploader.upload_file, obs_video_key, local_video_path, OBS_BUCKET
        )
        if not success:
            raise RuntimeError("OBS Upload failed")
        # Public URL built as bucket.server/key.
        obs_url = f"https://{OBS_BUCKET}.{OBS_SERVER}/{obs_video_key}"

        # 4. Convert to MP3 (16 kHz, mono)
        logger.info("Converting to MP3...")
        mp3_path = os.path.splitext(local_video_path)[0] + ".mp3"
        cmd = [
            "ffmpeg", "-y", "-i", local_video_path,
            "-acodec", "libmp3lame", "-ar", "16000", "-ac", "1", "-q:a", "2",
            mp3_path,
        ]
        result = await asyncio.to_thread(
            subprocess.run, cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE
        )
        if result.returncode != 0:
            # errors="replace": ffmpeg output is not guaranteed to be valid
            # UTF-8, and a decode error here would mask the real failure.
            stderr_text = result.stderr.decode(errors="replace")
            raise RuntimeError(f"FFmpeg failed: {stderr_text}")

        # 5. ASR (blocking client, so it runs in a worker thread)
        logger.info("Transcribing...")
        asr = ASRClient()
        transcript = await asyncio.to_thread(asr.upload_and_transcribe_sync, mp3_path)
        if not transcript:
            raise RuntimeError("Transcription failed (returned empty)")

        # 6. LLM title generation, only when no usable title was found.
        if not title or title in ("Unknown Title", "Unknown"):
            try:
                logger.info("Generating title from transcript via LLM...")
                prompt = f"请根据以下视频文案总结一个简短的标题20字以内不要包含任何解释性文字直接返回标题\n\n{transcript[:1000]}"
                chunks = [
                    chunk async for chunk in get_llm_response(prompt, stream=False)
                ]
                # Strip surrounding whitespace and quotes: ASCII quotes plus
                # the typographic double/single quotes, written as escapes so
                # they survive any re-encoding of this file.
                quote_chars = "\"'\u201c\u201d\u2018\u2019"
                llm_title = "".join(chunks).strip().strip(quote_chars).strip()
                # Only overwrite when something is left after cleanup, so a
                # blank answer never replaces the placeholder title.
                if llm_title:
                    logger.info(f"LLM generated title: {llm_title}")
                    title = llm_title
            except Exception as llm_e:
                # Best effort: a failed title must not fail the whole task.
                logger.warning(f"LLM Title generation failed: {llm_e}")

        # 7. Save to DB (Update)
        logger.info("Saving to DB...")
        await model.update_record(request_id, title, obs_url, transcript, "COMPLETED")
        logger.info(f"Task {request_id} completed successfully.")
    except Exception as e:
        logger.error(f"Task {request_id} failed: {e}", exc_info=True)
        await model.update_status(request_id, "FAILED", str(e))
    finally:
        # 8. Cleanup (rmtree is synchronous, so it runs in a worker thread)
        if os.path.exists(temp_dir):
            try:
                await asyncio.to_thread(shutil.rmtree, temp_dir, ignore_errors=True)
            except Exception as e:
                logger.error(f"Cleanup failed: {e}")
@router.post("/api/parse")
async def parse(request: ParseRequest, background_tasks: BackgroundTasks):
    """Create one PENDING record per URL found in the text and queue its processing.

    URLs are taken from ``DouYinDownloader.extract_urls``. When none are
    found, the whole (whitespace-trimmed) text is used as a single URL if it
    starts with ``http``.

    Args:
        request: Body carrying the submitted text.
        background_tasks: FastAPI queue on which ``process_video_task`` is
            scheduled for each created record.

    Returns:
        A dict with ``id`` (first created id or ``None``), ``ids`` (all
        created ids) and ``status`` set to ``"PENDING"``.

    Raises:
        HTTPException: 400 when no URL can be derived from the text; 500 when
            creating a record or queueing a task fails.
    """
    downloader = DouYinDownloader()
    urls = downloader.extract_urls(request.text)
    if not urls:
        # Fallback: the text itself may be a bare link the extractor missed.
        # Trim first so a pasted link with stray spaces/newlines still counts.
        candidate = request.text.strip()
        if not candidate.startswith("http"):
            raise HTTPException(status_code=400, detail="No valid URLs found")
        urls = [candidate]

    created_ids = []
    try:
        model = DouYinModel()
        for url in urls:
            req_id = str(uuid.uuid4())
            await model.insert_record(req_id, url)
            created_ids.append(req_id)
            # The full share text goes along so the task can extract a title from it.
            background_tasks.add_task(process_video_task, url, req_id, request.text)
    except Exception as e:
        # Log with traceback; the client only receives the short message.
        logger.error(f"DB Init Error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"DB Init Error: {e}") from e

    return {
        "id": created_ids[0] if created_ids else None,
        "ids": created_ids,
        "status": "PENDING",
    }
@router.get("/api/records")
async def get_records():
    """Return whatever ``DouYinModel.get_records`` yields.

    Best effort: any failure is logged with its traceback and an empty list
    is returned instead of an error response.
    """
    try:
        return await DouYinModel().get_records()
    except Exception as exc:
        logger.error(f"Get records error: {exc}", exc_info=True)
    return []
@router.delete("/api/records/{id}")
async def delete_record(id: str):
    """Delete the record identified by the ``id`` path parameter.

    Args:
        id: Record identifier taken from the URL path. The name shadows the
            builtin but is fixed by the route template.

    Returns:
        ``{"status": "deleted"}`` once the model call completes.

    Raises:
        HTTPException: 500 carrying the underlying error text when the
            deletion fails.
    """
    try:
        model = DouYinModel()
        await model.delete_record(id)
    except Exception as e:
        # Record the failure server-side, as the other endpoints here do,
        # and keep the original exception as the cause.
        logger.error(f"Delete record {id} failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"status": "deleted"}
@router.post("/api/douyin/summary")
async def generate_summary(request: SummaryRequest):
    """Stream an LLM summary of the transcripts selected by ``request.ids``.

    Records come from ``DouYinModel.get_transcripts``; those with a non-empty
    ``transcript`` are concatenated (title + content), truncated to 15000
    characters and embedded in the prompt passed to ``get_llm_response``.
    When there is nothing to summarise, a single explanatory message is
    streamed instead.

    Args:
        request: Body carrying the record ids to summarise.

    Returns:
        A ``StreamingResponse`` with media type ``text/event-stream``.

    Raises:
        HTTPException: 500 when fetching records or building the response
            fails. Errors raised later, while the stream is being consumed,
            are not caught here.
    """

    async def single_message(message: str):
        # One-shot stream used for the "nothing to summarise" answers.
        yield message

    try:
        # Fetch transcripts
        model = DouYinModel()
        records = await model.get_transcripts(ids=request.ids)
        if not records:
            return StreamingResponse(
                single_message("未找到可总结的已完成记录,请先解析视频。"),
                media_type="text/event-stream",
            )

        # Build the text in one pass; records without a transcript are skipped.
        full_text = "".join(
            f"【标题:{r['video_name']}\n内容:{r['transcript']}\n\n"
            for r in records
            if r['transcript']
        )
        if not full_text:
            return StreamingResponse(
                single_message("记录中没有有效的文案内容。"),
                media_type="text/event-stream",
            )

        # The prompt lines are flush-left on purpose: everything between the
        # triple quotes, including leading whitespace, is sent to the model.
        # The 15000-character cap keeps the context within model limits.
        prompt = f"""
请对以下充电行业相关的视频内容进行知识精华提取
要求
1. 忽略无关闲聊和口语化表达
2. 按条目列出核心知识点不要长篇大论
3. 保持简洁专业只保留干货
4. 返回格式为Markdown列表
内容如下
{full_text[:15000]}
"""
        return StreamingResponse(get_llm_response(prompt), media_type="text/event-stream")
    except Exception as e:
        logger.error(f"Summary generation failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
# Strong references to the restarted tasks. The event loop keeps only weak
# references to tasks, so one that nothing else references can be
# garbage-collected before it finishes.
_recovery_tasks = set()


async def recover_pending_tasks():
    """Restart the tasks reported by ``DouYinModel.get_interrupted_tasks``.

    Intended for records left in PENDING or PROCESSING state by a server
    restart. Each one is re-run through ``process_video_task`` as a
    background asyncio task. Failures while scanning or scheduling are
    logged and swallowed.

    Returns:
        None.
    """
    logger.info("Scanning for interrupted Douyin tasks...")
    try:
        model = DouYinModel()
        tasks = await model.get_interrupted_tasks()
        if not tasks:
            logger.info("No interrupted tasks found.")
            return
        logger.info(f"Found {len(tasks)} interrupted tasks. Restarting...")
        for task in tasks:
            req_id = task['id']
            # NOTE(review): 'original_text' is used as the URL here — confirm
            # that the model stores the URL in that column.
            url = task['original_text']
            # The original share text is not available any more, so title
            # extraction gets an empty string; the download step or the LLM
            # fallback supplies the title instead.
            restarted = asyncio.create_task(
                process_video_task(url, req_id, share_text="")
            )
            # Keep the task alive until done, then drop the reference.
            _recovery_tasks.add(restarted)
            restarted.add_done_callback(_recovery_tasks.discard)
    except Exception as e:
        logger.error(f"Failed to recover tasks: {e}", exc_info=True)