Files
aiData/Controller/DouYinController.py
HuangHai e51dc18d06 'commit'
2026-01-21 08:41:47 +08:00

270 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import logging
import uuid
import shutil
import subprocess
import asyncio
from datetime import datetime
from typing import List, Optional
from fastapi import APIRouter, HTTPException, BackgroundTasks
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
# Import custom modules
from Config.Config import OBS_CLOUD_PREFIX, OBS_BUCKET, OBS_TMP_PREFIX, OBS_SERVER
from Util.DouYinDownloader import DouYinDownloader
from Util.ObsUtil import ObsUploader
from Util.ASRClient import ASRClient
from Util.LlmUtil import get_llm_response
from Model.DouYinModel import DouYinModel
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Router on which the endpoints defined in this module are registered.
router = APIRouter()
class ParseRequest(BaseModel):
    """Request body accepted by the ``/api/parse`` endpoint."""

    # Raw text sent by the client; the endpoint extracts video URLs from it
    # and also forwards it unchanged to the processing task.
    text: str
class SummaryRequest(BaseModel):
    """Request body accepted by the ``/api/douyin/summary`` endpoint."""

    # Record ids handed to ``DouYinModel.get_transcripts``; empty by default.
    ids: List[str] = []
# Quote characters removed from both ends of an LLM-generated title: ASCII
# double/single quotes plus the typographic pairs. They are written as escapes
# so the intent survives editors and viewers that hide look-alike glyphs.
_TITLE_QUOTE_CHARS = "\"'\u201c\u201d\u2018\u2019"

# Title values that mean "no usable title yet".
_PLACEHOLDER_TITLES = ("Unknown Title", "Unknown")


def _clean_llm_title(raw_title: str) -> str:
    """Return *raw_title* without surrounding whitespace and quote characters."""
    return raw_title.strip().strip(_TITLE_QUOTE_CHARS).strip()


async def _generate_title_from_transcript(transcript: str) -> str:
    """Ask the LLM for a short title; return "" when nothing usable comes back."""
    # Only the first 1000 characters of the transcript are sent.
    prompt = f"请根据以下视频文案总结一个简短的标题20字以内不要包含任何解释性文字直接返回标题\n\n{transcript[:1000]}"
    # get_llm_response is consumed as an async iterator even with stream=False.
    chunks = [chunk async for chunk in get_llm_response(prompt, stream=False)]
    return _clean_llm_title("".join(chunks))


async def _extract_audio_as_mp3(video_path: str) -> str:
    """Convert *video_path* to a mono 16 kHz MP3 beside it and return its path.

    Raises:
        RuntimeError: if ffmpeg exits with a non-zero status.
    """
    mp3_path = os.path.splitext(video_path)[0] + ".mp3"
    cmd = [
        "ffmpeg", "-y", "-i", video_path,
        "-acodec", "libmp3lame", "-ar", "16000", "-ac", "1", "-q:a", "2",
        mp3_path,
    ]
    # subprocess.run blocks, so it is executed in a worker thread.
    result = await asyncio.to_thread(
        subprocess.run, cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE
    )
    if result.returncode != 0:
        # errors="replace": ffmpeg output is not guaranteed to be valid UTF-8,
        # and a decode error here would hide the real failure.
        raise RuntimeError(f"FFmpeg failed: {result.stderr.decode(errors='replace')}")
    return mp3_path


async def process_video_task(url: str, request_id: str, share_text: str = ""):
    """Download, archive, transcribe and title one video, recording the outcome.

    Steps: mark the record PROCESSING, download the video into a per-request
    temp directory, upload it to OBS, convert it to MP3 with ffmpeg, transcribe
    the audio, optionally ask the LLM for a title, then store the result with
    status COMPLETED. Any failure inside the pipeline is logged and stored as
    status FAILED together with the error message; it is not re-raised. The
    temp directory is removed in every case.

    Args:
        url: Video URL to download.
        request_id: Id of the database record tracking this task.
        share_text: Original share text, used to extract a title; may be empty.
    """
    logger.info("Processing task %s", request_id)
    model = DouYinModel()
    # 1. Update status
    await model.update_status(request_id, "PROCESSING")
    temp_dir = os.path.abspath(f"temp_{request_id}")
    try:
        os.makedirs(temp_dir, exist_ok=True)

        # 2. Parse & Download
        downloader = DouYinDownloader()
        if not url:
            raise RuntimeError("No valid URL found")
        logger.info("Downloading from %s", url)
        # The download is blocking, so it runs in a worker thread.
        local_video_path, title = await asyncio.to_thread(
            downloader.download_video, url, temp_dir
        )

        # Title priority:
        #   1. title extracted from the share text (if present and valid)
        #   2. title returned by the download step
        #   3. title generated by the LLM from the transcript (step 6)
        extracted_title = downloader.extract_title_from_text(share_text)
        logger.info("Extracted title from text: %s", extracted_title)
        if extracted_title and extracted_title != "Unknown Title":
            title = extracted_title
        elif not title:
            title = "Unknown Title"

        if not local_video_path or not os.path.exists(local_video_path):
            raise RuntimeError("Download failed")

        # 3. Upload Video to OBS (long term storage)
        logger.info("Uploading video to OBS...")
        uploader = ObsUploader()
        video_filename = os.path.basename(local_video_path)
        obs_video_key = f"{OBS_CLOUD_PREFIX}/DouYin/{video_filename}"
        success, _ = await asyncio.to_thread(
            uploader.upload_file, obs_video_key, local_video_path, OBS_BUCKET
        )
        if not success:
            raise RuntimeError("OBS Upload failed")
        # Public URL, assuming the standard "<bucket>.<server>/<key>" pattern.
        obs_url = f"https://{OBS_BUCKET}.{OBS_SERVER}/{obs_video_key}"

        # 4. Convert to MP3
        logger.info("Converting to MP3...")
        mp3_path = await _extract_audio_as_mp3(local_video_path)

        # 5. ASR (blocking client, run in a worker thread)
        logger.info("Transcribing...")
        asr = ASRClient()
        transcript = await asyncio.to_thread(asr.upload_and_transcribe_sync, mp3_path)
        if not transcript:
            raise RuntimeError("Transcription failed (returned empty)")

        # 6. LLM title generation, only when no real title was found.
        #    Best effort: a failure here must not fail the whole task.
        if not title or title in _PLACEHOLDER_TITLES:
            try:
                logger.info("Generating title from transcript via LLM...")
                llm_title = await _generate_title_from_transcript(transcript)
                # Only overwrite when something is left after cleaning, so a
                # reply made of quotes/whitespace cannot blank the title.
                if llm_title:
                    logger.info("LLM generated title: %s", llm_title)
                    title = llm_title
            except Exception as llm_e:
                logger.warning("LLM Title generation failed: %s", llm_e)

        # 7. Save to DB (Update)
        logger.info("Saving to DB...")
        await model.update_record(request_id, title, obs_url, transcript, "COMPLETED")
        logger.info("Task %s completed successfully.", request_id)
    except Exception as e:
        logger.error("Task %s failed: %s", request_id, e, exc_info=True)
        try:
            await model.update_status(request_id, "FAILED", str(e))
        except Exception:
            # Nothing awaits this task's result, so log instead of letting a
            # second failure escape unreported.
            logger.exception("Could not mark task %s as FAILED", request_id)
    finally:
        # 8. Cleanup
        if os.path.exists(temp_dir):
            try:
                # shutil.rmtree is synchronous; run it off the event loop.
                await asyncio.to_thread(shutil.rmtree, temp_dir, ignore_errors=True)
            except Exception as cleanup_err:
                logger.error("Cleanup failed: %s", cleanup_err)
@router.post("/api/parse")
async def parse(request: ParseRequest, background_tasks: BackgroundTasks):
    """Create one record per URL found in the text and schedule its processing.

    Args:
        request: Body whose ``text`` holds share text containing video URLs.
        background_tasks: FastAPI background-task collector.

    Returns:
        A dict with the first created id (``id``), all created ids (``ids``)
        and the initial status ``"PENDING"``.

    Raises:
        HTTPException: 400 when no URL can be found in the text, 500 when a
            record cannot be created.
    """
    downloader = DouYinDownloader()
    urls = downloader.extract_urls(request.text)
    if not urls:
        # Fallback: treat the text itself as a direct link. Surrounding
        # whitespace is stripped so a pasted " https://..." is still accepted.
        candidate = request.text.strip()
        if not candidate.startswith("http"):
            raise HTTPException(status_code=400, detail="No valid URLs found")
        urls = [candidate]

    created_ids = []
    try:
        model = DouYinModel()
        for url in urls:
            req_id = str(uuid.uuid4())
            await model.insert_record(req_id, url)
            created_ids.append(req_id)
            # The full share text is passed along so the task can extract a
            # title from it.
            background_tasks.add_task(process_video_task, url, req_id, request.text)
    except Exception as e:
        # Log with traceback, like the other endpoints, before turning the
        # error into an HTTP response.
        logger.error("Failed to create Douyin record: %s", e, exc_info=True)
        raise HTTPException(status_code=500, detail=f"DB Init Error: {e}") from e

    return {
        "id": created_ids[0] if created_ids else None,
        "ids": created_ids,
        "status": "PENDING",
    }
@router.get("/api/records")
async def get_records():
    """Return the records from ``DouYinModel.get_records``.

    Best effort: if loading fails, the error is logged with its traceback and
    an empty list is returned instead of an error response.
    """
    try:
        return await DouYinModel().get_records()
    except Exception as err:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception(f"Get records error: {err}")
    return []
@router.delete("/api/records/{id}")
async def delete_record(id: str):
    """Delete the record with the given id.

    Args:
        id: Record id taken from the URL path. The name shadows the builtin
            ``id`` but must match the ``{id}`` path parameter.

    Returns:
        ``{"status": "deleted"}`` on success.

    Raises:
        HTTPException: 500 with the error message when deletion fails.
    """
    try:
        await DouYinModel().delete_record(id)
    except Exception as e:
        # Log server-side too, so the failure is not visible only to the client.
        logger.error("Delete record %s failed: %s", id, e, exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"status": "deleted"}
async def _single_message_stream(message: str):
    """Yield *message* once, so a fixed notice can be sent as a streamed body."""
    yield message


@router.post("/api/douyin/summary")
async def generate_summary(request: SummaryRequest):
    """Stream an LLM-generated knowledge summary of the selected transcripts.

    Args:
        request: Body whose ``ids`` are passed to ``DouYinModel.get_transcripts``.

    Returns:
        A ``StreamingResponse`` carrying the LLM output, or a single fixed
        notice when there are no records or no transcript text.

    Raises:
        HTTPException: 500 when preparing the summary fails. Errors raised
            while the response body is being streamed happen after this
            function has returned and are not covered by this handler.
    """
    try:
        # Fetch transcripts
        model = DouYinModel()
        records = await model.get_transcripts(ids=request.ids)
        if not records:
            return StreamingResponse(
                _single_message_stream("未找到可总结的已完成记录,请先解析视频。"),
                media_type="text/event-stream",
            )

        # One section per record that has transcript text.
        # NOTE(review): the opening bracket before the title label is never
        # closed in this template; left as-is, confirm whether that is intended.
        full_text = "".join(
            f"【标题:{r['video_name']}\n内容:{r['transcript']}\n\n"
            for r in records
            if r['transcript']
        )
        if not full_text:
            return StreamingResponse(
                _single_message_stream("记录中没有有效的文案内容。"),
                media_type="text/event-stream",
            )

        # The transcript text is capped at 15000 characters to limit the
        # context size sent to the model.
        prompt = (
            "\n"
            "请对以下充电行业相关的视频内容进行知识精华提取。\n"
            "要求:\n"
            "1. 忽略无关闲聊和口语化表达;\n"
            "2. 按条目列出核心知识点,不要长篇大论;\n"
            "3. 保持简洁专业,只保留干货;\n"
            "4. 返回格式为Markdown列表。\n"
            "内容如下:\n"
            f"{full_text[:15000]}\n"
        )
        return StreamingResponse(get_llm_response(prompt), media_type="text/event-stream")
    except Exception as e:
        logger.error(f"Summary generation failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
# Strong references to the recovery tasks started below. The event loop only
# keeps weak references to tasks, so a task nobody else references could be
# garbage-collected before it finishes. Each task removes itself when done.
_recovery_tasks = set()


async def recover_pending_tasks():
    """Restart the tasks reported by ``DouYinModel.get_interrupted_tasks``.

    Per the original note, these are tasks left in PENDING or PROCESSING state
    by a server restart. Each one is rescheduled on the running event loop;
    this function does not wait for them to finish. Failures while scanning
    or scheduling are logged and not raised.
    """
    logger.info("Scanning for interrupted Douyin tasks...")
    try:
        model = DouYinModel()
        tasks = await model.get_interrupted_tasks()
        if not tasks:
            logger.info("No interrupted tasks found.")
            return
        logger.info(f"Found {len(tasks)} interrupted tasks. Restarting...")
        for task in tasks:
            req_id = task['id']
            url = task['original_text']
            # The original share text is not available here, so an empty
            # string is passed; the title then comes from the download step
            # or the LLM fallback.
            job = asyncio.create_task(process_video_task(url, req_id, share_text=""))
            _recovery_tasks.add(job)
            job.add_done_callback(_recovery_tasks.discard)
    except Exception as e:
        logger.error(f"Failed to recover tasks: {e}", exc_info=True)