270 lines
11 KiB
Python
270 lines
11 KiB
Python
import os
|
||
import logging
|
||
import uuid
|
||
import shutil
|
||
import subprocess
|
||
import asyncio
|
||
from datetime import datetime
|
||
from typing import List, Optional
|
||
|
||
from fastapi import APIRouter, HTTPException, BackgroundTasks
|
||
from fastapi.responses import StreamingResponse
|
||
from pydantic import BaseModel
|
||
|
||
# Import custom modules
|
||
from Config.Config import OBS_CLOUD_PREFIX, OBS_BUCKET, OBS_TMP_PREFIX, OBS_SERVER
|
||
from Util.DouYinDownloader import DouYinDownloader
|
||
from Util.ObsUtil import ObsUploader
|
||
from Util.ASRClient import ASRClient
|
||
from Util.LlmUtil import get_llm_response
|
||
from Model.DouYinModel import DouYinModel
|
||
|
||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Router on which the HTTP endpoints below are registered.
router = APIRouter()
|
||
|
||
class ParseRequest(BaseModel):
    """Request body accepted by the ``/api/parse`` endpoint."""

    # Raw text submitted by the client; video URLs are extracted from it.
    text: str
|
||
|
||
class SummaryRequest(BaseModel):
    """Request body accepted by the ``/api/douyin/summary`` endpoint."""

    # Record ids forwarded to the transcript lookup; empty when omitted.
    ids: List[str] = []
|
||
|
||
async def _convert_video_to_mp3(video_path: str) -> str:
    """Transcode *video_path* with ffmpeg into an MP3 stored next to it.

    Args:
        video_path: Path of the downloaded video file.

    Returns:
        Path of the generated ``.mp3`` file.

    Raises:
        Exception: If ffmpeg exits with a non-zero return code.
    """
    mp3_path = os.path.splitext(video_path)[0] + ".mp3"
    cmd = [
        "ffmpeg", "-y", "-i", video_path,
        "-acodec", "libmp3lame", "-ar", "16000", "-ac", "1", "-q:a", "2",
        mp3_path,
    ]
    # subprocess.run blocks, so it runs in a worker thread.
    result = await asyncio.to_thread(
        subprocess.run, cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE
    )
    if result.returncode != 0:
        # ffmpeg's stderr is not guaranteed to be valid UTF-8; a strict decode
        # could raise UnicodeDecodeError and hide the actual ffmpeg failure.
        stderr_text = result.stderr.decode(errors="replace")
        raise Exception(f"FFmpeg failed: {stderr_text}")
    return mp3_path


async def _generate_title_from_transcript(transcript: str) -> str:
    """Ask the LLM for a short title summarising *transcript*.

    Only the first 1000 characters of the transcript are sent.

    Returns:
        The title with surrounding whitespace and quote characters removed,
        or an empty string when the LLM produced nothing usable.
    """
    prompt = f"请根据以下视频文案总结一个简短的标题(20字以内),不要包含任何解释性文字,直接返回标题:\n\n{transcript[:1000]}"

    # get_llm_response is an async generator even when stream=False.
    chunks = [chunk async for chunk in get_llm_response(prompt, stream=False)]

    # Strip BEFORE the caller tests for emptiness, so a reply made only of
    # quotes/whitespace is reported as "no title" instead of "".
    return "".join(chunks).strip().strip('"“”').strip()


async def process_video_task(url: str, request_id: str, share_text: str = "") -> None:
    """Run the full processing pipeline for one video record.

    Steps: mark the record PROCESSING, download the video into a per-task
    temp directory, upload it to OBS, transcode it to MP3, transcribe the
    audio, optionally generate a title via the LLM, and store the result as
    COMPLETED. Any failure inside the pipeline is logged and recorded as
    FAILED on the record; the temp directory is removed in every case.

    Args:
        url: Video URL to download.
        request_id: Id of the DB record tracking this task.
        share_text: Original share text, used to extract a title. May be
            empty (e.g. when a task is restarted after a server restart).
    """
    logger.info(f"Processing task {request_id}")

    model = DouYinModel()
    # 1. Update status
    await model.update_status(request_id, "PROCESSING")

    temp_dir = os.path.abspath(f"temp_{request_id}")
    try:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(temp_dir, exist_ok=True)

        # 2. Parse & Download
        downloader = DouYinDownloader()
        if not url:
            raise Exception("No valid URL found")

        logger.info(f"Downloading from {url}")
        # Run download in a thread to avoid blocking the event loop.
        local_video_path, title = await asyncio.to_thread(
            downloader.download_video, url, temp_dir
        )

        # Title priority:
        #   1. Title extracted from the share text (if available and valid)
        #   2. Title reported by the download (often "Unknown Title")
        #   3. Title generated by the LLM (step 6 below)
        extracted_title = downloader.extract_title_from_text(share_text)
        logger.info(f"Extracted title from text: {extracted_title}")

        if extracted_title and extracted_title != "Unknown Title":
            title = extracted_title
        elif not title:
            title = "Unknown Title"

        if not local_video_path or not os.path.exists(local_video_path):
            raise Exception("Download failed")

        # 3. Upload Video to OBS (long term storage)
        logger.info("Uploading video to OBS...")
        uploader = ObsUploader()
        video_filename = os.path.basename(local_video_path)
        obs_video_key = f"{OBS_CLOUD_PREFIX}/DouYin/{video_filename}"

        success, _ = await asyncio.to_thread(
            uploader.upload_file, obs_video_key, local_video_path, OBS_BUCKET
        )
        if not success:
            raise Exception("OBS Upload failed")

        # Construct public URL (assuming the standard OBS pattern).
        obs_url = f"https://{OBS_BUCKET}.{OBS_SERVER}/{obs_video_key}"

        # 4. Convert to MP3
        logger.info("Converting to MP3...")
        mp3_path = await _convert_video_to_mp3(local_video_path)

        # 5. ASR (upload MP3 and transcribe), run in a thread.
        logger.info("Transcribing...")
        asr = ASRClient()
        transcript = await asyncio.to_thread(asr.upload_and_transcribe_sync, mp3_path)

        if not transcript:
            raise Exception("Transcription failed (returned empty)")

        # 6. LLM title generation, only when no usable title exists yet.
        if not title or title in ("Unknown Title", "Unknown"):
            try:
                logger.info("Generating title from transcript via LLM...")
                llm_title = await _generate_title_from_transcript(transcript)
                if llm_title:
                    logger.info(f"LLM generated title: {llm_title}")
                    title = llm_title
            except Exception as llm_e:
                # Title generation is an enhancement; never fail the task for it.
                logger.warning(f"LLM Title generation failed: {llm_e}")

        # 7. Save to DB (update)
        logger.info("Saving to DB...")
        await model.update_record(request_id, title, obs_url, transcript, "COMPLETED")
        logger.info(f"Task {request_id} completed successfully.")

    except Exception as e:
        logger.error(f"Task {request_id} failed: {e}", exc_info=True)
        try:
            await model.update_status(request_id, "FAILED", str(e))
        except Exception as status_e:
            # Keep a DB error here from escaping the task and hiding the
            # original failure that was just logged.
            logger.error(
                f"Could not mark task {request_id} as FAILED: {status_e}",
                exc_info=True,
            )
    finally:
        # 8. Cleanup
        if os.path.exists(temp_dir):
            try:
                # shutil.rmtree is synchronous, so run it in a thread.
                await asyncio.to_thread(shutil.rmtree, temp_dir, ignore_errors=True)
            except Exception as e:
                logger.error(f"Cleanup failed: {e}")
|
||
|
||
@router.post("/api/parse")
async def parse(request: ParseRequest, background_tasks: BackgroundTasks):
    """Create one processing task per URL found in the submitted text.

    For every URL a record is inserted and ``process_video_task`` is scheduled
    as a background task.

    Returns:
        A dict with the first created id (``id``), all created ids (``ids``)
        and the status ``"PENDING"``.

    Raises:
        HTTPException: 400 when no URL can be found in the text, 500 when
            creating the records or scheduling the tasks fails.
    """
    downloader = DouYinDownloader()
    urls = downloader.extract_urls(request.text)

    if not urls:
        # Fallback: treat the whole text as a direct link. Surrounding
        # whitespace is ignored and a real scheme is required, so arbitrary
        # prose that merely starts with "http" is not queued as a URL.
        candidate = request.text.strip()
        if candidate.startswith(("http://", "https://")):
            urls = [candidate]
        else:
            raise HTTPException(status_code=400, detail="No valid URLs found")

    created_ids = []
    try:
        model = DouYinModel()
        for url in urls:
            req_id = str(uuid.uuid4())
            await model.insert_record(req_id, url)
            created_ids.append(req_id)
            # Pass the full share text so a title can be extracted from it.
            background_tasks.add_task(process_video_task, url, req_id, request.text)
    except Exception as e:
        # Log with traceback; the HTTP response only carries the message.
        logger.error(f"Failed to create parse tasks: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"DB Init Error: {e}") from e

    return {"id": created_ids[0] if created_ids else None, "ids": created_ids, "status": "PENDING"}
|
||
|
||
@router.get("/api/records")
async def get_records():
    """Return the result of ``DouYinModel.get_records()``.

    Best effort: any error is logged with its traceback and an empty list is
    returned instead of an error response.
    """
    try:
        return await DouYinModel().get_records()
    except Exception as exc:
        logger.error(f"Get records error: {exc}", exc_info=True)
    return []
|
||
|
||
@router.delete("/api/records/{id}")
async def delete_record(id: str):
    """Delete the record identified by the ``id`` path parameter.

    Returns:
        ``{"status": "deleted"}`` on success.

    Raises:
        HTTPException: 500 carrying the error message when deletion fails.
    """
    # NOTE: the parameter is named ``id`` (shadowing the builtin) because it
    # has to match the ``{id}`` placeholder in the route path.
    try:
        model = DouYinModel()
        await model.delete_record(id)
        return {"status": "deleted"}
    except Exception as e:
        # Log with traceback so the failure is visible server-side as well.
        logger.error(f"Delete record {id} failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
|
||
|
||
@router.post("/api/douyin/summary")
async def generate_summary(request: SummaryRequest):
    """Stream an LLM-generated knowledge summary of the requested transcripts.

    Transcripts are fetched for ``request.ids``. When there are no records,
    or none of them has transcript text, a single explanatory message is
    streamed instead of calling the LLM.

    Raises:
        HTTPException: 500 carrying the error message when preparation fails.
    """
    stream_media_type = "text/event-stream"

    async def single_message(message):
        # One-item stream used for the "nothing to summarise" answers.
        yield message

    try:
        # Fetch transcripts
        records = await DouYinModel().get_transcripts(ids=request.ids)

        if not records:
            return StreamingResponse(
                single_message("未找到可总结的已完成记录,请先解析视频。"),
                media_type=stream_media_type,
            )

        # Concatenate every record that actually has a transcript.
        full_text = "".join(
            f"【标题:{r['video_name']}】\n内容:{r['transcript']}\n\n"
            for r in records
            if r['transcript']
        )

        if not full_text:
            return StreamingResponse(
                single_message("记录中没有有效的文案内容。"),
                media_type=stream_media_type,
            )

        # The content is cut at 15000 characters to limit the context size;
        # a smaller model might need less.
        # The literal is kept flush-left so that no source indentation
        # becomes part of the prompt text.
        prompt = f"""
请对以下充电行业相关的视频内容进行知识精华提取。
要求:
1. 忽略无关闲聊和口语化表达;
2. 按条目列出核心知识点,不要长篇大论;
3. 保持简洁专业,只保留干货;
4. 返回格式为Markdown列表。

内容如下:
{full_text[:15000]}
"""

        return StreamingResponse(get_llm_response(prompt), media_type=stream_media_type)

    except Exception as e:
        logger.error(f"Summary generation failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
||
|
||
# Strong references to restarted tasks that are still running. The event loop
# itself only keeps weak references to tasks, so a task whose handle is
# dropped may be garbage-collected before it finishes.
_recovery_tasks = set()


async def recover_pending_tasks():
    """
    Check for tasks stuck in PENDING or PROCESSING state (due to server restart)
    and restart them.

    Each interrupted task is rescheduled on the running event loop via
    ``process_video_task``. Errors while scanning or scheduling are logged
    and not propagated.
    """
    logger.info("Scanning for interrupted Douyin tasks...")
    try:
        model = DouYinModel()
        tasks = await model.get_interrupted_tasks()

        if not tasks:
            logger.info("No interrupted tasks found.")
            return

        logger.info(f"Found {len(tasks)} interrupted tasks. Restarting...")
        for task in tasks:
            req_id = task['id']
            url = task['original_text']
            # The original share text is not available here, so an empty
            # string is passed; the title then falls back to whatever the
            # download or the LLM step provides.
            restarted = asyncio.create_task(process_video_task(url, req_id, share_text=""))
            # Hold a reference until the task is done, then release it.
            _recovery_tasks.add(restarted)
            restarted.add_done_callback(_recovery_tasks.discard)

    except Exception as e:
        logger.error(f"Failed to recover tasks: {e}", exc_info=True)
|