"""FastAPI routes and background pipeline for parsing Douyin share links.

The pipeline downloads a video, uploads it to OBS, extracts an MP3 with
ffmpeg, transcribes it, optionally asks the LLM for a title, and stores the
result through ``DouYinModel``.
"""
import os
import logging
import uuid
import shutil
import subprocess
import asyncio
from datetime import datetime
from typing import List, Optional, Set

from fastapi import APIRouter, HTTPException, BackgroundTasks
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

# Import custom modules
from Config.Config import OBS_CLOUD_PREFIX, OBS_BUCKET, OBS_TMP_PREFIX, OBS_SERVER
from Util.DouYinDownloader import DouYinDownloader
from Util.ObsUtil import ObsUploader
from Util.ASRClient import ASRClient
from Util.LlmUtil import get_llm_response
from Model.DouYinModel import DouYinModel

# Logger setup
logger = logging.getLogger(__name__)

router = APIRouter()

# Titles that count as "no real title" and trigger LLM title generation.
_PLACEHOLDER_TITLES = ("Unknown Title", "Unknown")

# Quote characters trimmed from both ends of an LLM-generated title.
_TITLE_QUOTE_CHARS = '"“”'

# Strong references to tasks started by recover_pending_tasks(). The event
# loop only keeps weak references to tasks, so without this set a recovered
# task could be garbage-collected before it finishes.
_recovery_tasks: Set["asyncio.Task"] = set()


class ParseRequest(BaseModel):
    """Request body for ``POST /api/parse``."""

    # Share text (or a bare URL) from which video URLs are extracted.
    text: str


class SummaryRequest(BaseModel):
    """Request body for ``POST /api/douyin/summary``."""

    # Record ids forwarded to ``DouYinModel.get_transcripts``.
    ids: List[str] = []


def _clean_llm_title(raw_title: str) -> str:
    """Return *raw_title* without surrounding whitespace and quote marks."""
    return raw_title.strip().strip(_TITLE_QUOTE_CHARS).strip()


async def _single_message_stream(message: str):
    """Yield *message* once; used as the body of a one-shot streaming reply."""
    yield message


async def process_video_task(url: str, request_id: str, share_text: str = ""):
    """Run the full download/upload/transcribe pipeline for one record.

    Args:
        url: Video URL to download.
        request_id: Id of the DB record to update with status and results.
        share_text: Original share text; used only to extract a title.

    Any failure inside the pipeline is logged and recorded on the DB record
    as ``FAILED``; the temporary working directory is always removed.
    """
    logger.info(f"Processing task {request_id}")
    model = DouYinModel()

    # 1. Update status
    await model.update_status(request_id, "PROCESSING")

    temp_dir = os.path.abspath(f"temp_{request_id}")

    try:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(temp_dir, exist_ok=True)

        # 2. Parse & Download
        downloader = DouYinDownloader()

        if not url:
            raise Exception("No valid URL found")

        logger.info(f"Downloading from {url}")
        # Run download in thread to avoid blocking main loop
        local_video_path, title = await asyncio.to_thread(
            downloader.download_video, url, temp_dir
        )

        # Title handling strategy:
        # Priority 1: Extracted from share text (if available and valid)
        # Priority 2: Title returned by the download step
        # Priority 3: Generated by LLM (done later)
        extracted_title = downloader.extract_title_from_text(share_text)
        logger.info(f"Extracted title from text: {extracted_title}")

        if extracted_title and extracted_title != "Unknown Title":
            title = extracted_title
        elif not title:
            title = "Unknown Title"

        if not local_video_path or not os.path.exists(local_video_path):
            raise Exception("Download failed")

        # 3. Upload Video to OBS (Long term storage)
        logger.info("Uploading video to OBS...")
        uploader = ObsUploader()
        video_filename = os.path.basename(local_video_path)
        obs_video_key = f"{OBS_CLOUD_PREFIX}/DouYin/{video_filename}"

        success, _ = await asyncio.to_thread(
            uploader.upload_file, obs_video_key, local_video_path, OBS_BUCKET
        )
        if not success:
            raise Exception("OBS Upload failed")

        # Construct public URL (Assuming standard OBS pattern or Config logic)
        obs_url = f"https://{OBS_BUCKET}.{OBS_SERVER}/{obs_video_key}"

        # 4. Convert to MP3
        logger.info("Converting to MP3...")
        mp3_path = os.path.splitext(local_video_path)[0] + ".mp3"
        cmd = [
            "ffmpeg", "-y",
            "-i", local_video_path,
            "-acodec", "libmp3lame",
            "-ar", "16000",
            "-ac", "1",
            "-q:a", "2",
            mp3_path,
        ]

        # Run ffmpeg in thread
        result = await asyncio.to_thread(
            subprocess.run, cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE
        )
        if result.returncode != 0:
            # ffmpeg output is not guaranteed to be valid UTF-8; never let a
            # decode error replace the real failure message.
            stderr_text = result.stderr.decode(errors="replace")
            raise Exception(f"FFmpeg failed: {stderr_text}")

        # 5. ASR (Upload MP3 to tmp and transcribe)
        logger.info("Transcribing...")
        asr = ASRClient()
        # Run ASR in thread
        transcript = await asyncio.to_thread(asr.upload_and_transcribe_sync, mp3_path)

        if not transcript:
            raise Exception("Transcription failed (returned empty)")

        # 6. LLM Title Generation (Enhancement)
        # Only attempted when no usable title was found in the earlier steps.
        if not title or title in _PLACEHOLDER_TITLES:
            try:
                logger.info("Generating title from transcript via LLM...")
                prompt = f"请根据以下视频文案总结一个简短的标题(20字以内),不要包含任何解释性文字,直接返回标题:\n\n{transcript[:1000]}"

                llm_title_chunks = []
                # get_llm_response is already async
                async for chunk in get_llm_response(prompt, stream=False):
                    llm_title_chunks.append(chunk)

                # Clean first, then test: a reply made only of whitespace or
                # quotes must not overwrite the title with an empty string.
                llm_title = _clean_llm_title("".join(llm_title_chunks))
                if llm_title:
                    logger.info(f"LLM generated title: {llm_title}")
                    title = llm_title
            except Exception as llm_e:
                # Best effort: a failed title generation must not fail the task.
                logger.warning(f"LLM Title generation failed: {llm_e}")

        # 7. Save to DB (Update)
        logger.info("Saving to DB...")
        await model.update_record(request_id, title, obs_url, transcript, "COMPLETED")
        logger.info(f"Task {request_id} completed successfully.")

    except Exception as e:
        logger.error(f"Task {request_id} failed: {e}", exc_info=True)
        await model.update_status(request_id, "FAILED", str(e))
    finally:
        # 8. Cleanup
        if os.path.exists(temp_dir):
            try:
                # shutil.rmtree is sync, wrap it
                await asyncio.to_thread(shutil.rmtree, temp_dir, ignore_errors=True)
            except Exception as e:
                logger.error(f"Cleanup failed: {e}")


@router.post("/api/parse")
async def parse(request: ParseRequest, background_tasks: BackgroundTasks):
    """Create one PENDING record per URL found in the text and queue processing.

    Returns a dict with the first created id (``id``), all created ids
    (``ids``) and the status ``"PENDING"``.

    Raises:
        HTTPException: 400 when no URL can be found, 500 when creating the
            records fails.
    """
    downloader = DouYinDownloader()
    urls = downloader.extract_urls(request.text)

    if not urls:
        # Fall back to treating the whole text as a direct link. Strip first
        # so pasted text with surrounding whitespace is still recognized.
        candidate = request.text.strip()
        if candidate.startswith(("http://", "https://")):
            urls = [candidate]
        else:
            raise HTTPException(status_code=400, detail="No valid URLs found")

    created_ids = []
    try:
        model = DouYinModel()
        for url in urls:
            req_id = str(uuid.uuid4())
            await model.insert_record(req_id, url)
            created_ids.append(req_id)

            # Pass request.text (the full share text) so we can extract title from it
            background_tasks.add_task(process_video_task, url, req_id, request.text)
    except Exception as e:
        logger.error(f"Parse request failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"DB Init Error: {e}") from e

    return {
        "id": created_ids[0] if created_ids else None,
        "ids": created_ids,
        "status": "PENDING",
    }


@router.get("/api/records")
async def get_records():
    """Return all records; on any error, log it and return an empty list."""
    try:
        model = DouYinModel()
        return await model.get_records()
    except Exception as e:
        logger.error(f"Get records error: {e}", exc_info=True)
        return []


@router.delete("/api/records/{id}")
async def delete_record(id: str):
    """Delete the record with the given id.

    The parameter shadows the ``id`` builtin because its name is bound to
    the ``{id}`` path parameter.

    Raises:
        HTTPException: 500 when the deletion fails.
    """
    try:
        model = DouYinModel()
        await model.delete_record(id)
        return {"status": "deleted"}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.post("/api/douyin/summary")
async def generate_summary(request: SummaryRequest):
    """Stream an LLM summary of the transcripts of the requested records.

    When no records or no transcripts are available, a single explanatory
    message is streamed instead.

    Raises:
        HTTPException: 500 when preparing the summary fails.
    """
    try:
        # Fetch transcripts
        model = DouYinModel()
        records = await model.get_transcripts(ids=request.ids)

        if not records:
            # If no records, just return a simple message stream
            return StreamingResponse(
                _single_message_stream("未找到可总结的已完成记录,请先解析视频。"),
                media_type="text/event-stream",
            )

        # Prepare text: one titled section per record that has a transcript.
        full_text = "".join(
            f"【标题:{r['video_name']}】\n内容:{r['transcript']}\n\n"
            for r in records
            if r['transcript']
        )

        if not full_text:
            return StreamingResponse(
                _single_message_stream("记录中没有有效的文案内容。"),
                media_type="text/event-stream",
            )

        # Prompt. The transcript text is capped at 15000 characters to keep
        # the request within the model's context limit.
        prompt = f"""
请对以下充电行业相关的视频内容进行知识精华提取。
要求:
1. 忽略无关闲聊和口语化表达;
2. 按条目列出核心知识点,不要长篇大论;
3. 保持简洁专业,只保留干货;
4. 返回格式为Markdown列表。

内容如下:
{full_text[:15000]}
"""

        return StreamingResponse(get_llm_response(prompt), media_type="text/event-stream")

    except Exception as e:
        logger.error(f"Summary generation failed: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e


async def recover_pending_tasks():
    """
    Check for tasks stuck in PENDING or PROCESSING state (due to server restart)
    and restart them.

    Errors are logged and never raised to the caller.
    """
    logger.info("Scanning for interrupted Douyin tasks...")
    try:
        model = DouYinModel()
        tasks = await model.get_interrupted_tasks()

        if not tasks:
            logger.info("No interrupted tasks found.")
            return

        logger.info(f"Found {len(tasks)} interrupted tasks. Restarting...")

        for record in tasks:
            req_id = record['id']
            url = record['original_text']

            # Restart task in background.
            # Note: the original share text is not available here, so an
            # empty string is passed and no title can be extracted from it.
            restarted = asyncio.create_task(
                process_video_task(url, req_id, share_text="")
            )
            # Hold a strong reference until the task is done so it cannot be
            # garbage-collected mid-run.
            _recovery_tasks.add(restarted)
            restarted.add_done_callback(_recovery_tasks.discard)

    except Exception as e:
        logger.error(f"Failed to recover tasks: {e}", exc_info=True)