Files
aiData/DouYin/transcribe_videos.py

69 lines
2.2 KiB
Python
Raw Normal View History

2026-01-20 19:06:36 +08:00
import os
import logging
import sys
2026-01-20 21:43:54 +08:00
import time
2026-01-20 19:06:36 +08:00
# Ensure project root is in path
sys.path.append(r"d:\dsWork\aiData")
from Util.ASRClient import ASRClient
# Configure logging: INFO and above go to stdout, each record prefixed with
# its timestamp and level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    stream=sys.stdout,
)
logger = logging.getLogger("Transcriber")
2026-01-20 21:43:54 +08:00
def transcribe_all(
    audio_dir=r"d:\dsWork\aiData\DouYin\Audios",
    transcript_dir=r"d:\dsWork\aiData\DouYin\Transcripts",
):
    """Transcribe every ``.mp3`` file in *audio_dir* to a ``.txt`` transcript.

    For each audio file ``<name>.mp3`` a transcript ``<name>.txt`` is written
    to *transcript_dir*. Files whose transcript already exists and is
    non-empty are skipped, so the function can be re-run to resume an
    interrupted batch. Per-file errors are logged and do not stop the batch.

    Args:
        audio_dir: Directory scanned (non-recursively) for ``.mp3`` files.
            Defaults to the project's audio directory.
        transcript_dir: Directory that receives the transcripts; created if
            it does not exist. Defaults to the project's transcript directory.

    Raises:
        OSError: If *audio_dir* cannot be listed or *transcript_dir* cannot
            be created.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(transcript_dir, exist_ok=True)

    client = ASRClient()

    # Sorted so that repeated runs process the files in the same order.
    files = sorted(f for f in os.listdir(audio_dir) if f.endswith(".mp3"))
    logger.info(f"Found {len(files)} audio files.")

    for filename in files:
        audio_path = os.path.join(audio_dir, filename)
        name, _ = os.path.splitext(filename)
        txt_filename = f"{name}.txt"
        txt_path = os.path.join(transcript_dir, txt_filename)

        if os.path.exists(txt_path):
            if os.path.getsize(txt_path) > 0:
                logger.info(f"Skipping (already exists): {txt_filename}")
                continue
            # An existing but empty transcript is redone rather than skipped.
            logger.info(f"Re-processing empty file: {txt_filename}")

        logger.info(f"Processing: {filename}")
        try:
            # Direct local file transcription using the synchronous method.
            text = client.transcribe_file_sync(audio_path)

            if text:
                with open(txt_path, 'w', encoding='utf-8') as f:
                    f.write(text)
                logger.info(f"Saved transcript to: {txt_filename}")
            else:
                # A falsy result is reported as a failure; nothing is written.
                logger.error(f"Failed to transcribe: {filename}")

            # Small delay between files.
            time.sleep(1)
        except Exception as e:
            # Log with traceback and carry on with the next file.
            logger.error(f"Error processing {filename}: {str(e)}", exc_info=True)
2026-01-20 19:06:36 +08:00
if __name__ == "__main__":
    # Script entry point: run the whole batch, treating Ctrl+C as a normal
    # stop and any other uncaught error as a failed run.
    try:
        transcribe_all()
    except KeyboardInterrupt:
        logger.info("Stopped by user")
    except Exception as e:
        logger.error(f"Fatal error: {str(e)}", exc_info=True)
        # Exit non-zero so shells and schedulers can detect the failure.
        sys.exit(1)