# Batch transcription script: turns the .mp3 files of an audio directory into
# .txt transcripts using the project's ASRClient.

import os
import logging
import sys
import time

# Ensure project root is in path so the project-local ``Util`` package can be
# imported. Guarded so that re-importing this module does not append the same
# entry to sys.path again.
_PROJECT_ROOT = r"d:\dsWork\aiData"
if _PROJECT_ROOT not in sys.path:
    sys.path.append(_PROJECT_ROOT)

from Util.ASRClient import ASRClient  # noqa: E402  (must follow the sys.path setup)

# Configure logging: INFO and above, timestamped, written to stdout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    stream=sys.stdout,
)
logger = logging.getLogger("Transcriber")


def transcribe_all(
    audio_dir=r"d:\dsWork\aiData\DouYin\Audios",
    transcript_dir=r"d:\dsWork\aiData\DouYin\Transcripts",
    client=None,
    delay=1,
):
    """Transcribe every ``.mp3`` file in *audio_dir* into a ``.txt`` transcript.

    Files are processed in sorted name order. For an audio file ``NAME.mp3``
    the transcript is written to ``transcript_dir/NAME.txt`` (UTF-8). A file
    is skipped when a non-empty transcript already exists; an existing but
    empty transcript is processed again.

    Args:
        audio_dir: Directory scanned (non-recursively) for ``.mp3`` files.
            The extension is matched case-insensitively.
        transcript_dir: Directory that receives the transcripts; created
            (including parents) when missing.
        client: Object exposing ``transcribe_file_sync(path)``. When ``None``
            a new ``ASRClient()`` is created. A falsy result from the client
            is treated as a failed transcription and no file is written.
        delay: Seconds to sleep after each file whose transcription call
            returned without raising. Values <= 0 disable the pause.

    Returns:
        None.

    Raises:
        OSError: If *audio_dir* cannot be listed or *transcript_dir* cannot
            be created. Errors raised while handling an individual file are
            logged and do not stop the run.
    """
    # Same named logger the module configures at import time; looked up here
    # so the function does not depend on a module-level global.
    log = logging.getLogger("Transcriber")

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(transcript_dir, exist_ok=True)

    if client is None:
        client = ASRClient()

    # Sorted to ensure a deterministic processing order.
    files = sorted(f for f in os.listdir(audio_dir) if f.lower().endswith(".mp3"))
    log.info("Found %d audio files.", len(files))

    for filename in files:
        audio_path = os.path.join(audio_dir, filename)
        name, _ = os.path.splitext(filename)
        txt_filename = f"{name}.txt"
        txt_path = os.path.join(transcript_dir, txt_filename)

        if os.path.exists(txt_path):
            # Only a non-empty transcript counts as done; an empty file is
            # treated as a leftover of a failed run and is redone.
            if os.path.getsize(txt_path) > 0:
                log.info("Skipping (already exists): %s", txt_filename)
                continue
            log.info("Re-processing empty file: %s", txt_filename)

        log.info("Processing: %s", filename)

        try:
            # Direct local file transcription using the synchronous method.
            text = client.transcribe_file_sync(audio_path)

            if text:
                with open(txt_path, "w", encoding="utf-8") as f:
                    f.write(text)
                log.info("Saved transcript to: %s", txt_filename)
            else:
                log.error("Failed to transcribe: %s", filename)

            # Add a small delay between files.
            if delay > 0:
                time.sleep(delay)

        except Exception as e:
            # Per-file boundary: log with traceback and continue with the
            # next file instead of aborting the whole batch.
            log.error("Error processing %s: %s", filename, e, exc_info=True)


if __name__ == "__main__":
    # Script entry point: run the whole batch once with the default
    # directories.
    try:
        transcribe_all()
    except KeyboardInterrupt:
        # Ctrl+C is a deliberate stop by the operator, not a failure.
        logger.info("Stopped by user")
    except Exception as e:
        # Top-level boundary: record the traceback, then exit non-zero so
        # shells and schedulers can tell the run failed.
        logger.error("Fatal error: %s", e, exc_info=True)
        sys.exit(1)