# aiData/WeiXin/T6_VLM_Voice_Debug.py

# coding=utf-8
import asyncio
import logging
import os
import sys
import time

import cv2
import uiautomator2 as u2

# Make the directory one level above this script's folder importable, so the
# project-local imports that follow resolve when the file is run directly.
_script_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(_script_dir)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from WeiXin.WxUtil import get_vlm_analysis
from Util.EasyOcrKit import EasyOcrKit

# Logging setup: INFO level on the root logger with a timestamped format,
# plus a dedicated named logger for this debug tool.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger("T6_Debug")

async def main():
    logger.info("🚀 T6 VLM 语音坐标调试工具启动...")
    
    # 连接设备
    try:
        d = u2.connect()
        logger.info(f"设备已连接: {d.info.get('serial')}")
    except Exception as e:
        logger.error(f"设备连接失败: {e}")
        return

    # 截图目录
    screenshots_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Screenshots")
    if not os.path.exists(screenshots_dir):
        os.makedirs(screenshots_dir)

    # 截图
    screenshot_path = os.path.join(screenshots_dir, "t6_debug_temp.jpg")
    logger.info("📸 正在截图...")
    d.screenshot(screenshot_path)
    
    # 调用 VLM 分析
    logger.info("🧠 正在调用 VLM 分析图片...")
    result_data = await get_vlm_analysis(screenshot_path)
    
    if not result_data:
        logger.error("❌ VLM 分析返回为空")
        return

    logger.info(f"VLM 返回结果: {result_data}")

    # 读取图片用于绘制
    img = cv2.imread(screenshot_path)
    if img is None:
        logger.error("❌ 无法读取截图文件")
        return

    messages = result_data.get("messages", [])
    voice_count = 0
    
    for msg in messages:
        msg_type = msg.get("type")
        content = msg.get("content")
        coords = msg.get("coordinates") or msg.get("center")
        
        if not coords:
            continue
            
        x, y = coords
        
        if msg_type == "voice":
            voice_count += 1
            logger.info(f"🎤 发现语音消息: {content}, 坐标: ({x}, {y})")
            
            # 绘制绿框 (语音)
            w, h = 300, 80
            top_left = (int(x - w/2), int(y - h/2))
            bottom_right = (int(x + w/2), int(y + h/2))
            
            cv2.rectangle(img, top_left, bottom_right, (0, 255, 0), 3)
            cv2.circle(img, (int(x), int(y)), 5, (0, 0, 255), -1)
            label = f"Voice ({x},{y})"
            cv2.putText(img, label, (top_left[0], top_left[1] - 10), 
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
                        
    # 保存结果图片
    output_path = os.path.join(screenshots_dir, "t6_debug_result.jpg")
    cv2.imwrite(output_path, img)
    logger.info(f"✅ 结果已保存至: {output_path}")
    logger.info(f"共标记了 {voice_count} 条语音消息。请检查图片是否准确。")

    # --- 验证转文字功能 (处理最后一条未转换语音) ---
    logger.info("="*30)
    logger.info("🔍 开始验证“转文字”功能 (仅针对最后一条未转换语音)...")
    
    # 筛选未转换的语音
    unconverted_voices = []
    for msg in messages:
        if msg.get("type") == "voice" and msg.get("status") == "unconverted":
             coords = msg.get("coordinates") or msg.get("center")
             if coords:
                 msg["coordinates"] = coords
                 unconverted_voices.append(msg)
    
    if not unconverted_voices:
        logger.info("⚠️ 没有发现未转换的语音消息，跳过验证。")
    else:
        last_voice = unconverted_voices[-1]
        vx, vy = last_voice['coordinates']
        content = last_voice.get('content', '0"')
        logger.info(f"🎯 目标语音: {content}, 坐标: ({vx}, {vy})")
        
        # 1. 长按
        logger.info(f"👆 长按语音消息...")
        d.long_click(vx, vy, 1.5)
        time.sleep(1.0)
        
        # 2. 截图菜单
        menu_shot_path = os.path.join(screenshots_dir, "t6_menu_shot.jpg")
        logger.info(f"📸 截取菜单: {menu_shot_path}")
        d.screenshot(menu_shot_path)
        
        # 3. OCR 识别
        logger.info("🧠 正在进行 OCR 识别菜单...")
        ocr_kit = EasyOcrKit()
        ocr_results = ocr_kit.read_text(menu_shot_path)
        
        convert_btn_center = None
        for bbox, text, conf in ocr_results:
            if "转文字" in text or "转换为文字" in text:
                c_x = int((bbox[0][0] + bbox[2][0]) / 2)
                c_y = int((bbox[0][1] + bbox[2][1]) / 2)
                convert_btn_center = (c_x, c_y)
                logger.info(f"✅ OCR 找到 '{text}' 按钮: {convert_btn_center}")
                break
        
        if convert_btn_center:
            # 4. 点击转文字
            logger.info(f"👆 点击转文字按钮: {convert_btn_center}")
            d.click(convert_btn_center[0], convert_btn_center[1])
            
            # 5. 动态等待
            duration_str = content.replace('"', '').strip()
            try:
                duration = int(duration_str)
            except:
                duration = 10
            
            wait_seconds = max(2, duration / 5.0)
            logger.info(f"⏳ 语音时长 {duration}s，模拟等待 {wait_seconds:.1f}s...")
            time.sleep(wait_seconds)
            logger.info("✅ 流程执行完毕！请检查手机屏幕是否已开始转换。")
        else:
            logger.error("❌ OCR 未找到 '转文字' 按钮！")
            # 点击空白处关闭
            d.click(vx + 200, vy)

if __name__ == "__main__":
    # On Windows, install the selector-based event loop policy before
    # asyncio.run() creates the loop.
    running_on_windows = sys.platform.startswith('win')
    if running_on_windows:
        selector_policy = asyncio.WindowsSelectorEventLoopPolicy()
        asyncio.set_event_loop_policy(selector_policy)
    asyncio.run(main())