Files
aiData/WeiXin/T6_VLM_Voice_Debug.py
HuangHai bf485d10f1 'commit'
2026-01-25 12:52:52 +08:00

164 lines
5.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
import asyncio
import logging
import os
import sys
import time
import cv2
import uiautomator2 as u2
# Put the project root (the parent of this file's directory) on sys.path so
# the project-local imports that follow can be resolved.
_this_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(_this_dir)
if project_root not in sys.path:
    sys.path.append(project_root)
from WeiXin.WxUtil import get_vlm_analysis
from Util.EasyOcrKit import EasyOcrKit
# Configure root logging (INFO level, timestamped records) and create the
# logger used throughout this debug script.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger("T6_Debug")
def _as_point(coords):
    """Return ``coords`` as an ``(x, y)`` tuple, or ``None`` if it is unusable.

    Only a two-element list/tuple of ``int``/``float`` values is accepted.
    The coordinates come from model output, so anything else is rejected here
    instead of raising later during unpacking or arithmetic.
    """
    if not isinstance(coords, (list, tuple)) or len(coords) != 2:
        return None
    for value in coords:
        # bool is a subclass of int but is never a meaningful coordinate.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return None
    return coords[0], coords[1]


def _draw_voice_marker(img, x, y, box_size=(300, 80)):
    """Draw a box, a centre dot and a text label for one voice message.

    The box is ``box_size`` (width, height) pixels, centred on ``(x, y)``.
    ``img`` is modified in place.
    """
    w, h = box_size
    top_left = (int(x - w / 2), int(y - h / 2))
    bottom_right = (int(x + w / 2), int(y + h / 2))
    # Green box (voice)
    cv2.rectangle(img, top_left, bottom_right, (0, 255, 0), 3)
    # Negative thickness asks OpenCV for a filled circle.
    cv2.circle(img, (int(x), int(y)), 5, (0, 0, 255), -1)
    label = f"Voice ({x},{y})"
    cv2.putText(img, label, (top_left[0], top_left[1] - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)


def _find_convert_button(ocr_results):
    """Locate the "convert to text" menu entry in OCR output.

    ``ocr_results`` is iterated as ``(bbox, text, confidence)`` triples.
    Returns ``((x, y), text)`` for the first entry whose text contains one of
    the button captions, or ``None`` when there is no such entry.
    """
    for bbox, text, _conf in ocr_results or ():
        if "转文字" in text or "转换为文字" in text:
            # bbox[0] and bbox[2] are treated as opposite corners of the box
            # (presumably top-left / bottom-right -- confirm against EasyOcrKit).
            c_x = int((bbox[0][0] + bbox[2][0]) / 2)
            c_y = int((bbox[0][1] + bbox[2][1]) / 2)
            return (c_x, c_y), text
    return None


def _parse_voice_seconds(content, default=10):
    """Parse a voice-length caption such as ``5"`` into whole seconds.

    Returns ``default`` when the caption (with double quotes removed) is not
    an integer, including when ``content`` is ``None``.
    """
    try:
        return int(str(content).replace('"', '').strip())
    except ValueError:
        return default


async def _verify_voice_to_text(d, content, vx, vy, screenshots_dir):
    """Long-press the voice message at ``(vx, vy)`` and tap "convert to text".

    A screenshot of the context menu is saved into ``screenshots_dir`` and run
    through OCR to find the button. If the button is not found, an error is
    logged and a tap next to the message is issued to dismiss the menu.
    """
    logger.info(f"🎯 目标语音: {content}, 坐标: ({vx}, {vy})")

    # 1. Long-press the message
    logger.info("👆 长按语音消息...")
    d.long_click(vx, vy, 1.5)
    await asyncio.sleep(1.0)

    # 2. Screenshot the context menu
    menu_shot_path = os.path.join(screenshots_dir, "t6_menu_shot.jpg")
    logger.info(f"📸 截取菜单: {menu_shot_path}")
    d.screenshot(menu_shot_path)

    # 3. OCR the menu
    logger.info("🧠 正在进行 OCR 识别菜单...")
    ocr_kit = EasyOcrKit()
    ocr_results = ocr_kit.read_text(menu_shot_path)
    match = _find_convert_button(ocr_results)

    if match is None:
        logger.error("❌ OCR 未找到 '转文字' 按钮!")
        # Tap 200 px to the right of the message to close the menu
        # (original intent: click a blank area to dismiss it).
        d.click(vx + 200, vy)
        return

    convert_btn_center, text = match
    logger.info(f"✅ OCR 找到 '{text}' 按钮: {convert_btn_center}")

    # 4. Tap the button
    logger.info(f"👆 点击转文字按钮: {convert_btn_center}")
    d.click(convert_btn_center[0], convert_btn_center[1])

    # 5. Wait proportionally to the voice length, but at least 2 seconds
    duration = _parse_voice_seconds(content)
    wait_seconds = max(2, duration / 5.0)
    logger.info(f"⏳ 语音时长 {duration}s模拟等待 {wait_seconds:.1f}s...")
    await asyncio.sleep(wait_seconds)
    logger.info("✅ 流程执行完毕!请检查手机屏幕是否已开始转换。")


async def main():
    """Debug the VLM voice-message coordinates and the "convert to text" flow.

    Steps:
      1. Connect to the device through uiautomator2 and take a screenshot.
      2. Send the screenshot to ``get_vlm_analysis`` and read its ``messages``.
      3. Mark every voice message on a copy of the screenshot and save it as
         ``Screenshots/t6_debug_result.jpg`` next to this file.
      4. For the last voice message whose status is ``"unconverted"``,
         long-press it, find the "convert to text" menu entry via OCR and tap it.

    Returns ``None``. Failures (no device, empty VLM result, unreadable
    screenshot) are logged and end the run early instead of raising.
    """
    logger.info("🚀 T6 VLM 语音坐标调试工具启动...")

    # Connect to the device
    try:
        d = u2.connect()
        logger.info(f"设备已连接: {d.info.get('serial')}")
    except Exception as e:
        logger.error(f"设备连接失败: {e}")
        return

    # Screenshot directory; exist_ok avoids a check-then-create race.
    screenshots_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Screenshots")
    os.makedirs(screenshots_dir, exist_ok=True)

    # Take the screenshot
    screenshot_path = os.path.join(screenshots_dir, "t6_debug_temp.jpg")
    logger.info("📸 正在截图...")
    d.screenshot(screenshot_path)

    # Run the VLM analysis
    logger.info("🧠 正在调用 VLM 分析图片...")
    result_data = await get_vlm_analysis(screenshot_path)
    if not result_data:
        logger.error("❌ VLM 分析返回为空")
        return
    logger.info(f"VLM 返回结果: {result_data}")

    # Load the screenshot for drawing
    img = cv2.imread(screenshot_path)
    if img is None:
        logger.error("❌ 无法读取截图文件")
        return

    # Collect every voice message with a usable point in a single pass, so a
    # malformed entry is reported once and never reaches the drawing or
    # clicking code.
    messages = result_data.get("messages") or []
    voices = []
    for msg in messages:
        coords = msg.get("coordinates") or msg.get("center")
        if not coords:
            continue
        point = _as_point(coords)
        if point is None:
            logger.warning(f"⚠️ 忽略坐标格式异常的消息: {coords!r}")
            continue
        if msg.get("type") == "voice":
            voices.append((msg, point))

    for msg, (x, y) in voices:
        logger.info(f"🎤 发现语音消息: {msg.get('content')}, 坐标: ({x}, {y})")
        _draw_voice_marker(img, x, y)

    # Save the annotated image; imwrite reports failure via its return value.
    output_path = os.path.join(screenshots_dir, "t6_debug_result.jpg")
    if cv2.imwrite(output_path, img):
        logger.info(f"✅ 结果已保存至: {output_path}")
    else:
        logger.error(f"❌ 结果图片保存失败: {output_path}")
    logger.info(f"共标记了 {len(voices)} 条语音消息。请检查图片是否准确。")

    # --- Verify the convert-to-text feature (last unconverted voice only) ---
    logger.info("="*30)
    logger.info("🔍 开始验证“转文字”功能 (仅针对最后一条未转换语音)...")

    unconverted_voices = [
        (msg, point) for msg, point in voices
        if msg.get("status") == "unconverted"
    ]
    if not unconverted_voices:
        logger.info("⚠️ 没有发现未转换的语音消息,跳过验证。")
        return

    last_voice, (vx, vy) = unconverted_voices[-1]
    content = last_voice.get('content', '0"')
    await _verify_voice_to_text(d, content, vx, vy, screenshots_dir)
if __name__ == "__main__":
    # Script entry point: on Windows, install the selector event loop policy
    # before starting the coroutine.
    running_on_windows = sys.platform.startswith('win')
    if running_on_windows:
        selector_policy = asyncio.WindowsSelectorEventLoopPolicy()
        asyncio.set_event_loop_policy(selector_policy)
    asyncio.run(main())