diff --git a/Test/Screenshots/chat_result_20260125_085757.jpg b/Test/Screenshots/chat_result_20260125_085757.jpg deleted file mode 100644 index 9ccc93d..0000000 Binary files a/Test/Screenshots/chat_result_20260125_085757.jpg and /dev/null differ diff --git a/Test/Screenshots/chat_result_20260125_085757_analyzed.jpg b/Test/Screenshots/chat_result_20260125_085757_analyzed.jpg deleted file mode 100644 index ab242e9..0000000 Binary files a/Test/Screenshots/chat_result_20260125_085757_analyzed.jpg and /dev/null differ diff --git a/Test/Screenshots/chat_result_20260125_085849.jpg b/Test/Screenshots/chat_result_20260125_085849.jpg deleted file mode 100644 index 5d97738..0000000 Binary files a/Test/Screenshots/chat_result_20260125_085849.jpg and /dev/null differ diff --git a/Test/Screenshots/chat_result_20260125_085849_analyzed.jpg b/Test/Screenshots/chat_result_20260125_085849_analyzed.jpg deleted file mode 100644 index 85bff18..0000000 Binary files a/Test/Screenshots/chat_result_20260125_085849_analyzed.jpg and /dev/null differ diff --git a/Test/Screenshots/chat_result_20260125_090600.jpg b/Test/Screenshots/chat_result_20260125_090600.jpg deleted file mode 100644 index b8eeebe..0000000 Binary files a/Test/Screenshots/chat_result_20260125_090600.jpg and /dev/null differ diff --git a/Test/Screenshots/chat_result_20260125_090600_analyzed.jpg b/Test/Screenshots/chat_result_20260125_090600_analyzed.jpg deleted file mode 100644 index 4d459f6..0000000 Binary files a/Test/Screenshots/chat_result_20260125_090600_analyzed.jpg and /dev/null differ diff --git a/Test/Screenshots/debug_20260125_085800_coord_01_before_click_input.jpg b/Test/Screenshots/debug_20260125_085800_coord_01_before_click_input.jpg deleted file mode 100644 index 5533f8b..0000000 Binary files a/Test/Screenshots/debug_20260125_085800_coord_01_before_click_input.jpg and /dev/null differ diff --git a/Test/Screenshots/debug_20260125_085802_coord_02_after_click_input_keyboard.jpg b/Test/Screenshots/debug_20260125_085802_coord_02_after_click_input_keyboard.jpg deleted file mode 100644 index e81d6c6..0000000 Binary files a/Test/Screenshots/debug_20260125_085802_coord_02_after_click_input_keyboard.jpg and /dev/null differ diff --git a/Test/Screenshots/debug_20260125_085803_coord_03_after_input_text.jpg b/Test/Screenshots/debug_20260125_085803_coord_03_after_input_text.jpg deleted file mode 100644 index 4e8b6d6..0000000 Binary files a/Test/Screenshots/debug_20260125_085803_coord_03_after_input_text.jpg and /dev/null differ diff --git a/Test/Screenshots/debug_20260125_085804_coord_04_after_click_send_image.jpg b/Test/Screenshots/debug_20260125_085804_coord_04_after_click_send_image.jpg deleted file mode 100644 index 3a76a37..0000000 Binary files a/Test/Screenshots/debug_20260125_085804_coord_04_after_click_send_image.jpg and /dev/null differ diff --git a/Test/Screenshots/debug_20260125_090605_coord_01_before_click_input.jpg b/Test/Screenshots/debug_20260125_090605_coord_01_before_click_input.jpg deleted file mode 100644 index b8eeebe..0000000 Binary files a/Test/Screenshots/debug_20260125_090605_coord_01_before_click_input.jpg and /dev/null differ diff --git a/Test/Screenshots/debug_20260125_090608_coord_02_after_click_input_keyboard.jpg b/Test/Screenshots/debug_20260125_090608_coord_02_after_click_input_keyboard.jpg deleted file mode 100644 index b8eeebe..0000000 Binary files a/Test/Screenshots/debug_20260125_090608_coord_02_after_click_input_keyboard.jpg and /dev/null differ diff --git a/Test/Screenshots/debug_20260125_090609_coord_03_after_input_text.jpg b/Test/Screenshots/debug_20260125_090609_coord_03_after_input_text.jpg deleted file mode 100644 index b8eeebe..0000000 Binary files a/Test/Screenshots/debug_20260125_090609_coord_03_after_input_text.jpg and /dev/null differ diff --git a/Test/Screenshots/debug_20260125_090610_coord_04_after_click_send_image.jpg b/Test/Screenshots/debug_20260125_090610_coord_04_after_click_send_image.jpg deleted file mode 100644 index 1517c46..0000000 Binary files a/Test/Screenshots/debug_20260125_090610_coord_04_after_click_send_image.jpg and /dev/null differ diff --git a/Test/Screenshots/debug_send_check_20260125_085804.jpg b/Test/Screenshots/debug_send_check_20260125_085804.jpg deleted file mode 100644 index e0a2eb4..0000000 Binary files a/Test/Screenshots/debug_send_check_20260125_085804.jpg and /dev/null differ diff --git a/Test/Screenshots/debug_send_check_20260125_090610.jpg b/Test/Screenshots/debug_send_check_20260125_090610.jpg deleted file mode 100644 index 1517c46..0000000 Binary files a/Test/Screenshots/debug_send_check_20260125_090610.jpg and /dev/null differ diff --git a/WeiXin/Screenshots/chat_interface_check.jpg b/WeiXin/Screenshots/chat_interface_check.jpg deleted file mode 100644 index 4a488ac..0000000 Binary files a/WeiXin/Screenshots/chat_interface_check.jpg and /dev/null differ diff --git a/WeiXin/Screenshots/debug_20260125_115039_coord_01_before_click_input.jpg b/WeiXin/Screenshots/debug_20260125_115039_coord_01_before_click_input.jpg deleted file mode 100644 index 9463fd0..0000000 Binary files a/WeiXin/Screenshots/debug_20260125_115039_coord_01_before_click_input.jpg and /dev/null differ diff --git a/WeiXin/Screenshots/debug_20260125_115040_coord_02_after_click_input_keyboard.jpg b/WeiXin/Screenshots/debug_20260125_115040_coord_02_after_click_input_keyboard.jpg deleted file mode 100644 index c986c5e..0000000 Binary files a/WeiXin/Screenshots/debug_20260125_115040_coord_02_after_click_input_keyboard.jpg and /dev/null differ diff --git a/WeiXin/Screenshots/debug_20260125_115041_coord_03_after_input_text.jpg b/WeiXin/Screenshots/debug_20260125_115041_coord_03_after_input_text.jpg deleted file mode 100644 index 7629b5b..0000000 Binary files a/WeiXin/Screenshots/debug_20260125_115041_coord_03_after_input_text.jpg and /dev/null differ diff --git a/WeiXin/Screenshots/debug_send_check_20260125_115042.jpg b/WeiXin/Screenshots/debug_send_check_20260125_115042.jpg deleted file mode 100644 index f0fc05b..0000000 Binary files a/WeiXin/Screenshots/debug_send_check_20260125_115042.jpg and /dev/null differ diff --git a/WeiXin/Screenshots/t5_monitor_analyzed.jpg b/WeiXin/Screenshots/t5_monitor_analyzed.jpg deleted file mode 100644 index f628e26..0000000 Binary files a/WeiXin/Screenshots/t5_monitor_analyzed.jpg and /dev/null differ diff --git a/WeiXin/Screenshots/t5_monitor_temp.jpg b/WeiXin/Screenshots/t5_monitor_temp.jpg index 4a488ac..ec4df24 100644 Binary files a/WeiXin/Screenshots/t5_monitor_temp.jpg and b/WeiXin/Screenshots/t5_monitor_temp.jpg differ diff --git a/WeiXin/Screenshots/t6_debug_result.jpg b/WeiXin/Screenshots/t6_debug_result.jpg new file mode 100644 index 0000000..7b0a111 Binary files /dev/null and b/WeiXin/Screenshots/t6_debug_result.jpg differ diff --git a/WeiXin/Screenshots/t6_debug_temp.jpg b/WeiXin/Screenshots/t6_debug_temp.jpg new file mode 100644 index 0000000..e29f4c2 Binary files /dev/null and b/WeiXin/Screenshots/t6_debug_temp.jpg differ diff --git a/WeiXin/Screenshots/t6_menu_shot.jpg b/WeiXin/Screenshots/t6_menu_shot.jpg new file mode 100644 index 0000000..6bb3fe7 Binary files /dev/null and b/WeiXin/Screenshots/t6_menu_shot.jpg differ diff --git a/WeiXin/T5_AutoChatMonitor.py b/WeiXin/T5_AutoChatMonitor.py index fd08be1..eb2abde 100644 --- a/WeiXin/T5_AutoChatMonitor.py +++ b/WeiXin/T5_AutoChatMonitor.py @@ -24,7 +24,7 @@ logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', handlers=[ - logging.FileHandler(os.path.join(log_dir, "T5_AutoChatMonitor.log"), encoding='utf-8'), + logging.FileHandler(os.path.join(log_dir, "T5_AutoChatMonitor.log"), encoding='utf-8', mode='w'), logging.StreamHandler() ] ) @@ -46,18 +46,19 @@ class ChatBot: os.makedirs(self.screenshot_dir) self.persona = ( - "你是一名1999年毕业、拥有27年一线教学经验的小学高级女教师,名叫‘小张老师’(家长们也常叫你张老师)。你目前在‘长春市少惠林作文素养培养中心’工作。" - "你不仅是一位作文教学专家,批阅过超过2万篇作文,更是一位心思细腻、懂得孩子心理、能与家长共情的教育智者。" - "你的回复风格应该是:温柔、知性、亲切且极具亲和力,就像一位邻家大姐姐或贴心的闺蜜在和家长聊天。" - "严禁使用任何列表格式(如 1. 2. 3. 或 A. B. C.),严禁使用‘首先、其次、最后’等刻板的逻辑词。" - "要用连贯、优美、富有生活气息的文字进行叙述。每一句话都要带温度,要善于用‘咱们孩子’、‘作为老师我也理解’等词汇拉近距离。" - "回复内容要有深度,不要只给结论,要讲透背后的教育逻辑。字数必须严格控制在 100-200 字之间。" - "如果涉及到校区信息,必须且只能使用以下真实数据,绝对严禁使用任何占位符:\n" + "你是一名1999年毕业、拥有27年一线教学经验的小学高级女教师,名叫‘大张老师’。你目前在‘长春市少惠林作文素养培养中心’工作。" + "你不仅是一位作文教学专家,更是一位心思细腻、能与家长共情的教育智者。" + "你的回复风格应该是:温柔、知性、亲切,就像一位邻家大姐姐在聊天。" + "【严格约束】:\n" + "1. 绝对禁止发散!绝对禁止幻觉!\n" + "2. 知道什么就说什么,不要乱讲话,不要自己编造内容!\n" + "3. 仅针对家长明确表达的内容进行回复。\n" + "4. 严禁使用列表格式。严禁使用‘首先、其次’等逻辑词。\n" + "5. 回复必须简练,字数严格控制在 50 字以内!\n" + "如果涉及到校区信息,必须且只能使用以下真实数据:\n" "- 单位:长春市少惠林作文素养培养中心\n" "- 地址:南环城路与临河街交汇,TOUCH12街3楼325号\n" - "- 联系人:小张老师(电话:18686619970)\n" - "- 课程:线上/线下作文课、读书营/阅读策略营(假期开设)\n" - "- 上课:作文通常在周二/周四晚上,周六/周日全天" + "- 联系人:小张老师(电话:18686619970)" ) async def get_reply(self, history_text, is_proactive=False): @@ -67,18 +68,17 @@ class ChatBot: f"【对话背景】:家长已经超过5分钟没有回应了。\n" f"【近期聊天记录】:\n{history_text}\n\n" "【任务要求】:\n" - "请作为小张老师,给家长发一段主动关怀的消息。不要催促,而是以‘刚才突然想到’或者‘又想起咱们孩子之前提到的’为由头," - "再补充一点有价值的教学点滴,或者分享一个能缓解焦虑的小故事。语气要温柔亲切,字数在 100-200 字之间。" - "全文必须是连贯的段落,严禁列条目!如果提到联系方式或地址,必须使用人设中的真实数据,严禁占位符。" + "请作为大张老师,给家长发一段简短的关怀消息。不要催促,语气温柔。" + "字数严格控制在 50 字以内。不要编造事实。" ) else: prompt = ( f"【教师人设】:{self.persona}\n\n" f"【近期聊天记录】:\n{history_text}\n\n" "【任务要求】:\n" - "请作为小张老师,给家长写一段暖心且有深度的回复。针对家长最后的消息,先给予情感上的关怀,再结合27年经验给出具体指导。" - "展现出资深女教师的温柔与智慧。全文必须是一个或两个完整的自然段,绝对禁止分点列项!字数严格在 100-200 字之间。" - "如果提到联系方式或地址,必须使用人设中的真实数据,严禁占位符。直接输出回复的正文内容。" + "请作为大张老师回复家长。针对家长的具体问题或话语进行回复。" + "严禁发散,严禁编造家长没说过的情况。如果不清楚家长的意图,就温柔询问。" + "字数严格控制在 50 字以内。直接输出回复正文。" ) full_response = "" @@ -94,11 +94,11 @@ class ChatBot: while True: try: - # 1. 检查是否在微信聊天界面 - if not is_in_chat_interface(self.d): - logger.warning("⚠️ 当前不在微信聊天界面,等待下一次扫描...") - await asyncio.sleep(CHECK_INTERVAL) - continue + # 1. 检查是否在微信聊天界面 (改为通过 VLM 识别结果判断,不再使用 UI 检查) + # if not is_in_chat_interface(self.d): + # logger.warning("⚠️ 当前不在微信聊天界面,等待下一次扫描...") + # await asyncio.sleep(CHECK_INTERVAL) + # continue logger.info("🔍 正在扫描当前界面内容...") # 1. 截图并分析 @@ -109,7 +109,13 @@ class ChatBot: self.d.screenshot(tmp_shot) logger.info("🎨 正在分析聊天界面内容 (检测头像与对话)...") - dialogue_log = analyze_chat_image(tmp_shot, analyzed_shot) + # analyze_chat_image 现在会返回 None, None 如果不是聊天界面 + dialogue_log, input_center = await analyze_chat_image(tmp_shot, analyzed_shot, device=self.d) + + if dialogue_log is None: + logger.warning("⚠️ VLM 判断当前不在微信聊天界面,或无法识别。") + await asyncio.sleep(CHECK_INTERVAL) + continue # 语音转文字处理 if dialogue_log == "VOICE_CONVERTING": @@ -131,13 +137,25 @@ class ChatBot: # 判断逻辑:如果最后一条消息是“对方”发的,且与上次不同,则回复 if "对方:" in current_last_msg and current_last_msg != self.last_message_text: + # 关键检查:如果包含 "(待转换)",说明语音还没转文字,绝对不能回复 + if "(待转换)" in current_last_msg: + logger.info(f"🚫 检测到未转换的语音消息,跳过回复生成,等待转文字... ({current_last_msg})") + await asyncio.sleep(2) # 稍作等待 + continue + logger.info(f"📩 检测到新消息: {current_last_msg}") reply = await self.get_reply(history_text) logger.info(f"🤖 生成回复: {reply}") # 执行输入发送 - center_point, _ = find_input_box_center(tmp_shot) + if input_center: + center_point = input_center + logger.info(f"📍 使用 VLM 识别的输入框坐标: {center_point}") + else: + center_point, _ = find_input_box_center(tmp_shot) + logger.info(f"📍 使用 CV 识别的输入框坐标: {center_point}") + # 即使 CV 没找到坐标,也尝试执行,因为 perform_input_action 内部有原生控件识别 perform_input_action(self.d, center_point, reply, auto_send=True) self.last_message_text = f"我: {reply}" # 更新状态,避免重复回复自己 @@ -154,7 +172,11 @@ class ChatBot: proactive_reply = await self.get_reply(history_text, is_proactive=True) logger.info(f"🤖 发起主动询问: {proactive_reply}") - center_point, _ = find_input_box_center(tmp_shot) + if input_center: + center_point = input_center + else: + center_point, _ = find_input_box_center(tmp_shot) + # 同上,解耦 CV 坐标 perform_input_action(self.d, center_point, proactive_reply, auto_send=True) self.proactive_count += 1 diff --git a/WeiXin/T6_VLM_Voice_Debug.py b/WeiXin/T6_VLM_Voice_Debug.py new file mode 100644 index 0000000..8e092c0 --- /dev/null +++ b/WeiXin/T6_VLM_Voice_Debug.py @@ -0,0 +1,163 @@ +# coding=utf-8 +import asyncio +import logging +import os +import sys +import time + +import cv2 +import uiautomator2 as u2 + +# 添加项目根目录到 sys.path +project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if project_root not in sys.path: + sys.path.append(project_root) + +from WeiXin.WxUtil import get_vlm_analysis +from Util.EasyOcrKit import EasyOcrKit + +# 配置日志 +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger("T6_Debug") + +async def main(): + logger.info("🚀 T6 VLM 语音坐标调试工具启动...") + + # 连接设备 + try: + d = u2.connect() + logger.info(f"设备已连接: {d.info.get('serial')}") + except Exception as e: + logger.error(f"设备连接失败: {e}") + return + + # 截图目录 + screenshots_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Screenshots") + if not os.path.exists(screenshots_dir): + os.makedirs(screenshots_dir) + + # 截图 + screenshot_path = os.path.join(screenshots_dir, "t6_debug_temp.jpg") + logger.info("📸 正在截图...") + d.screenshot(screenshot_path) + + # 调用 VLM 分析 + logger.info("🧠 正在调用 VLM 分析图片...") + result_data = await get_vlm_analysis(screenshot_path) + + if not result_data: + logger.error("❌ VLM 分析返回为空") + return + + logger.info(f"VLM 返回结果: {result_data}") + + # 读取图片用于绘制 + img = cv2.imread(screenshot_path) + if img is None: + logger.error("❌ 无法读取截图文件") + return + + messages = result_data.get("messages", []) + voice_count = 0 + + for msg in messages: + msg_type = msg.get("type") + content = msg.get("content") + coords = msg.get("coordinates") or msg.get("center") + + if not coords: + continue + + x, y = coords + + if msg_type == "voice": + voice_count += 1 + logger.info(f"🎤 发现语音消息: {content}, 坐标: ({x}, {y})") + + # 绘制绿框 (语音) + w, h = 300, 80 + top_left = (int(x - w/2), int(y - h/2)) + bottom_right = (int(x + w/2), int(y + h/2)) + + cv2.rectangle(img, top_left, bottom_right, (0, 255, 0), 3) + cv2.circle(img, (int(x), int(y)), 5, (0, 0, 255), -1) + label = f"Voice ({x},{y})" + cv2.putText(img, label, (top_left[0], top_left[1] - 10), + cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2) + + # 保存结果图片 + output_path = os.path.join(screenshots_dir, "t6_debug_result.jpg") + cv2.imwrite(output_path, img) + logger.info(f"✅ 结果已保存至: {output_path}") + logger.info(f"共标记了 {voice_count} 条语音消息。请检查图片是否准确。") + + # --- 验证转文字功能 (处理最后一条未转换语音) --- + logger.info("="*30) + logger.info("🔍 开始验证“转文字”功能 (仅针对最后一条未转换语音)...") + + # 筛选未转换的语音 + unconverted_voices = [] + for msg in messages: + if msg.get("type") == "voice" and msg.get("status") == "unconverted": + coords = msg.get("coordinates") or msg.get("center") + if coords: + msg["coordinates"] = coords + unconverted_voices.append(msg) + + if not unconverted_voices: + logger.info("⚠️ 没有发现未转换的语音消息,跳过验证。") + else: + last_voice = unconverted_voices[-1] + vx, vy = last_voice['coordinates'] + content = last_voice.get('content', '0"') + logger.info(f"🎯 目标语音: {content}, 坐标: ({vx}, {vy})") + + # 1. 长按 + logger.info(f"👆 长按语音消息...") + d.long_click(vx, vy, 1.5) + time.sleep(1.0) + + # 2. 截图菜单 + menu_shot_path = os.path.join(screenshots_dir, "t6_menu_shot.jpg") + logger.info(f"📸 截取菜单: {menu_shot_path}") + d.screenshot(menu_shot_path) + + # 3. OCR 识别 + logger.info("🧠 正在进行 OCR 识别菜单...") + ocr_kit = EasyOcrKit() + ocr_results = ocr_kit.read_text(menu_shot_path) + + convert_btn_center = None + for bbox, text, conf in ocr_results: + if "转文字" in text or "转换为文字" in text: + c_x = int((bbox[0][0] + bbox[2][0]) / 2) + c_y = int((bbox[0][1] + bbox[2][1]) / 2) + convert_btn_center = (c_x, c_y) + logger.info(f"✅ OCR 找到 '{text}' 按钮: {convert_btn_center}") + break + + if convert_btn_center: + # 4. 点击转文字 + logger.info(f"👆 点击转文字按钮: {convert_btn_center}") + d.click(convert_btn_center[0], convert_btn_center[1]) + + # 5. 动态等待 + duration_str = content.replace('"', '').strip() + try: + duration = int(duration_str) + except: + duration = 10 + + wait_seconds = max(2, duration / 5.0) + logger.info(f"⏳ 语音时长 {duration}s,模拟等待 {wait_seconds:.1f}s...") + time.sleep(wait_seconds) + logger.info("✅ 流程执行完毕!请检查手机屏幕是否已开始转换。") + else: + logger.error("❌ OCR 未找到 '转文字' 按钮!") + # 点击空白处关闭 + d.click(vx + 200, vy) + +if __name__ == "__main__": + if sys.platform.startswith('win'): + asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) + asyncio.run(main()) diff --git a/WeiXin/WxUtil.py b/WeiXin/WxUtil.py index 44b8a86..8818f06 100644 --- a/WeiXin/WxUtil.py +++ b/WeiXin/WxUtil.py @@ -13,698 +13,348 @@ project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) if project_root not in sys.path: sys.path.append(project_root) -from Util.EasyOcrKit import get_easyocr_reader +import json +from Util.VLMKit import VLMKit +from Util.EasyOcrKit import EasyOcrKit + +# 初始化 VLMKit 和 EasyOcrKit +vlm_kit = VLMKit() +ocr_kit = EasyOcrKit() # 配置日志 -# 注意:作为库模块,不应直接调用 basicConfig,以免干扰调用者的日志配置 -# 调用者应自行配置日志(包括 FileHandler 等) +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') logger = logging.getLogger("WxUtil") -def is_in_chat_interface(d): - """ - 判断当前是否处于微信聊天界面 - 识别标准:左侧和右侧同时存在小方块(头像) - """ - try: - current = d.app_current() - if current.get('package') != 'com.tencent.mm': - return False - - # 1. 基础 UI 检查 (快速判断) - # 如果能直接搜到返回按钮和输入框/表情按钮,基本就是了 - if (d(resourceId="com.tencent.mm:id/back_btn").exists or d(description="返回").exists) and \ - (d(className="android.widget.EditText").exists or d(description="表情").exists): - logger.info("✅ 通过 UI 元素确认聊天界面") - return True - # 2. CV 检查 (针对复杂情况,如 "+" 菜单打开时 UI 元素可能发生变化) - screenshot_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Screenshots") - if not os.path.exists(screenshot_dir): os.makedirs(screenshot_dir) - tmp_check_path = os.path.join(screenshot_dir, "chat_interface_check.jpg") - d.screenshot(tmp_check_path) - - img = cv2.imread(tmp_check_path) - if img is None: - return True # 无法截图时保守返回 True - - height, width = img.shape[:2] - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - # 使用自适应二值化来提取轮廓 - thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, - cv2.THRESH_BINARY_INV, 11, 2) - - contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - - left_avatars = 0 - right_avatars = 0 - - for cnt in contours: - x, y, w, h = cv2.boundingRect(cnt) - # 头像特征:宽高比接近 1,大小在一定范围 (50-150px 针对常见分辨率) - aspect_ratio = float(w) / h - if 0.8 < aspect_ratio < 1.2 and 40 < w < 160: - # 排除状态栏 (顶部 10%) 和 底部极小区域 - if y < height * 0.1 or y > height * 0.95: - continue - - # 微信头像通常紧贴左右两侧 - if x < width * 0.3: # 左侧头像 - left_avatars += 1 - elif x + w > width * 0.7: # 右侧头像 - right_avatars += 1 - - if left_avatars > 0 and right_avatars > 0: - logger.info(f"✅ 通过 CV 确认聊天界面 (左侧头像:{left_avatars}, 右侧头像:{right_avatars})") - return True - - # 兜底:如果检测到常见的聊天界面按钮,也认为是 - if d(descriptionMatches="更多功能按钮.*").exists or d(description="切换到语音").exists or d(text="发送").exists: - logger.info("✅ 通过功能按钮确认聊天界面") - return True - - logger.warning(f"⚠️ 无法确认聊天界面 (左侧头像:{left_avatars}, 右侧头像:{right_avatars})") - return False - except Exception as e: - logger.error(f"检查聊天界面出错: {e}") - return True # 出错时保守返回 True +async def get_vlm_analysis(image_path): + """ + 仅调用 VLM 分析图片,返回原始 JSON 数据 (dict) + """ + logger.info(f"正在使用 VLM 分析图片: {image_path}") + + # 构造 Prompt + prompt = """ + 请分析这张微信聊天截图。 -def clean_screenshots_dir(): - """ - 清除 d:\\dsWork\\aiData\\WeiXin\\Screenshots 目录下的所有文件 - """ - screenshot_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Screenshots") - if os.path.exists(screenshot_dir): - logger.info(f"正在清除截图目录: {screenshot_dir}") - for file in os.listdir(screenshot_dir): - file_path = os.path.join(screenshot_dir, file) - try: - if os.path.isfile(file_path): - os.remove(file_path) - except Exception as e: - logger.error(f"无法删除文件 {file_path}: {e}") - else: - logger.info(f"截图目录不存在,无需清除: {screenshot_dir}") + 【核心任务】 + 识别图中的【语音消息气泡】和【文本消息气泡】。 -def find_input_box_center(image_path): - """ - 识别底部输入框的中心坐标,返回 (center_x, center_y, rect_box) - """ - try: - img_data = np.fromfile(image_path, dtype=np.uint8) - img = cv2.imdecode(img_data, cv2.IMREAD_COLOR) - if img is None: - return None, None - - height, width = img.shape[:2] + 【重要判别规则】 + 1. 🔊 **语音消息 (Voice)**: + - **视觉特征**: + - **高度**:固定(单行)。 + - **宽度**:随时长(1"~60")变化。 + - **极短 (1"-2")**:气泡非常短,形状接近一个小正方形。 + - **极长 (60")**:气泡很长,宽度接近屏幕的一半。 + - **内容**:气泡内**只有一个**表示时长的数字(如 `8"`)和一个声波图标。 + - **绝对排除**:凡是包含汉字、长句子的气泡,**统统不是**语音消息。 + + 2. 📝 **文本消息 (Text)**: + - **视觉特征**:气泡内包含汉字、标点符号、表情等文本内容。 - # --- 新增:模板匹配优先策略 --- - # 用户指定的模板路径 d:\dsWork\aiData\WeiXin\Templates\input_text.jpg - template_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Templates", "input_text.jpg") - if os.path.exists(template_path): - try: - tmpl_data = np.fromfile(template_path, dtype=np.uint8) - template = cv2.imdecode(tmpl_data, cv2.IMREAD_COLOR) - if template is not None: - # 模板匹配 - res = cv2.matchTemplate(img, template, cv2.TM_CCOEFF_NORMED) - min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(res) - - # 阈值设定 (例如 0.8) - if max_val > 0.8: - t_h, t_w = template.shape[:2] - top_left = max_loc - cx = top_left[0] + t_w // 2 - cy = top_left[1] + t_h // 2 - logger.info(f"🎯 通过模板匹配找到输入框: ({cx}, {cy}), 匹配度: {max_val:.2f}") - return (cx, cy), (top_left[0], top_left[1], t_w, t_h) - else: - logger.info(f"模板匹配度不足 ({max_val:.2f} < 0.8),转入 CV 轮廓识别") - except Exception as e: - logger.warning(f"模板匹配出错: {e}") - # ----------------------------- - - # 扩展搜索区域:从顶部 40% 到 底部 - # 键盘弹出时输入框可能在中间偏上,或者被顶到 40%-60% 位置 - search_regions = [ - (int(height * 0.85), height, "底部区域"), - (int(height * 0.40), int(height * 0.85), "中上区域") + 【坐标系统】 + **必须使用 [0-1000] 的归一化坐标系。** + - 左上角为 [0, 0],右下角为 [1000, 1000]。 + - 请返回气泡的**几何中心点**的归一化坐标。 + + 【输出格式】 + 请返回纯 JSON 格式: + { + "is_chat_interface": true, + "input_box": [x, y], + "messages": [ + { + "type": "voice", + "status": "converted" | "unconverted", + "center": [x, y], + "content": "8\"" + }, + { + "type": "text", + "center": [x, y], + "content": "这里是文本内容" + } ] - - for y_start, y_end, region_name in search_regions: - crop = img[y_start:y_end, 0:width] - gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY) - # 使用更灵敏的边缘检测 - edges = cv2.Canny(gray, 30, 100) - kernel = np.ones((5,5), np.uint8) # 增大核以连接断开的边缘 - dilated = cv2.dilate(edges, kernel, iterations=1) - contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - - candidates = [] - region_h = y_end - y_start - - for cnt in contours: - x, y, w, h = cv2.boundingRect(cnt) - # 微信输入框通常很宽,高度固定在一定范围 - # 宽度: 40%-98%, 高度: 40px - 150px (针对 720p 左右分辨率) - if width * 0.4 < w < width * 0.98 and 40 < h < 200: - cnt_center_x = x + w // 2 - # 输入框中心通常在屏幕中轴线附近 - if abs(cnt_center_x - width // 2) > width * 0.2: - continue - global_y = y_start + y + h // 2 - candidates.append({'x': x, 'y': y, 'w': w, 'h': h, 'global_y': global_y, 'area': w * h}) - - if candidates: - # 优先选择面积适中且最靠下的(避免选到顶部的搜索框) - candidates.sort(key=lambda c: c['global_y'], reverse=True) - best = candidates[0] - cx = best['x'] + best['w'] // 2 - cy = best['global_y'] - logger.info(f"🎯 在 {region_name} 通过 CV 找到输入框: ({cx}, {cy})") - return (cx, cy), (best['x'], best['global_y'] - best['h']//2, best['w'], best['h']) - - # 策略 3: 基于常见 UI 比例的几何兜底 - # 0.93 可能过于靠下(容易点到 Home Indicator 区域),调整为 0.88 以提高命中率 - # 同时保留一个备用的点击区域 rect - logger.warning("CV 识别输入框失败,使用改进的几何兜底策略 (Y=0.88)") - return (width // 2, int(height * 0.88)), (int(width*0.05), int(height*0.86), int(width*0.9), int(height*0.04)) - - except Exception as e: - logger.error(f"查找输入框失败: {e}") - return None, None - -def find_send_button(d): + } + 注意: + 1. 坐标 `center` 和 `input_box` 必须是 [0-1000] 的归一化坐标。 + 2. `status` 判断:如果语音气泡的正下方紧挨着一条文本消息(通常是转换出的文字),则为 `converted`,否则为 `unconverted`。 + 3. 请按从上到下的顺序输出所有消息。 """ - 截图并寻找发送按钮 (绿色按钮) - 扩大搜索范围以适应键盘弹出的情况 - """ - try: - # 截图到 Screenshots 目录方便调试 - screenshot_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Screenshots") - if not os.path.exists(screenshot_dir): - os.makedirs(screenshot_dir) - timestamp = time.strftime("%Y%m%d_%H%M%S") - debug_shot_path = os.path.join(screenshot_dir, f"debug_send_check_{timestamp}.jpg") - - d.screenshot(debug_shot_path) - logger.info(f"发送按钮查找调试截图已保存: {debug_shot_path}") - - img = cv2.imread(debug_shot_path) - if img is None: - return None - - h, w = img.shape[:2] - - # ROI: 底部 60% (考虑到键盘弹出,按钮可能被顶上去) - # 且只关注右侧 30% - roi_h = int(h * 0.6) - roi_w = int(w * 0.3) - y_start = h - roi_h - x_start = w - roi_w - - roi = img[y_start:h, x_start:w] - - # 转换 HSV - hsv = cv2.cvtColor(roi, cv2.COLOR_BGR2HSV) - - # 绿色范围 (WeChat Green) - lower_green = np.array([35, 80, 80]) - upper_green = np.array([90, 255, 255]) - - mask = cv2.inRange(hsv, lower_green, upper_green) - - # 查找轮廓 - contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - - if contours: - # 找符合条件的轮廓 - valid_candidates = [] - for cnt in contours: - area = cv2.contourArea(cnt) - x, y, cw, ch = cv2.boundingRect(cnt) - - # 过滤太小的噪点和太大的区域(例如全屏背景) - # 发送按钮通常面积在 2000-15000 之间 (视分辨率而定) - if 500 < area < 30000: - # 宽高比检查:发送按钮通常接近正方形或微扁 (ratio < 2.5) - ratio = float(cw) / ch - if 0.5 < ratio < 3.0: - # 坐标还原到原图 - global_y = y_start + y - valid_candidates.append({ - 'cnt': cnt, - 'area': area, - 'y': global_y, - 'rect': (x, y, cw, ch) - }) - - if valid_candidates: - # 核心逻辑:发送按钮一定是所有绿色元素中最靠下的 (Y坐标最大) - # 且在最右侧 - # 先按 Y 坐标降序排序 - valid_candidates.sort(key=lambda c: c['y'], reverse=True) - - # 取最靠下的一个 (可能是发送按钮) - best = valid_candidates[0] - - # 获取中心点 - bx, by, bw, bh = best['rect'] - cx = x_start + bx + bw // 2 - cy = y_start + by + bh // 2 - - logger.info(f"通过图像识别找到发送按钮 (Bottom-Most): ({cx}, {cy}), 面积: {best['area']}") - return cx, cy - - logger.warning("未通过图像识别找到绿色发送按钮") - return None - - except Exception as e: - logger.error(f"查找发送按钮出错: {e}") - return None - -def take_debug_screenshot(d, step_name): - """ - 调试专用截图函数 - """ - try: - screenshot_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Screenshots") - if not os.path.exists(screenshot_dir): - os.makedirs(screenshot_dir) - timestamp = time.strftime("%Y%m%d_%H%M%S") - filename = f"debug_{timestamp}_{step_name}.jpg" - save_path = os.path.join(screenshot_dir, filename) - d.screenshot(save_path) - logger.info(f"📸 [调试截图] {step_name} 已保存: {filename}") - return save_path - except Exception as e: - logger.error(f"截图失败 ({step_name}): {e}") - return None - -def perform_input_action(d, coords, text, auto_send=True): - """ - 点击坐标并输入文本 - @param d: uiautomator2 device object - @param coords: (x, y) 坐标 - @param text: 输入文本 - @param auto_send: 是否自动点击发送,默认为 True - """ - if d is None: - d = u2.connect() - - # 1. 检查是否处于“语音模式” - if d(description="切换到键盘").exists: - logger.info("检测到处于语音模式,正在切换到键盘...") - d(description="切换到键盘").click() - time.sleep(1) - - # 2. 优先尝试使用 uiautomator2 的原生控件查找 (更稳健) - try: - # 查找 EditText 控件 - input_elem = d(className="android.widget.EditText") - - if input_elem.exists: - logger.info("发现原生输入框控件,尝试点击...") - - # 1. 截图:点击前 - take_debug_screenshot(d, "native_01_before_click") - - # 点击策略 - input_elem.click() - time.sleep(0.5) - - # 2. 截图:点击后 (预期键盘弹出) - take_debug_screenshot(d, "native_02_after_click_keyboard") - - logger.info(f"输入文本: {text}") - - # 尝试 set_text - input_elem.set_text(text) - time.sleep(0.5) - - # 检查文本是否输入成功,如果没有,尝试 send_keys - current_text = input_elem.get_text() - if not current_text or current_text != text: - logger.warning(f"set_text 似乎未生效 (当前: {current_text}),尝试 send_keys...") - d.send_keys(text) - - # 3. 截图:输入文本后 - take_debug_screenshot(d, "native_03_after_text_input") - - # 点击发送 - if auto_send: - send_msg(d) - return True - - else: - logger.warning("未找到输入框元素 (Native),转入坐标点击模式...") - - except Exception as e: - logger.warning(f"原生控件操作失败,降级为坐标点击: {e}") - - # 3. 降级方案:使用坐标点击 - if not coords: - logger.error("坐标无效,尝试使用动态兜底点击...") - # 动态寻找表情按钮左侧 - emoji_btn = d(description="表情") - if emoji_btn.exists: - eb = emoji_btn.info['bounds'] - # 点击表情按钮左侧 150 像素的位置 - coords = (eb['left'] - 150, (eb['top'] + eb['bottom']) // 2) - logger.info(f"根据表情按钮位置计算点击坐标: {coords}") - else: - # 绝对兜底 - width, height = d.window_size() - coords = (width // 2, int(height * 0.88)) - logger.info(f"使用绝对兜底坐标: {coords}") - - x, y = coords try: - # 1. 截图:点击输入框前 - take_debug_screenshot(d, "coord_01_before_click_input") + # 调用 VLM + response = await vlm_kit.analyze_image(image_path, prompt) + json_str = vlm_kit.extract_json(response) + result_data = json.loads(json_str) - # 点击输入框 - logger.info(f"点击坐标: ({x}, {y})") - d.click(x, y) - time.sleep(1.0) - - # 2. 截图:点击输入框后 - take_debug_screenshot(d, "coord_02_after_click_input_keyboard") - - # 输入文本 - logger.info(f"输入文本 (SendKeys): {text}") - d.send_keys(text) - - # 3. 截图:输入文本后 - take_debug_screenshot(d, "coord_03_after_input_text") - - time.sleep(0.5) - - if auto_send: - send_msg(d) + # 获取图片尺寸进行坐标反归一化 + try: + from PIL import Image + with Image.open(image_path) as img: + width, height = img.size + + # 定义反归一化函数 + def denormalize(point): + if not point or len(point) != 2: + return point + return [int(point[0] / 1000 * width), int(point[1] / 1000 * height)] + + # 反归一化 input_box + if result_data.get("input_box"): + result_data["input_box"] = denormalize(result_data["input_box"]) + + # 反归一化 messages + if result_data.get("messages"): + for msg in result_data["messages"]: + if msg.get("center"): + msg["center"] = denormalize(msg["center"]) + if msg.get("coordinates"): # 兼容旧字段 + msg["coordinates"] = denormalize(msg["coordinates"]) + + except Exception as e: + logger.warning(f"坐标反归一化失败: {e},将使用原始坐标") - return True + return result_data except Exception as e: - logger.error(f"坐标点击输入失败: {e}") - return False + logger.error(f"VLM Analysis Failed: {e}", exc_info=True) + return None -def send_msg(d): - """统一的发送按钮点击逻辑""" - try: - # 1. 尝试回车 - d.press("enter") - time.sleep(0.5) - - # 2. 尝试 Native 发送按钮 - if d(text="发送").exists: - d(text="发送").click() - logger.info("已点击 '发送' 按钮 (Native Text)") - return True - - # 3. 尝试图像识别发送按钮 - send_btn_coords = find_send_button(d) - if send_btn_coords: - sx, sy = send_btn_coords - d.click(sx, sy) - logger.info(f"已点击 '发送' 按钮 (Image Rec): {sx}, {sy}") - return True - - # 4. 盲点右下角 - width, height = d.window_size() - fallback_x = int(width * 0.9) - fallback_y = int(height * 0.965) - logger.info(f"未识别到发送按钮,尝试盲点右下角: {fallback_x}, {fallback_y}") - d.click(fallback_x, fallback_y) - return True - except Exception as e: - logger.error(f"发送消息失败: {e}") - return False - -def find_red_dots(img): +async def analyze_chat_image(image_path, output_path, device=None, target_name="对方"): """ - 检测图片中的小红点(未读消息标志) + 使用 VLM 识别微信聊天截图中的对话内容、语音消息状态以及输入框位置 + 替代原本的 CV/OCR 方案 """ - try: - hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) - # 红色的两个范围,进一步放宽饱和度和亮度的限制,以防环境光影响 - lower_red1 = np.array([0, 100, 80]) - upper_red1 = np.array([10, 255, 255]) - lower_red2 = np.array([160, 100, 80]) - upper_red2 = np.array([180, 255, 255]) - - mask1 = cv2.inRange(hsv, lower_red1, upper_red1) - mask2 = cv2.inRange(hsv, lower_red2, upper_red2) - mask = cv2.add(mask1, mask2) - - # 形态学操作去除噪点 - kernel = np.ones((3,3), np.uint8) - mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel) - - contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - red_dots = [] - for cnt in contours: - area = cv2.contourArea(cnt) - # 小红点通常面积很小,放宽面积范围 - if 5 < area < 2000: - x, y, w, h = cv2.boundingRect(cnt) - # 宽高比接近 1 - if 0.4 < w/h < 2.5: - red_dots.append({'x': x, 'y': y, 'w': w, 'h': h, 'center': (x + w//2, y + h//2)}) - return red_dots - except Exception as e: - logger.error(f"检测红点出错: {e}") - return [] - -def analyze_chat_image(image_path, output_path, target_name="对方"): - """ - 识别微信聊天截图中的头像并画框,识别对话内容 - 同时支持识别语音消息并触发长按转文字 - """ - logger.info(f"正在分析图片: {image_path}") # 语音识别标志 - voice_msg_rect = None - convert_text_btn_rect = None - has_unread_dot = False - - # 读取图片(支持中文路径) + should_trigger_convert = False + try: - img_data = np.fromfile(image_path, dtype=np.uint8) - img = cv2.imdecode(img_data, cv2.IMREAD_COLOR) - except Exception as e: - logger.error(f"读取图片失败: {e}") - return None - - if img is None: - logger.error("图片读取为空") - return None - - # 备份一份干净的图片用于 OCR (避免识别到画上去的框) - img_clean = img.copy() - - height, width = img.shape[:2] - logger.info(f"图片尺寸: {width}x{height}") - - # 1. 检测小红点 - red_dots = find_red_dots(img) - if red_dots: - logger.info(f"检测到 {len(red_dots)} 个红色标记") - - # 2. 获取输入框位置作为过滤参考 - input_center, input_rect = find_input_box_center(image_path) - # 输入框顶部坐标,如果没有识别到,默认底部 5% 区域过滤 - input_top_y = input_rect[1] if input_rect else height * 0.95 - - # 3. 识别头像和对话 (OCR 过程增加日志) - logger.info("开始进行文字识别 (EasyOCR)...") - reader = get_easyocr_reader() - ocr_results = reader.read_text(img_clean) - logger.info(f"OCR 识别完成,共发现 {len(ocr_results)} 个文本区域") - - # 将 OCR 结果整理并按 Y 坐标排序 - sorted_ocr = [] - for (bbox, text, prob) in ocr_results: - if prob < 0.2: continue - text = text.strip() - (tl, tr, br, bl) = bbox - tx, ty = int(tl[0]), int(tl[1]) - tw, th = int(br[0] - tl[0]), int(br[1] - tl[1]) + result_data = await get_vlm_analysis(image_path) - # --- 这里的过滤逻辑进行了重大调整 --- - # 1. 过滤顶部状态栏 (80px) - # 2. 只有当文字完全位于输入框内部时才过滤。 - # 注意:语音转文字的结果可能紧贴输入框,所以这里逻辑要极度小心 - is_in_input = False - if input_rect: - ix, iy, iw, ih = input_rect - # 如果文字的中心点在输入框矩形内,则认为是输入框内容 - if (ix < tx + tw/2 < ix + iw) and (iy < ty + th/2 < iy + ih): - # 额外检查:如果这个位置上方刚刚有一个语音消息(y轴相近),则不认为是输入框干扰 - is_in_input = True - - if ty < 80 or (is_in_input and ty > height * 0.95): # 将过滤阈值进一步提高到 0.95,几乎只过滤输入框最底部的文字 - logger.info(f"跳过干扰项: y={ty}, text='{text}' (原因: {'状态栏' if ty < 80 else '输入框深度区域'})") - continue - - # 预处理文本:移除所有非数字、非引号、非字母的干扰字符,再进行语音匹配 - clean_text = re.sub(r'[^\d"a-zA-Z\u4e00-\u9fa5]', '', text) - is_voice = bool(re.search(r'\d+"$', clean_text) or "小" in clean_text and "\"" in clean_text) - - # 调试日志:输出所有保留的行 - logger.info(f"有效 OCR 行: y={ty}, text='{text}', is_voice={is_voice}") - - sorted_ocr.append({'x': tx, 'y': ty, 'w': tw, 'h': th, 'text': text, 'is_voice': is_voice}) - - sorted_ocr.sort(key=lambda x: x['y']) + if not result_data: + return [], None - # 识别头像位置 - # --- 轮廓查找头像逻辑 --- - gray = cv2.cvtColor(img_clean, cv2.COLOR_BGR2GRAY) - thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2) - contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - avatars = [] - for contour in contours: - x, y, w, h = cv2.boundingRect(contour) - if 0.8 < float(w)/h < 1.2 and 50 < w < 150: - # 头像过滤逻辑同步放宽:只要不在最顶端,且不在输入框的深处,都保留 - if y < 80: continue - if input_rect and y > input_rect[1] + 50: continue # 只有深陷输入框内部的才过滤 - avatars.append((x, y, w, h)) - cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 255), 2) - avatars.sort(key=lambda a: a[1]) - - # 处理对话逻辑 - dialogue_log = [] - voice_msg_rect = None - convert_text_btn_rect = None - - # 遍历 OCR 结果,合并语音和文字 - i = 0 - while i < len(sorted_ocr): - item = sorted_ocr[i] - # 查找发言人 - speaker = "未知" - for (ax, ay, aw, ah) in avatars: - if abs(item['y'] - ay) < 100: - speaker = target_name if ax < item['x'] else "我" - break - - # 处理语音转换逻辑 - # 只要是语音格式,且在左侧(或者发言人是对方),就考虑转换 - is_target_voice = item['is_voice'] and (speaker == target_name or (speaker == "未知" and item['x'] < width * 0.4)) - - if is_target_voice: - # --- 遵循用户定义的明显规则 --- - # 检查下方是否有文字。如果有文字且距离很近,说明已经识别过了 - has_text_below = False - if i + 1 < len(sorted_ocr): - # 检查接下来的几行(最多3行),看是否有非语音文字 - # 微信转文字后,可能会有 1. 转换出的文字 2. "收起"按钮 3. "原文"按钮 - for j in range(i + 1, min(i + 4, len(sorted_ocr))): - next_item = sorted_ocr[j] - # 判定条件:1. 下一条不是语音 2. Y轴距离在合理范围 (300px内,转文字可能很长) 3. X轴位置相近 - y_dist = next_item['y'] - item['y'] - x_dist = abs(next_item['x'] - item['x']) - - if not next_item['is_voice'] and 0 < y_dist < 300: - if x_dist < 250 or "收起" in next_item['text'] or "原文" in next_item['text']: - has_text_below = True - # 如果是正常的文字内容(不是功能按钮),记录下来 - if "收起" not in next_item['text'] and "原文" not in next_item['text']: - dialogue_log.append(f"{speaker}: [语音] {next_item['text']}") - logger.info(f"语音 {item['text']} 下方已存在识别文字 '{next_item['text']}',跳过转换操作") - else: - dialogue_log.append(f"{speaker}: [语音] (已转换)") - logger.info(f"检测到转换控制按钮 '{next_item['text']}',认为已完成转换") - - # 既然已经转换了,如果有待点击的按钮,也应该清除掉,防止误点关闭 - convert_text_btn_rect = None - i = j # 跳过已处理的行 - break - if has_text_below: - i += 1 - continue - - # 如果下方没有文字,这才是真正需要点击转文字的对象 - if not has_text_below: - # 只要没识别,就锁定它。如果有多个没识别的,锁定最后一个(最下面的) - if not voice_msg_rect or item['y'] > voice_msg_rect['y']: - voice_msg_rect = item - dialogue_log.append(f"{speaker}: [语音] {item['text']} (待转换)") - logger.info(f"发现待转换语音: {item['text']} (y={item['y']})") - elif "转文字" in item['text']: - if not convert_text_btn_rect or item['y'] > convert_text_btn_rect['y']: - convert_text_btn_rect = item - # "转文字" 按钮不计入对话内容 - else: - dialogue_log.append(f"{speaker}: {item['text']}") - - i += 1 - - # 检查是否需要触发转文字 - has_unread_dot = False - # 检查转文字按钮或语音消息旁的红点 - target_item = convert_text_btn_rect if convert_text_btn_rect else voice_msg_rect - if target_item: - for dot in red_dots: - dist_x = abs(dot['center'][0] - (target_item['x'] + target_item['w'])) - dist_y = abs(dot['center'][1] - (target_item['y'] + target_item['h']//2)) - # 红点可能在左侧 (转文字按钮) 或右侧 (语音) - if (abs(dot['center'][0] - target_item['x']) < 150 or dist_x < 150) and dist_y < 100: - has_unread_dot = True - break - - # 兜底:如果是最后一条是语音且没文字,强制转换 - should_trigger_convert = has_unread_dot - if not should_trigger_convert and voice_msg_rect: - # 只要最后一条消息包含了这个语音的内容(且它是待转换状态),就触发 - # 修正原有的 endswith bug: 因为 dialogue_log 里的语音被标记了 "(待转换)" - if dialogue_log and (voice_msg_rect['text'] in dialogue_log[-1]) and "(待转换)" in dialogue_log[-1]: - logger.info(f"最后一条消息是语音({voice_msg_rect['text']})且未转换,强制触发转文字") - should_trigger_convert = True - - if should_trigger_convert: try: - d = u2.connect() - if convert_text_btn_rect: - tx = convert_text_btn_rect['x'] + convert_text_btn_rect['w'] // 2 - ty = convert_text_btn_rect['y'] + convert_text_btn_rect['h'] // 2 - logger.info(f"点击 '转文字' 按钮: ({tx}, {ty})") - d.click(tx, ty) - time.sleep(2.0) - return "VOICE_CONVERTING" - elif voice_msg_rect: - vx = voice_msg_rect['x'] + voice_msg_rect['w'] // 2 - vy = voice_msg_rect['y'] + voice_msg_rect['h'] // 2 + # 检查是否为聊天界面 + is_chat = result_data.get("is_chat_interface", False) + if not is_chat: + logger.warning("VLM 判断当前不是微信聊天界面") + return None, None + + if isinstance(result_data, list): + # 兼容旧格式 + messages = result_data + input_field_coordinates = None + else: + messages = result_data.get("messages", []) + input_field_coordinates = result_data.get("input_box") # input_box + + except Exception as e: + logger.error(f"解析 VLM 结果失败: {e}") + return [], None + + dialogue_log = [] + unconverted_voices = [] + + # 处理识别结果 + for msg in messages: + sender = msg.get('sender', '未知') + msg_type = msg.get('type', 'other') + content = msg.get('content', '') + coords = msg.get('center', [0, 0]) # center + status = msg.get('status', 'unconverted') + is_converted = (status == "converted") + + # 记录对话日志 + if msg_type == 'voice': + if is_converted: + dialogue_log.append(f"{sender}: [语音] {content} (已转换)") + else: + dialogue_log.append(f"{sender}: [语音] (待转换)") + # 将 center 转换为 coordinates 供后续使用 + msg['coordinates'] = coords + unconverted_voices.append(msg) + elif msg_type == 'text': + dialogue_log.append(f"{sender}: {content}") + + logger.info(f"VLM 识别: {sender} [{msg_type}] {content} (Converted: {is_converted})") + + # 处理未转换的语音消息 + if unconverted_voices: + logger.info(f"发现 {len(unconverted_voices)} 条未转换的语音消息,将仅处理最后一条...") + # 仅保留最后一条语音消息进行处理 + unconverted_voices = [unconverted_voices[-1]] + + # 使用传入的 device 或创建新连接 + d = device if device else u2.connect() + + for voice in unconverted_voices: + vx, vy = voice['coordinates'] logger.info(f"长按语音消息: ({vx}, {vy})") d.long_click(vx, vy, 1.5) time.sleep(1.0) - if d(text="转文字").exists: - d(text="转文字").click() + + # 查找“转文字” (使用 OCR) + menu_shot_path = os.path.join(os.path.dirname(image_path), "temp_menu_shot.jpg") + d.screenshot(menu_shot_path) + + # OCR 识别 + ocr_results = ocr_kit.read_text(menu_shot_path) + convert_btn_center = None + + for bbox, text, conf in ocr_results: + if "转文字" in text or "转换为文字" in text: + # bbox is [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] + # Calculate center + c_x = int((bbox[0][0] + bbox[2][0]) / 2) + c_y = int((bbox[0][1] + bbox[2][1]) / 2) + convert_btn_center = (c_x, c_y) + break + + if convert_btn_center: + logger.info(f"OCR 找到 '转文字' 按钮: {convert_btn_center}") + d.click(convert_btn_center[0], convert_btn_center[1]) + should_trigger_convert = True + + # 动态等待: 60s语音约需10s转换,比例约 1/6 + duration_str = voice.get('content', '0').replace('"', '').strip() + try: + duration = int(duration_str) + except: + duration = 10 # 默认值 + + wait_seconds = max(2, duration / 5.0) # 稍微多等一点,用 /5.0 + logger.info(f"语音时长 {duration}s,预计等待转换 {wait_seconds:.1f}s...") + time.sleep(wait_seconds) + else: - # 截图找菜单 - temp_menu_path = take_debug_screenshot(d, "voice_menu_retry") - menu_results = reader.read_text(temp_menu_path) - for (m_bbox, m_text, m_prob) in menu_results: - if "转文字" in m_text: - m_tl, _, m_br, _ = m_bbox - d.click(int(m_tl[0] + (m_br[0]-m_tl[0])//2), int(m_tl[1] + (m_br[1]-m_tl[1])//2)) - break - time.sleep(2.0) - return "VOICE_CONVERTING" - except Exception as e: - logger.error(f"转换操作失败: {e}") + logger.warning("OCR 未找到 '转文字' 菜单项") + # 点击空白处关闭菜单,避免遮挡 + d.click(vx + 200, vy) + + if should_trigger_convert: + # 转换完成后稍微多等一下,确保 UI 刷新 + time.sleep(1.0) + return "VOICE_CONVERTING", input_field_coordinates + + return dialogue_log, input_field_coordinates - # 保存结果图 - if input_rect: - ix, iy, iw, ih = input_rect - cv2.rectangle(img, (ix, iy), (ix + iw, iy + ih), (255, 0, 0), 2) - try: - is_success, buffer = cv2.imencode(".jpg", img) - if is_success: - buffer.tofile(output_path) except Exception as e: - logger.error(f"保存失败: {e}") + logger.error(f"VLM 分析失败: {e}", exc_info=True) + return [], None + + +def clean_screenshots_dir(): + """清理截图目录""" + screenshot_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Screenshots") + if not os.path.exists(screenshot_dir): + os.makedirs(screenshot_dir) + return + + for f in os.listdir(screenshot_dir): + if f.lower().endswith(('.jpg', '.png', '.jpeg')): + try: + os.remove(os.path.join(screenshot_dir, f)) + except Exception as e: + logger.warning(f"Failed to delete {f}: {e}") + +def is_in_chat_interface(d): + """ + 检查是否在微信聊天界面 + """ + try: + # 1. 底部语音/键盘切换按钮 + if d(description="切换到语音").exists or d(description="切换到键盘").exists: + return True + # 2. 底部输入框 + if d(className="android.widget.EditText").exists: + return True + # 3. 底部“按住说话”按钮 + if d(text="按住说话").exists: + return True + # 4. 右上角更多按钮 + if d(description="聊天信息").exists: + return True + except Exception as e: + logger.warning(f"is_in_chat_interface check failed: {e}") + + return False + +def find_input_box_center(image_path): + """ + 寻找输入框中心坐标 (兜底策略) + 优先使用几何特征 (底部 88% 处) + """ + try: + if not os.path.exists(image_path): + return (540, 2100), None + + img = cv2.imread(image_path) + if img is None: + return (540, 2100), None + + h, w = img.shape[:2] + + # 策略:直接返回屏幕底部 88% 处的中心点 + center_x = int(w * 0.5) + center_y = int(h * 0.88) + + logger.info(f"find_input_box_center fallback: ({center_x}, {center_y})") + return (center_x, center_y), None + + except Exception as e: + logger.error(f"find_input_box_center error: {e}") + return (540, 2100), None + +def perform_input_action(d, center_point, text, auto_send=True): + """ + 执行输入操作 + """ + try: + # 1. 尝试找到原生输入框并输入 + edit_text = d(className="android.widget.EditText") + input_success = False + + if edit_text.exists: + logger.info("Found native EditText, using set_text") + try: + edit_text.click() + time.sleep(0.5) + edit_text.set_text(text) + input_success = True + except Exception as e: + logger.warning(f"Native input failed: {e}") + + # 2. 如果原生输入失败,使用坐标点击 + 粘贴/输入 + if not input_success: + cx, cy = center_point + logger.info(f"Using coordinate input: {center_point}") + d.click(cx, cy) + time.sleep(1.0) + + try: + d.send_keys(text) + except Exception: + logger.warning("send_keys failed, trying set_clipboard") + d.set_clipboard(text) + d.click(cx, cy) + time.sleep(0.5) + # 尝试粘贴 + d.press("paste") + + time.sleep(1.0) + + # 3. 发送 + if auto_send: + if d(text="发送").exists: + d(text="发送").click() + logger.info("Clicked '发送'") + else: + d.press("enter") + logger.info("Pressed Enter") + + except Exception as e: + logger.error(f"perform_input_action error: {e}") - return dialogue_log diff --git a/WeiXin/__pycache__/WxUtil.cpython-310.pyc b/WeiXin/__pycache__/WxUtil.cpython-310.pyc index f3562b6..fc36058 100644 Binary files a/WeiXin/__pycache__/WxUtil.cpython-310.pyc and b/WeiXin/__pycache__/WxUtil.cpython-310.pyc differ