diff --git a/Test/Screenshots/chat_result_20260125_085757.jpg b/Test/Screenshots/chat_result_20260125_085757.jpg
deleted file mode 100644
index 9ccc93d..0000000
Binary files a/Test/Screenshots/chat_result_20260125_085757.jpg and /dev/null differ
diff --git a/Test/Screenshots/chat_result_20260125_085757_analyzed.jpg b/Test/Screenshots/chat_result_20260125_085757_analyzed.jpg
deleted file mode 100644
index ab242e9..0000000
Binary files a/Test/Screenshots/chat_result_20260125_085757_analyzed.jpg and /dev/null differ
diff --git a/Test/Screenshots/chat_result_20260125_085849.jpg b/Test/Screenshots/chat_result_20260125_085849.jpg
deleted file mode 100644
index 5d97738..0000000
Binary files a/Test/Screenshots/chat_result_20260125_085849.jpg and /dev/null differ
diff --git a/Test/Screenshots/chat_result_20260125_085849_analyzed.jpg b/Test/Screenshots/chat_result_20260125_085849_analyzed.jpg
deleted file mode 100644
index 85bff18..0000000
Binary files a/Test/Screenshots/chat_result_20260125_085849_analyzed.jpg and /dev/null differ
diff --git a/Test/Screenshots/chat_result_20260125_090600.jpg b/Test/Screenshots/chat_result_20260125_090600.jpg
deleted file mode 100644
index b8eeebe..0000000
Binary files a/Test/Screenshots/chat_result_20260125_090600.jpg and /dev/null differ
diff --git a/Test/Screenshots/chat_result_20260125_090600_analyzed.jpg b/Test/Screenshots/chat_result_20260125_090600_analyzed.jpg
deleted file mode 100644
index 4d459f6..0000000
Binary files a/Test/Screenshots/chat_result_20260125_090600_analyzed.jpg and /dev/null differ
diff --git a/Test/Screenshots/debug_20260125_085800_coord_01_before_click_input.jpg b/Test/Screenshots/debug_20260125_085800_coord_01_before_click_input.jpg
deleted file mode 100644
index 5533f8b..0000000
Binary files a/Test/Screenshots/debug_20260125_085800_coord_01_before_click_input.jpg and /dev/null differ
diff --git a/Test/Screenshots/debug_20260125_085802_coord_02_after_click_input_keyboard.jpg b/Test/Screenshots/debug_20260125_085802_coord_02_after_click_input_keyboard.jpg
deleted file mode 100644
index e81d6c6..0000000
Binary files a/Test/Screenshots/debug_20260125_085802_coord_02_after_click_input_keyboard.jpg and /dev/null differ
diff --git a/Test/Screenshots/debug_20260125_085803_coord_03_after_input_text.jpg b/Test/Screenshots/debug_20260125_085803_coord_03_after_input_text.jpg
deleted file mode 100644
index 4e8b6d6..0000000
Binary files a/Test/Screenshots/debug_20260125_085803_coord_03_after_input_text.jpg and /dev/null differ
diff --git a/Test/Screenshots/debug_20260125_085804_coord_04_after_click_send_image.jpg b/Test/Screenshots/debug_20260125_085804_coord_04_after_click_send_image.jpg
deleted file mode 100644
index 3a76a37..0000000
Binary files a/Test/Screenshots/debug_20260125_085804_coord_04_after_click_send_image.jpg and /dev/null differ
diff --git a/Test/Screenshots/debug_20260125_090605_coord_01_before_click_input.jpg b/Test/Screenshots/debug_20260125_090605_coord_01_before_click_input.jpg
deleted file mode 100644
index b8eeebe..0000000
Binary files a/Test/Screenshots/debug_20260125_090605_coord_01_before_click_input.jpg and /dev/null differ
diff --git a/Test/Screenshots/debug_20260125_090608_coord_02_after_click_input_keyboard.jpg b/Test/Screenshots/debug_20260125_090608_coord_02_after_click_input_keyboard.jpg
deleted file mode 100644
index b8eeebe..0000000
Binary files a/Test/Screenshots/debug_20260125_090608_coord_02_after_click_input_keyboard.jpg and /dev/null differ
diff --git a/Test/Screenshots/debug_20260125_090609_coord_03_after_input_text.jpg b/Test/Screenshots/debug_20260125_090609_coord_03_after_input_text.jpg
deleted file mode 100644
index b8eeebe..0000000
Binary files a/Test/Screenshots/debug_20260125_090609_coord_03_after_input_text.jpg and /dev/null differ
diff --git a/Test/Screenshots/debug_20260125_090610_coord_04_after_click_send_image.jpg b/Test/Screenshots/debug_20260125_090610_coord_04_after_click_send_image.jpg
deleted file mode 100644
index 1517c46..0000000
Binary files a/Test/Screenshots/debug_20260125_090610_coord_04_after_click_send_image.jpg and /dev/null differ
diff --git a/Test/Screenshots/debug_send_check_20260125_085804.jpg b/Test/Screenshots/debug_send_check_20260125_085804.jpg
deleted file mode 100644
index e0a2eb4..0000000
Binary files a/Test/Screenshots/debug_send_check_20260125_085804.jpg and /dev/null differ
diff --git a/Test/Screenshots/debug_send_check_20260125_090610.jpg b/Test/Screenshots/debug_send_check_20260125_090610.jpg
deleted file mode 100644
index 1517c46..0000000
Binary files a/Test/Screenshots/debug_send_check_20260125_090610.jpg and /dev/null differ
diff --git a/WeiXin/Screenshots/chat_interface_check.jpg b/WeiXin/Screenshots/chat_interface_check.jpg
deleted file mode 100644
index 4a488ac..0000000
Binary files a/WeiXin/Screenshots/chat_interface_check.jpg and /dev/null differ
diff --git a/WeiXin/Screenshots/debug_20260125_115039_coord_01_before_click_input.jpg b/WeiXin/Screenshots/debug_20260125_115039_coord_01_before_click_input.jpg
deleted file mode 100644
index 9463fd0..0000000
Binary files a/WeiXin/Screenshots/debug_20260125_115039_coord_01_before_click_input.jpg and /dev/null differ
diff --git a/WeiXin/Screenshots/debug_20260125_115040_coord_02_after_click_input_keyboard.jpg b/WeiXin/Screenshots/debug_20260125_115040_coord_02_after_click_input_keyboard.jpg
deleted file mode 100644
index c986c5e..0000000
Binary files a/WeiXin/Screenshots/debug_20260125_115040_coord_02_after_click_input_keyboard.jpg and /dev/null differ
diff --git a/WeiXin/Screenshots/debug_20260125_115041_coord_03_after_input_text.jpg b/WeiXin/Screenshots/debug_20260125_115041_coord_03_after_input_text.jpg
deleted file mode 100644
index 7629b5b..0000000
Binary files a/WeiXin/Screenshots/debug_20260125_115041_coord_03_after_input_text.jpg and /dev/null differ
diff --git a/WeiXin/Screenshots/debug_send_check_20260125_115042.jpg b/WeiXin/Screenshots/debug_send_check_20260125_115042.jpg
deleted file mode 100644
index f0fc05b..0000000
Binary files a/WeiXin/Screenshots/debug_send_check_20260125_115042.jpg and /dev/null differ
diff --git a/WeiXin/Screenshots/t5_monitor_analyzed.jpg b/WeiXin/Screenshots/t5_monitor_analyzed.jpg
deleted file mode 100644
index f628e26..0000000
Binary files a/WeiXin/Screenshots/t5_monitor_analyzed.jpg and /dev/null differ
diff --git a/WeiXin/Screenshots/t5_monitor_temp.jpg b/WeiXin/Screenshots/t5_monitor_temp.jpg
index 4a488ac..ec4df24 100644
Binary files a/WeiXin/Screenshots/t5_monitor_temp.jpg and b/WeiXin/Screenshots/t5_monitor_temp.jpg differ
diff --git a/WeiXin/Screenshots/t6_debug_result.jpg b/WeiXin/Screenshots/t6_debug_result.jpg
new file mode 100644
index 0000000..7b0a111
Binary files /dev/null and b/WeiXin/Screenshots/t6_debug_result.jpg differ
diff --git a/WeiXin/Screenshots/t6_debug_temp.jpg b/WeiXin/Screenshots/t6_debug_temp.jpg
new file mode 100644
index 0000000..e29f4c2
Binary files /dev/null and b/WeiXin/Screenshots/t6_debug_temp.jpg differ
diff --git a/WeiXin/Screenshots/t6_menu_shot.jpg b/WeiXin/Screenshots/t6_menu_shot.jpg
new file mode 100644
index 0000000..6bb3fe7
Binary files /dev/null and b/WeiXin/Screenshots/t6_menu_shot.jpg differ
diff --git a/WeiXin/T5_AutoChatMonitor.py b/WeiXin/T5_AutoChatMonitor.py
index fd08be1..eb2abde 100644
--- a/WeiXin/T5_AutoChatMonitor.py
+++ b/WeiXin/T5_AutoChatMonitor.py
@@ -24,7 +24,7 @@ logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
     handlers=[
-        logging.FileHandler(os.path.join(log_dir, "T5_AutoChatMonitor.log"), encoding='utf-8'),
+        logging.FileHandler(os.path.join(log_dir, "T5_AutoChatMonitor.log"), encoding='utf-8', mode='w'),
         logging.StreamHandler()
     ]
 )
@@ -46,18 +46,19 @@ class ChatBot:
             os.makedirs(self.screenshot_dir)
 
         self.persona = (
-            "你是一名1999年毕业、拥有27年一线教学经验的小学高级女教师，名叫‘小张老师’（家长们也常叫你张老师）。你目前在‘长春市少惠林作文素养培养中心’工作。"
-            "你不仅是一位作文教学专家，批阅过超过2万篇作文，更是一位心思细腻、懂得孩子心理、能与家长共情的教育智者。"
-            "你的回复风格应该是：温柔、知性、亲切且极具亲和力，就像一位邻家大姐姐或贴心的闺蜜在和家长聊天。"
-            "严禁使用任何列表格式（如 1. 2. 3. 或 A. B. C.），严禁使用‘首先、其次、最后’等刻板的逻辑词。"
-            "要用连贯、优美、富有生活气息的文字进行叙述。每一句话都要带温度，要善于用‘咱们孩子’、‘作为老师我也理解’等词汇拉近距离。"
-            "回复内容要有深度，不要只给结论，要讲透背后的教育逻辑。字数必须严格控制在 100-200 字之间。"
-            "如果涉及到校区信息，必须且只能使用以下真实数据，绝对严禁使用任何占位符：\n"
+            "你是一名1999年毕业、拥有27年一线教学经验的小学高级女教师，名叫‘大张老师’。你目前在‘长春市少惠林作文素养培养中心’工作。"
+            "你不仅是一位作文教学专家，更是一位心思细腻、能与家长共情的教育智者。"
+            "你的回复风格应该是：温柔、知性、亲切，就像一位邻家大姐姐在聊天。"
+            "【严格约束】：\n"
+            "1. 绝对禁止发散！绝对禁止幻觉！\n"
+            "2. 知道什么就说什么，不要乱讲话，不要自己编造内容！\n"
+            "3. 仅针对家长明确表达的内容进行回复。\n"
+            "4. 严禁使用列表格式。严禁使用‘首先、其次’等逻辑词。\n"
+            "5. 回复必须简练，字数严格控制在 50 字以内！\n"
+            "如果涉及到校区信息，必须且只能使用以下真实数据：\n"
             "- 单位：长春市少惠林作文素养培养中心\n"
             "- 地址：南环城路与临河街交汇，TOUCH12街3楼325号\n"
-            "- 联系人：小张老师（电话：18686619970）\n"
-            "- 课程：线上/线下作文课、读书营/阅读策略营（假期开设）\n"
-            "- 上课：作文通常在周二/周四晚上，周六/周日全天"
+            "- 联系人：小张老师（电话：18686619970）"
         )
 
     async def get_reply(self, history_text, is_proactive=False):
@@ -67,18 +68,17 @@ class ChatBot:
                 f"【对话背景】：家长已经超过5分钟没有回应了。\n"
                 f"【近期聊天记录】：\n{history_text}\n\n"
                 "【任务要求】：\n"
-                "请作为小张老师，给家长发一段主动关怀的消息。不要催促，而是以‘刚才突然想到’或者‘又想起咱们孩子之前提到的’为由头，"
-                "再补充一点有价值的教学点滴，或者分享一个能缓解焦虑的小故事。语气要温柔亲切，字数在 100-200 字之间。"
-                "全文必须是连贯的段落，严禁列条目！如果提到联系方式或地址，必须使用人设中的真实数据，严禁占位符。"
+                "请作为大张老师，给家长发一段简短的关怀消息。不要催促，语气温柔。"
+                "字数严格控制在 50 字以内。不要编造事实。"
             )
         else:
             prompt = (
                 f"【教师人设】：{self.persona}\n\n"
                 f"【近期聊天记录】：\n{history_text}\n\n"
                 "【任务要求】：\n"
-                "请作为小张老师，给家长写一段暖心且有深度的回复。针对家长最后的消息，先给予情感上的关怀，再结合27年经验给出具体指导。"
-                "展现出资深女教师的温柔与智慧。全文必须是一个或两个完整的自然段，绝对禁止分点列项！字数严格在 100-200 字之间。"
-                "如果提到联系方式或地址，必须使用人设中的真实数据，严禁占位符。直接输出回复的正文内容。"
+                "请作为大张老师回复家长。针对家长的具体问题或话语进行回复。"
+                "严禁发散，严禁编造家长没说过的情况。如果不清楚家长的意图，就温柔询问。"
+                "字数严格控制在 50 字以内。直接输出回复正文。"
             )
         
         full_response = ""
@@ -94,11 +94,11 @@ class ChatBot:
         
         while True:
             try:
-                # 1. 检查是否在微信聊天界面
-                if not is_in_chat_interface(self.d):
-                    logger.warning("⚠️ 当前不在微信聊天界面，等待下一次扫描...")
-                    await asyncio.sleep(CHECK_INTERVAL)
-                    continue
+                # 1. 检查是否在微信聊天界面 (改为通过 VLM 识别结果判断，不再使用 UI 检查)
+                # if not is_in_chat_interface(self.d):
+                #     logger.warning("⚠️ 当前不在微信聊天界面，等待下一次扫描...")
+                #     await asyncio.sleep(CHECK_INTERVAL)
+                #     continue
 
                 logger.info("🔍 正在扫描当前界面内容...")
                 # 1. 截图并分析
@@ -109,7 +109,13 @@ class ChatBot:
                 self.d.screenshot(tmp_shot)
                 
                 logger.info("🎨 正在分析聊天界面内容 (检测头像与对话)...")
-                dialogue_log = analyze_chat_image(tmp_shot, analyzed_shot)
+                # analyze_chat_image 现在会返回 None, None 如果不是聊天界面
+                dialogue_log, input_center = await analyze_chat_image(tmp_shot, analyzed_shot, device=self.d)
+
+                if dialogue_log is None:
+                    logger.warning("⚠️ VLM 判断当前不在微信聊天界面，或无法识别。")
+                    await asyncio.sleep(CHECK_INTERVAL)
+                    continue
                 
                 # 语音转文字处理
                 if dialogue_log == "VOICE_CONVERTING":
@@ -131,13 +137,25 @@ class ChatBot:
 
                 # 判断逻辑：如果最后一条消息是“对方”发的，且与上次不同，则回复
                 if "对方:" in current_last_msg and current_last_msg != self.last_message_text:
+                    # 关键检查：如果包含 "(待转换)"，说明语音还没转文字，绝对不能回复
+                    if "(待转换)" in current_last_msg:
+                        logger.info(f"🚫 检测到未转换的语音消息，跳过回复生成，等待转文字... ({current_last_msg})")
+                        await asyncio.sleep(2) # 稍作等待
+                        continue
+
                     logger.info(f"📩 检测到新消息: {current_last_msg}")
                     
                     reply = await self.get_reply(history_text)
                     logger.info(f"🤖 生成回复: {reply}")
                     
                     # 执行输入发送
-                    center_point, _ = find_input_box_center(tmp_shot)
+                    if input_center:
+                        center_point = input_center
+                        logger.info(f"📍 使用 VLM 识别的输入框坐标: {center_point}")
+                    else:
+                        center_point, _ = find_input_box_center(tmp_shot)
+                        logger.info(f"📍 使用 CV 识别的输入框坐标: {center_point}")
+
                     # 即使 CV 没找到坐标，也尝试执行，因为 perform_input_action 内部有原生控件识别
                     perform_input_action(self.d, center_point, reply, auto_send=True)
                     self.last_message_text = f"我: {reply}" # 更新状态，避免重复回复自己
@@ -154,7 +172,11 @@ class ChatBot:
                         proactive_reply = await self.get_reply(history_text, is_proactive=True)
                         logger.info(f"🤖 发起主动询问: {proactive_reply}")
                         
-                        center_point, _ = find_input_box_center(tmp_shot)
+                        if input_center:
+                            center_point = input_center
+                        else:
+                            center_point, _ = find_input_box_center(tmp_shot)
+                        
                         # 同上，解耦 CV 坐标
                         perform_input_action(self.d, center_point, proactive_reply, auto_send=True)
                         self.proactive_count += 1
diff --git a/WeiXin/T6_VLM_Voice_Debug.py b/WeiXin/T6_VLM_Voice_Debug.py
new file mode 100644
index 0000000..8e092c0
--- /dev/null
+++ b/WeiXin/T6_VLM_Voice_Debug.py
@@ -0,0 +1,163 @@
+# coding=utf-8
+import asyncio
+import logging
+import os
+import sys
+import time
+
+import cv2
+import uiautomator2 as u2
+
+# 添加项目根目录到 sys.path
+project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+if project_root not in sys.path:
+    sys.path.append(project_root)
+
+from WeiXin.WxUtil import get_vlm_analysis
+from Util.EasyOcrKit import EasyOcrKit
+
+# 配置日志
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger("T6_Debug")
+
+async def main():
+    """Debug helper: mark VLM-detected voice bubbles and try "convert to text".
+
+    Takes a screenshot, sends it to get_vlm_analysis, draws a box around each
+    voice message and saves the result as t6_debug_result.jpg, then long-presses
+    the last unconverted voice message and taps the OCR-located convert button.
+    """
+    logger.info("🚀 T6 VLM 语音坐标调试工具启动...")
+    # Connect to the device; abort if that fails.
+    try:
+        d = u2.connect()
+        logger.info(f"设备已连接: {d.info.get('serial')}")
+    except Exception as e:
+        logger.error(f"设备连接失败: {e}")
+        return
+
+    # Screenshot directory (a "Screenshots" folder next to this script).
+    screenshots_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Screenshots")
+    if not os.path.exists(screenshots_dir):
+        os.makedirs(screenshots_dir)
+
+    # Capture the current screen.
+    screenshot_path = os.path.join(screenshots_dir, "t6_debug_temp.jpg")
+    logger.info("📸 正在截图...")
+    d.screenshot(screenshot_path)
+
+    # Run the VLM analysis on the screenshot.
+    logger.info("🧠 正在调用 VLM 分析图片...")
+    result_data = await get_vlm_analysis(screenshot_path)
+    if not result_data:
+        logger.error("❌ VLM 分析返回为空")
+        return
+
+    logger.info(f"VLM 返回结果: {result_data}")
+
+    # Load the screenshot so the detections can be drawn on it.
+    img = cv2.imread(screenshot_path)
+    if img is None:
+        logger.error("❌ 无法读取截图文件")
+        return
+
+    messages = result_data.get("messages", [])
+    voice_count = 0
+
+    for msg in messages:
+        msg_type = msg.get("type")
+        content = msg.get("content")
+        coords = msg.get("coordinates") or msg.get("center")
+        if not coords:
+            continue
+
+        # NOTE(review): the values are used directly as pixel positions here;
+        # confirm get_vlm_analysis does not return them in another scale.
+        x, y = coords
+        if msg_type == "voice":
+            voice_count += 1
+            logger.info(f"🎤 发现语音消息: {content}, 坐标: ({x}, {y})")
+            # Draw a green box (voice message), fixed 300x80 around the center.
+            w, h = 300, 80
+            top_left = (int(x - w/2), int(y - h/2))
+            bottom_right = (int(x + w/2), int(y + h/2))
+            cv2.rectangle(img, top_left, bottom_right, (0, 255, 0), 3)
+            cv2.circle(img, (int(x), int(y)), 5, (0, 0, 255), -1)
+            label = f"Voice ({x},{y})"
+            cv2.putText(img, label, (top_left[0], top_left[1] - 10),
+                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
+
+    # Save the annotated image.
+    output_path = os.path.join(screenshots_dir, "t6_debug_result.jpg")
+    cv2.imwrite(output_path, img)
+    logger.info(f"✅ 结果已保存至: {output_path}")
+    logger.info(f"共标记了 {voice_count} 条语音消息。请检查图片是否准确。")
+
+    # --- Verify the "convert to text" flow (last unconverted voice only) ---
+    logger.info("="*30)
+    logger.info("🔍 开始验证“转文字”功能 (仅针对最后一条未转换语音)...")
+    # Collect the voice messages whose status is "unconverted".
+    unconverted_voices = []
+    for msg in messages:
+        if msg.get("type") == "voice" and msg.get("status") == "unconverted":
+            coords = msg.get("coordinates") or msg.get("center")
+            if coords:
+                msg["coordinates"] = coords
+                unconverted_voices.append(msg)
+
+    if not unconverted_voices:
+        logger.info("⚠️ 没有发现未转换的语音消息，跳过验证。")
+    else:
+        last_voice = unconverted_voices[-1]
+        vx, vy = last_voice['coordinates']
+        content = last_voice.get('content') or '0"'
+        logger.info(f"🎯 目标语音: {content}, 坐标: ({vx}, {vy})")
+
+        # 1. Long-press the voice message.
+        logger.info("👆 长按语音消息...")
+        d.long_click(vx, vy, 1.5)
+        time.sleep(1.0)
+
+        # 2. Screenshot the screen after the long-press.
+        menu_shot_path = os.path.join(screenshots_dir, "t6_menu_shot.jpg")
+        logger.info(f"📸 截取菜单: {menu_shot_path}")
+        d.screenshot(menu_shot_path)
+
+        # 3. OCR that screenshot to locate the convert button.
+        logger.info("🧠 正在进行 OCR 识别菜单...")
+        ocr_kit = EasyOcrKit()
+        ocr_results = ocr_kit.read_text(menu_shot_path)
+
+        convert_btn_center = None
+        for bbox, text, _conf in ocr_results:
+            if "转文字" in text or "转换为文字" in text:
+                c_x = int((bbox[0][0] + bbox[2][0]) / 2)
+                c_y = int((bbox[0][1] + bbox[2][1]) / 2)
+                convert_btn_center = (c_x, c_y)
+                logger.info(f"✅ OCR 找到 '{text}' 按钮: {convert_btn_center}")
+                break
+
+        if convert_btn_center:
+            # 4. Tap the convert button.
+            logger.info(f"👆 点击转文字按钮: {convert_btn_center}")
+            d.click(convert_btn_center[0], convert_btn_center[1])
+            # 5. Wait in proportion to the duration parsed from the content.
+            duration_str = str(content).replace('"', '').strip()
+            try:
+                duration = int(duration_str)
+            except ValueError:
+                duration = 10
+
+            wait_seconds = max(2, duration / 5.0)
+            logger.info(f"⏳ 语音时长 {duration}s，模拟等待 {wait_seconds:.1f}s...")
+            time.sleep(wait_seconds)
+            logger.info("✅ 流程执行完毕！请检查手机屏幕是否已开始转换。")
+        else:
+            logger.error("❌ OCR 未找到 '转文字' 按钮！")
+            # Tap to the right of the bubble to dismiss the menu.
+            d.click(vx + 200, vy)
+
+if __name__ == "__main__":
+    if sys.platform.startswith('win'):  # on Windows, switch to the selector event loop policy first
+        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+    asyncio.run(main())  # run the async debug flow to completion
diff --git a/WeiXin/WxUtil.py b/WeiXin/WxUtil.py
index 44b8a86..8818f06 100644
--- a/WeiXin/WxUtil.py
+++ b/WeiXin/WxUtil.py
@@ -13,698 +13,348 @@ project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 if project_root not in sys.path:
     sys.path.append(project_root)
 
-from Util.EasyOcrKit import get_easyocr_reader
+import json
+from Util.VLMKit import VLMKit
+from Util.EasyOcrKit import EasyOcrKit
+
+# 初始化 VLMKit 和 EasyOcrKit
+vlm_kit = VLMKit()
+ocr_kit = EasyOcrKit()
 
 # 配置日志
-# 注意：作为库模块，不应直接调用 basicConfig，以免干扰调用者的日志配置
-# 调用者应自行配置日志（包括 FileHandler 等）
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger("WxUtil")
 
-def is_in_chat_interface(d):
-    """
-    判断当前是否处于微信聊天界面
-    识别标准：左侧和右侧同时存在小方块（头像）
-    """
-    try:
-        current = d.app_current()
-        if current.get('package') != 'com.tencent.mm':
-            return False
-            
-        # 1. 基础 UI 检查 (快速判断)
-        # 如果能直接搜到返回按钮和输入框/表情按钮，基本就是了
-        if (d(resourceId="com.tencent.mm:id/back_btn").exists or d(description="返回").exists) and \
-           (d(className="android.widget.EditText").exists or d(description="表情").exists):
-            logger.info("✅ 通过 UI 元素确认聊天界面")
-            return True
 
-        # 2. CV 检查 (针对复杂情况，如 "+" 菜单打开时 UI 元素可能发生变化)
-        screenshot_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Screenshots")
-        if not os.path.exists(screenshot_dir): os.makedirs(screenshot_dir)
-        tmp_check_path = os.path.join(screenshot_dir, "chat_interface_check.jpg")
-        d.screenshot(tmp_check_path)
-        
-        img = cv2.imread(tmp_check_path)
-        if img is None: 
-            return True # 无法截图时保守返回 True
-        
-        height, width = img.shape[:2]
-        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-        # 使用自适应二值化来提取轮廓
-        thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, 
-                                       cv2.THRESH_BINARY_INV, 11, 2)
-        
-        contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-        
-        left_avatars = 0
-        right_avatars = 0
-        
-        for cnt in contours:
-            x, y, w, h = cv2.boundingRect(cnt)
-            # 头像特征：宽高比接近 1，大小在一定范围 (50-150px 针对常见分辨率)
-            aspect_ratio = float(w) / h
-            if 0.8 < aspect_ratio < 1.2 and 40 < w < 160:
-                # 排除状态栏 (顶部 10%) 和 底部极小区域
-                if y < height * 0.1 or y > height * 0.95: 
-                    continue
-                
-                # 微信头像通常紧贴左右两侧
-                if x < width * 0.3: # 左侧头像
-                    left_avatars += 1
-                elif x + w > width * 0.7: # 右侧头像
-                    right_avatars += 1
-        
-        if left_avatars > 0 and right_avatars > 0:
-            logger.info(f"✅ 通过 CV 确认聊天界面 (左侧头像:{left_avatars}, 右侧头像:{right_avatars})")
-            return True
-            
-        # 兜底：如果检测到常见的聊天界面按钮，也认为是
-        if d(descriptionMatches="更多功能按钮.*").exists or d(description="切换到语音").exists or d(text="发送").exists:
-            logger.info("✅ 通过功能按钮确认聊天界面")
-            return True
-            
-        logger.warning(f"⚠️ 无法确认聊天界面 (左侧头像:{left_avatars}, 右侧头像:{right_avatars})")
-        return False
-    except Exception as e:
-        logger.error(f"检查聊天界面出错: {e}")
-        return True # 出错时保守返回 True
+async def get_vlm_analysis(image_path):
+    """
+    仅调用 VLM 分析图片，返回原始 JSON 数据 (dict)
+    """
+    logger.info(f"正在使用 VLM 分析图片: {image_path}")
+    
+    # 构造 Prompt
+    prompt = """
+    请分析这张微信聊天截图。
 
-def clean_screenshots_dir():
-    """
-    清除 d:\\dsWork\\aiData\\WeiXin\\Screenshots 目录下的所有文件
-    """
-    screenshot_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Screenshots")
-    if os.path.exists(screenshot_dir):
-        logger.info(f"正在清除截图目录: {screenshot_dir}")
-        for file in os.listdir(screenshot_dir):
-            file_path = os.path.join(screenshot_dir, file)
-            try:
-                if os.path.isfile(file_path):
-                    os.remove(file_path)
-            except Exception as e:
-                logger.error(f"无法删除文件 {file_path}: {e}")
-    else:
-        logger.info(f"截图目录不存在，无需清除: {screenshot_dir}")
+    【核心任务】
+    识别图中的【语音消息气泡】和【文本消息气泡】。
 
-def find_input_box_center(image_path):
-    """
-    识别底部输入框的中心坐标，返回 (center_x, center_y, rect_box)
-    """
-    try:
-        img_data = np.fromfile(image_path, dtype=np.uint8)
-        img = cv2.imdecode(img_data, cv2.IMREAD_COLOR)
-        if img is None:
-            return None, None
-        
-        height, width = img.shape[:2]
+    【重要判别规则】
+    1. 🔊 **语音消息 (Voice)**：
+       - **视觉特征**：
+         - **高度**：固定（单行）。
+         - **宽度**：随时长（1"~60"）变化。
+           - **极短 (1"-2")**：气泡非常短，形状接近一个小正方形。
+           - **极长 (60")**：气泡很长，宽度接近屏幕的一半。
+         - **内容**：气泡内**只有一个**表示时长的数字（如 `8"`）和一个声波图标。
+       - **绝对排除**：凡是包含汉字、长句子的气泡，**统统不是**语音消息。
+    
+    2. 📝 **文本消息 (Text)**：
+       - **视觉特征**：气泡内包含汉字、标点符号、表情等文本内容。
 
-        # --- 新增：模板匹配优先策略 ---
-        # 用户指定的模板路径 d:\dsWork\aiData\WeiXin\Templates\input_text.jpg
-        template_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Templates", "input_text.jpg")
-        if os.path.exists(template_path):
-            try:
-                tmpl_data = np.fromfile(template_path, dtype=np.uint8)
-                template = cv2.imdecode(tmpl_data, cv2.IMREAD_COLOR)
-                if template is not None:
-                    # 模板匹配
-                    res = cv2.matchTemplate(img, template, cv2.TM_CCOEFF_NORMED)
-                    min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(res)
-                    
-                    # 阈值设定 (例如 0.8)
-                    if max_val > 0.8:
-                        t_h, t_w = template.shape[:2]
-                        top_left = max_loc
-                        cx = top_left[0] + t_w // 2
-                        cy = top_left[1] + t_h // 2
-                        logger.info(f"🎯 通过模板匹配找到输入框: ({cx}, {cy}), 匹配度: {max_val:.2f}")
-                        return (cx, cy), (top_left[0], top_left[1], t_w, t_h)
-                    else:
-                        logger.info(f"模板匹配度不足 ({max_val:.2f} < 0.8)，转入 CV 轮廓识别")
-            except Exception as e:
-                logger.warning(f"模板匹配出错: {e}")
-        # -----------------------------
-        
-        # 扩展搜索区域：从顶部 40% 到 底部
-        # 键盘弹出时输入框可能在中间偏上，或者被顶到 40%-60% 位置
-        search_regions = [
-            (int(height * 0.85), height, "底部区域"),
-            (int(height * 0.40), int(height * 0.85), "中上区域")
+    【坐标系统】
+    **必须使用 [0-1000] 的归一化坐标系。**
+    - 左上角为 [0, 0]，右下角为 [1000, 1000]。
+    - 请返回气泡的**几何中心点**的归一化坐标。
+
+    【输出格式】
+    请返回纯 JSON 格式：
+    {
+        "is_chat_interface": true,
+        "input_box": [x, y],
+        "messages": [
+            {
+                "type": "voice",
+                "status": "converted" | "unconverted", 
+                "center": [x, y],
+                "content": "8\""
+            },
+            {
+                "type": "text",
+                "center": [x, y],
+                "content": "这里是文本内容"
+            }
         ]
-        
-        for y_start, y_end, region_name in search_regions:
-            crop = img[y_start:y_end, 0:width]
-            gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
-            # 使用更灵敏的边缘检测
-            edges = cv2.Canny(gray, 30, 100)
-            kernel = np.ones((5,5), np.uint8) # 增大核以连接断开的边缘
-            dilated = cv2.dilate(edges, kernel, iterations=1)
-            contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-            
-            candidates = []
-            region_h = y_end - y_start
-            
-            for cnt in contours:
-                x, y, w, h = cv2.boundingRect(cnt)
-                # 微信输入框通常很宽，高度固定在一定范围
-                # 宽度: 40%-98%, 高度: 40px - 150px (针对 720p 左右分辨率)
-                if width * 0.4 < w < width * 0.98 and 40 < h < 200:
-                    cnt_center_x = x + w // 2
-                    # 输入框中心通常在屏幕中轴线附近
-                    if abs(cnt_center_x - width // 2) > width * 0.2:
-                        continue
-                    global_y = y_start + y + h // 2
-                    candidates.append({'x': x, 'y': y, 'w': w, 'h': h, 'global_y': global_y, 'area': w * h})
-
-            if candidates:
-                # 优先选择面积适中且最靠下的（避免选到顶部的搜索框）
-                candidates.sort(key=lambda c: c['global_y'], reverse=True)
-                best = candidates[0]
-                cx = best['x'] + best['w'] // 2
-                cy = best['global_y']
-                logger.info(f"🎯 在 {region_name} 通过 CV 找到输入框: ({cx}, {cy})")
-                return (cx, cy), (best['x'], best['global_y'] - best['h']//2, best['w'], best['h'])
-
-        # 策略 3: 基于常见 UI 比例的几何兜底
-        # 0.93 可能过于靠下（容易点到 Home Indicator 区域），调整为 0.88 以提高命中率
-        # 同时保留一个备用的点击区域 rect
-        logger.warning("CV 识别输入框失败，使用改进的几何兜底策略 (Y=0.88)")
-        return (width // 2, int(height * 0.88)), (int(width*0.05), int(height*0.86), int(width*0.9), int(height*0.04))
-
-    except Exception as e:
-        logger.error(f"查找输入框失败: {e}")
-        return None, None
-
-def find_send_button(d):
+    }
+    注意：
+    1. 坐标 `center` 和 `input_box` 必须是 [0-1000] 的归一化坐标。
+    2. `status` 判断：如果语音气泡的正下方紧挨着一条文本消息（通常是转换出的文字），则为 `converted`，否则为 `unconverted`。
+    3. 请按从上到下的顺序输出所有消息。
     """
-    截图并寻找发送按钮 (绿色按钮)
-    扩大搜索范围以适应键盘弹出的情况
-    """
-    try:
-        # 截图到 Screenshots 目录方便调试
-        screenshot_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Screenshots")
-        if not os.path.exists(screenshot_dir):
-            os.makedirs(screenshot_dir)
-        timestamp = time.strftime("%Y%m%d_%H%M%S")
-        debug_shot_path = os.path.join(screenshot_dir, f"debug_send_check_{timestamp}.jpg")
-        
-        d.screenshot(debug_shot_path)
-        logger.info(f"发送按钮查找调试截图已保存: {debug_shot_path}")
-        
-        img = cv2.imread(debug_shot_path)
-        if img is None:
-            return None
-            
-        h, w = img.shape[:2]
-        
-        # ROI: 底部 60% (考虑到键盘弹出，按钮可能被顶上去)
-        # 且只关注右侧 30%
-        roi_h = int(h * 0.6)
-        roi_w = int(w * 0.3)
-        y_start = h - roi_h
-        x_start = w - roi_w
-        
-        roi = img[y_start:h, x_start:w]
-        
-        # 转换 HSV
-        hsv = cv2.cvtColor(roi, cv2.COLOR_BGR2HSV)
-        
-        # 绿色范围 (WeChat Green)
-        lower_green = np.array([35, 80, 80])
-        upper_green = np.array([90, 255, 255])
-        
-        mask = cv2.inRange(hsv, lower_green, upper_green)
-        
-        # 查找轮廓
-        contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-        
-        if contours:
-            # 找符合条件的轮廓
-            valid_candidates = []
-            for cnt in contours:
-                area = cv2.contourArea(cnt)
-                x, y, cw, ch = cv2.boundingRect(cnt)
-                
-                # 过滤太小的噪点和太大的区域(例如全屏背景)
-                # 发送按钮通常面积在 2000-15000 之间 (视分辨率而定)
-                if 500 < area < 30000:
-                    # 宽高比检查：发送按钮通常接近正方形或微扁 (ratio < 2.5)
-                    ratio = float(cw) / ch
-                    if 0.5 < ratio < 3.0:
-                        # 坐标还原到原图
-                        global_y = y_start + y
-                        valid_candidates.append({
-                            'cnt': cnt,
-                            'area': area,
-                            'y': global_y,
-                            'rect': (x, y, cw, ch)
-                        })
-            
-            if valid_candidates:
-                # 核心逻辑：发送按钮一定是所有绿色元素中最靠下的 (Y坐标最大)
-                # 且在最右侧
-                # 先按 Y 坐标降序排序
-                valid_candidates.sort(key=lambda c: c['y'], reverse=True)
-                
-                # 取最靠下的一个 (可能是发送按钮)
-                best = valid_candidates[0]
-                
-                # 获取中心点
-                bx, by, bw, bh = best['rect']
-                cx = x_start + bx + bw // 2
-                cy = y_start + by + bh // 2
-                
-                logger.info(f"通过图像识别找到发送按钮 (Bottom-Most): ({cx}, {cy}), 面积: {best['area']}")
-                return cx, cy
-                
-        logger.warning("未通过图像识别找到绿色发送按钮")
-        return None
-        
-    except Exception as e:
-        logger.error(f"查找发送按钮出错: {e}")
-        return None
-
-def take_debug_screenshot(d, step_name):
-    """
-    调试专用截图函数
-    """
-    try:
-        screenshot_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Screenshots")
-        if not os.path.exists(screenshot_dir):
-            os.makedirs(screenshot_dir)
-        timestamp = time.strftime("%Y%m%d_%H%M%S")
-        filename = f"debug_{timestamp}_{step_name}.jpg"
-        save_path = os.path.join(screenshot_dir, filename)
-        d.screenshot(save_path)
-        logger.info(f"📸 [调试截图] {step_name} 已保存: {filename}")
-        return save_path
-    except Exception as e:
-        logger.error(f"截图失败 ({step_name}): {e}")
-        return None
-
-def perform_input_action(d, coords, text, auto_send=True):
-    """
-    点击坐标并输入文本
-    @param d: uiautomator2 device object
-    @param coords: (x, y) 坐标
-    @param text: 输入文本
-    @param auto_send: 是否自动点击发送，默认为 True
-    """
-    if d is None:
-        d = u2.connect()
-
-    # 1. 检查是否处于“语音模式”
-    if d(description="切换到键盘").exists:
-        logger.info("检测到处于语音模式，正在切换到键盘...")
-        d(description="切换到键盘").click()
-        time.sleep(1)
-
-    # 2. 优先尝试使用 uiautomator2 的原生控件查找 (更稳健)
-    try:
-        # 查找 EditText 控件
-        input_elem = d(className="android.widget.EditText")
-        
-        if input_elem.exists:
-            logger.info("发现原生输入框控件，尝试点击...")
-            
-            # 1. 截图：点击前
-            take_debug_screenshot(d, "native_01_before_click")
-            
-            # 点击策略
-            input_elem.click()
-            time.sleep(0.5)
-            
-            # 2. 截图：点击后 (预期键盘弹出)
-            take_debug_screenshot(d, "native_02_after_click_keyboard")
-            
-            logger.info(f"输入文本: {text}")
-            
-            # 尝试 set_text
-            input_elem.set_text(text)
-            time.sleep(0.5)
-            
-            # 检查文本是否输入成功，如果没有，尝试 send_keys
-            current_text = input_elem.get_text()
-            if not current_text or current_text != text:
-                logger.warning(f"set_text 似乎未生效 (当前: {current_text})，尝试 send_keys...")
-                d.send_keys(text)
-                
-            # 3. 截图：输入文本后
-            take_debug_screenshot(d, "native_03_after_text_input")
-
-            # 点击发送
-            if auto_send:
-                send_msg(d)
-            return True
-
-        else:
-            logger.warning("未找到输入框元素 (Native)，转入坐标点击模式...")
-
-    except Exception as e:
-        logger.warning(f"原生控件操作失败，降级为坐标点击: {e}")
-
-    # 3. 降级方案：使用坐标点击
-    if not coords:
-        logger.error("坐标无效，尝试使用动态兜底点击...")
-        # 动态寻找表情按钮左侧
-        emoji_btn = d(description="表情")
-        if emoji_btn.exists:
-            eb = emoji_btn.info['bounds']
-            # 点击表情按钮左侧 150 像素的位置
-            coords = (eb['left'] - 150, (eb['top'] + eb['bottom']) // 2)
-            logger.info(f"根据表情按钮位置计算点击坐标: {coords}")
-        else:
-            # 绝对兜底
-            width, height = d.window_size()
-            coords = (width // 2, int(height * 0.88))
-            logger.info(f"使用绝对兜底坐标: {coords}")
-
-    x, y = coords
     
     try:
-        # 1. 截图：点击输入框前
-        take_debug_screenshot(d, "coord_01_before_click_input")
+        # 调用 VLM
+        response = await vlm_kit.analyze_image(image_path, prompt)
+        json_str = vlm_kit.extract_json(response)
+        result_data = json.loads(json_str)
         
-        # 点击输入框
-        logger.info(f"点击坐标: ({x}, {y})")
-        d.click(x, y)
-        time.sleep(1.0)
-        
-        # 2. 截图：点击输入框后
-        take_debug_screenshot(d, "coord_02_after_click_input_keyboard")
-        
-        # 输入文本
-        logger.info(f"输入文本 (SendKeys): {text}")
-        d.send_keys(text)
-        
-        # 3. 截图：输入文本后
-        take_debug_screenshot(d, "coord_03_after_input_text")
-        
-        time.sleep(0.5)
-        
-        if auto_send:
-            send_msg(d)
+        # 获取图片尺寸进行坐标反归一化
+        try:
+            from PIL import Image
+            with Image.open(image_path) as img:
+                width, height = img.size
+                
+            # 定义反归一化函数
+            def denormalize(point):
+                if not point or len(point) != 2:
+                    return point
+                return [int(point[0] / 1000 * width), int(point[1] / 1000 * height)]
+                
+            # 反归一化 input_box
+            if result_data.get("input_box"):
+                result_data["input_box"] = denormalize(result_data["input_box"])
+                
+            # 反归一化 messages
+            if result_data.get("messages"):
+                for msg in result_data["messages"]:
+                    if msg.get("center"):
+                        msg["center"] = denormalize(msg["center"])
+                    if msg.get("coordinates"): # 兼容旧字段
+                        msg["coordinates"] = denormalize(msg["coordinates"])
+                        
+        except Exception as e:
+            logger.warning(f"坐标反归一化失败: {e}，将使用原始坐标")
             
-        return True
+        return result_data
     except Exception as e:
-        logger.error(f"坐标点击输入失败: {e}")
-        return False
+        logger.error(f"VLM Analysis Failed: {e}", exc_info=True)
+        return None
 
-def send_msg(d):
-    """统一的发送按钮点击逻辑"""
-    try:
-        # 1. 尝试回车
-        d.press("enter")
-        time.sleep(0.5)
-        
-        # 2. 尝试 Native 发送按钮
-        if d(text="发送").exists:
-            d(text="发送").click()
-            logger.info("已点击 '发送' 按钮 (Native Text)")
-            return True
-            
-        # 3. 尝试图像识别发送按钮
-        send_btn_coords = find_send_button(d)
-        if send_btn_coords:
-            sx, sy = send_btn_coords
-            d.click(sx, sy)
-            logger.info(f"已点击 '发送' 按钮 (Image Rec): {sx}, {sy}")
-            return True
-            
-        # 4. 盲点右下角
-        width, height = d.window_size()
-        fallback_x = int(width * 0.9)
-        fallback_y = int(height * 0.965)
-        logger.info(f"未识别到发送按钮，尝试盲点右下角: {fallback_x}, {fallback_y}")
-        d.click(fallback_x, fallback_y)
-        return True
-    except Exception as e:
-        logger.error(f"发送消息失败: {e}")
-        return False
-
-def find_red_dots(img):
+async def analyze_chat_image(image_path, output_path, device=None, target_name="对方"):
     """
-    检测图片中的小红点（未读消息标志）
+    使用 VLM 识别微信聊天截图中的对话内容、语音消息状态以及输入框位置
+    替代原本的 CV/OCR 方案
     """
-    try:
-        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
-        # 红色的两个范围，进一步放宽饱和度和亮度的限制，以防环境光影响
-        lower_red1 = np.array([0, 100, 80])
-        upper_red1 = np.array([10, 255, 255])
-        lower_red2 = np.array([160, 100, 80])
-        upper_red2 = np.array([180, 255, 255])
-        
-        mask1 = cv2.inRange(hsv, lower_red1, upper_red1)
-        mask2 = cv2.inRange(hsv, lower_red2, upper_red2)
-        mask = cv2.add(mask1, mask2)
-        
-        # 形态学操作去除噪点
-        kernel = np.ones((3,3), np.uint8)
-        mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)
-        
-        contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-        red_dots = []
-        for cnt in contours:
-            area = cv2.contourArea(cnt)
-            # 小红点通常面积很小，放宽面积范围
-            if 5 < area < 2000:
-                x, y, w, h = cv2.boundingRect(cnt)
-                # 宽高比接近 1
-                if 0.4 < w/h < 2.5:
-                    red_dots.append({'x': x, 'y': y, 'w': w, 'h': h, 'center': (x + w//2, y + h//2)})
-        return red_dots
-    except Exception as e:
-        logger.error(f"检测红点出错: {e}")
-        return []
-
-def analyze_chat_image(image_path, output_path, target_name="对方"):
-    """
-    识别微信聊天截图中的头像并画框，识别对话内容
-    同时支持识别语音消息并触发长按转文字
-    """
-    logger.info(f"正在分析图片: {image_path}")
     
     # 语音识别标志
-    voice_msg_rect = None
-    convert_text_btn_rect = None
-    has_unread_dot = False
-
-    # 读取图片（支持中文路径）
+    should_trigger_convert = False
+    
     try:
-        img_data = np.fromfile(image_path, dtype=np.uint8)
-        img = cv2.imdecode(img_data, cv2.IMREAD_COLOR)
-    except Exception as e:
-        logger.error(f"读取图片失败: {e}")
-        return None
-
-    if img is None:
-        logger.error("图片读取为空")
-        return None
-    
-    # 备份一份干净的图片用于 OCR (避免识别到画上去的框)
-    img_clean = img.copy()
-    
-    height, width = img.shape[:2]
-    logger.info(f"图片尺寸: {width}x{height}")
-
-    # 1. 检测小红点
-    red_dots = find_red_dots(img)
-    if red_dots:
-        logger.info(f"检测到 {len(red_dots)} 个红色标记")
-
-    # 2. 获取输入框位置作为过滤参考
-    input_center, input_rect = find_input_box_center(image_path)
-    # 输入框顶部坐标，如果没有识别到，默认底部 5% 区域过滤
-    input_top_y = input_rect[1] if input_rect else height * 0.95
-
-    # 3. 识别头像和对话 (OCR 过程增加日志)
-    logger.info("开始进行文字识别 (EasyOCR)...")
-    reader = get_easyocr_reader()
-    ocr_results = reader.read_text(img_clean)
-    logger.info(f"OCR 识别完成，共发现 {len(ocr_results)} 个文本区域")
-    
-    # 将 OCR 结果整理并按 Y 坐标排序
-    sorted_ocr = []
-    for (bbox, text, prob) in ocr_results:
-        if prob < 0.2: continue
-        text = text.strip()
-        (tl, tr, br, bl) = bbox
-        tx, ty = int(tl[0]), int(tl[1])
-        tw, th = int(br[0] - tl[0]), int(br[1] - tl[1])
+        result_data = await get_vlm_analysis(image_path)
         
-        # --- 这里的过滤逻辑进行了重大调整 ---
-        # 1. 过滤顶部状态栏 (80px)
-        # 2. 只有当文字完全位于输入框内部时才过滤。
-        # 注意：语音转文字的结果可能紧贴输入框，所以这里逻辑要极度小心
-        is_in_input = False
-        if input_rect:
-            ix, iy, iw, ih = input_rect
-            # 如果文字的中心点在输入框矩形内，则认为是输入框内容
-            if (ix < tx + tw/2 < ix + iw) and (iy < ty + th/2 < iy + ih):
-                # 额外检查：如果这个位置上方刚刚有一个语音消息（y轴相近），则不认为是输入框干扰
-                is_in_input = True
-        
-        if ty < 80 or (is_in_input and ty > height * 0.95): # 将过滤阈值进一步提高到 0.95，几乎只过滤输入框最底部的文字
-            logger.info(f"跳过干扰项: y={ty}, text='{text}' (原因: {'状态栏' if ty < 80 else '输入框深度区域'})")
-            continue
-        
-        # 预处理文本：移除所有非数字、非引号、非字母的干扰字符，再进行语音匹配
-        clean_text = re.sub(r'[^\d"a-zA-Z\u4e00-\u9fa5]', '', text)
-        is_voice = bool(re.search(r'\d+"$', clean_text) or "小" in clean_text and "\"" in clean_text)
-        
-        # 调试日志：输出所有保留的行
-        logger.info(f"有效 OCR 行: y={ty}, text='{text}', is_voice={is_voice}")
-        
-        sorted_ocr.append({'x': tx, 'y': ty, 'w': tw, 'h': th, 'text': text, 'is_voice': is_voice})
-    
-    sorted_ocr.sort(key=lambda x: x['y'])
+        if not result_data:
+            return [], None
 
-    # 识别头像位置
-    # --- 轮廓查找头像逻辑 ---
-    gray = cv2.cvtColor(img_clean, cv2.COLOR_BGR2GRAY)
-    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2)
-    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    avatars = []
-    for contour in contours:
-        x, y, w, h = cv2.boundingRect(contour)
-        if 0.8 < float(w)/h < 1.2 and 50 < w < 150:
-            # 头像过滤逻辑同步放宽：只要不在最顶端，且不在输入框的深处，都保留
-            if y < 80: continue
-            if input_rect and y > input_rect[1] + 50: continue # 只有深陷输入框内部的才过滤
-            avatars.append((x, y, w, h))
-            cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 255), 2)
-    avatars.sort(key=lambda a: a[1])
-
-    # 处理对话逻辑
-    dialogue_log = []
-    voice_msg_rect = None
-    convert_text_btn_rect = None
-    
-    # 遍历 OCR 结果，合并语音和文字
-    i = 0
-    while i < len(sorted_ocr):
-        item = sorted_ocr[i]
-        # 查找发言人
-        speaker = "未知"
-        for (ax, ay, aw, ah) in avatars:
-            if abs(item['y'] - ay) < 100:
-                speaker = target_name if ax < item['x'] else "我"
-                break
-        
-        # 处理语音转换逻辑
-        # 只要是语音格式，且在左侧（或者发言人是对方），就考虑转换
-        is_target_voice = item['is_voice'] and (speaker == target_name or (speaker == "未知" and item['x'] < width * 0.4))
-        
-        if is_target_voice:
-            # --- 遵循用户定义的明显规则 ---
-            # 检查下方是否有文字。如果有文字且距离很近，说明已经识别过了
-            has_text_below = False
-            if i + 1 < len(sorted_ocr):
-                # 检查接下来的几行（最多3行），看是否有非语音文字
-                # 微信转文字后，可能会有 1. 转换出的文字 2. "收起"按钮 3. "原文"按钮
-                for j in range(i + 1, min(i + 4, len(sorted_ocr))):
-                    next_item = sorted_ocr[j]
-                    # 判定条件：1. 下一条不是语音 2. Y轴距离在合理范围 (300px内，转文字可能很长) 3. X轴位置相近
-                    y_dist = next_item['y'] - item['y']
-                    x_dist = abs(next_item['x'] - item['x'])
-                    
-                    if not next_item['is_voice'] and 0 < y_dist < 300:
-                        if x_dist < 250 or "收起" in next_item['text'] or "原文" in next_item['text']:
-                            has_text_below = True
-                            # 如果是正常的文字内容（不是功能按钮），记录下来
-                            if "收起" not in next_item['text'] and "原文" not in next_item['text']:
-                                dialogue_log.append(f"{speaker}: [语音] {next_item['text']}")
-                                logger.info(f"语音 {item['text']} 下方已存在识别文字 '{next_item['text']}'，跳过转换操作")
-                            else:
-                                dialogue_log.append(f"{speaker}: [语音] (已转换)")
-                                logger.info(f"检测到转换控制按钮 '{next_item['text']}'，认为已完成转换")
-                            
-                            # 既然已经转换了，如果有待点击的按钮，也应该清除掉，防止误点关闭
-                            convert_text_btn_rect = None 
-                            i = j # 跳过已处理的行
-                            break
-                if has_text_below:
-                    i += 1
-                    continue
-            
-            # 如果下方没有文字，这才是真正需要点击转文字的对象
-            if not has_text_below:
-                # 只要没识别，就锁定它。如果有多个没识别的，锁定最后一个（最下面的）
-                if not voice_msg_rect or item['y'] > voice_msg_rect['y']:
-                    voice_msg_rect = item
-                dialogue_log.append(f"{speaker}: [语音] {item['text']} (待转换)")
-                logger.info(f"发现待转换语音: {item['text']} (y={item['y']})")
-        elif "转文字" in item['text']:
-            if not convert_text_btn_rect or item['y'] > convert_text_btn_rect['y']:
-                convert_text_btn_rect = item
-            # "转文字" 按钮不计入对话内容
-        else:
-            dialogue_log.append(f"{speaker}: {item['text']}")
-        
-        i += 1
-
-    # 检查是否需要触发转文字
-    has_unread_dot = False
-    # 检查转文字按钮或语音消息旁的红点
-    target_item = convert_text_btn_rect if convert_text_btn_rect else voice_msg_rect
-    if target_item:
-        for dot in red_dots:
-            dist_x = abs(dot['center'][0] - (target_item['x'] + target_item['w']))
-            dist_y = abs(dot['center'][1] - (target_item['y'] + target_item['h']//2))
-            # 红点可能在左侧 (转文字按钮) 或右侧 (语音)
-            if (abs(dot['center'][0] - target_item['x']) < 150 or dist_x < 150) and dist_y < 100:
-                has_unread_dot = True
-                break
-
-    # 兜底：如果是最后一条是语音且没文字，强制转换
-    should_trigger_convert = has_unread_dot
-    if not should_trigger_convert and voice_msg_rect:
-        # 只要最后一条消息包含了这个语音的内容（且它是待转换状态），就触发
-        # 修正原有的 endswith bug: 因为 dialogue_log 里的语音被标记了 "(待转换)"
-        if dialogue_log and (voice_msg_rect['text'] in dialogue_log[-1]) and "(待转换)" in dialogue_log[-1]:
-            logger.info(f"最后一条消息是语音({voice_msg_rect['text']})且未转换，强制触发转文字")
-            should_trigger_convert = True
-
-    if should_trigger_convert:
         try:
-            d = u2.connect()
-            if convert_text_btn_rect:
-                tx = convert_text_btn_rect['x'] + convert_text_btn_rect['w'] // 2
-                ty = convert_text_btn_rect['y'] + convert_text_btn_rect['h'] // 2
-                logger.info(f"点击 '转文字' 按钮: ({tx}, {ty})")
-                d.click(tx, ty)
-                time.sleep(2.0)
-                return "VOICE_CONVERTING"
-            elif voice_msg_rect:
-                vx = voice_msg_rect['x'] + voice_msg_rect['w'] // 2
-                vy = voice_msg_rect['y'] + voice_msg_rect['h'] // 2
+            if isinstance(result_data, list):
+                # Legacy format: a bare list of messages with no interface flag or
+                # input box. Must be tested first, because a list has no .get().
+                messages = result_data
+                input_field_coordinates = None
+            else:
+                # Dict format: bail out when the VLM says this is not a chat screen.
+                if not result_data.get("is_chat_interface", False):
+                    logger.warning("VLM 判断当前不是微信聊天界面")
+                    return None, None
+
+                messages = result_data.get("messages", [])
+                input_field_coordinates = result_data.get("input_box")
+
+        except Exception as e:
+            logger.error(f"解析 VLM 结果失败: {e}")
+            return [], None
+
+        dialogue_log = []
+        unconverted_voices = []
+
+        # 处理识别结果
+        for msg in messages:
+            sender = msg.get('sender', '未知')
+            msg_type = msg.get('type', 'other')
+            content = msg.get('content', '')
+            coords = msg.get('center', [0, 0]) # center
+            status = msg.get('status', 'unconverted')
+            is_converted = (status == "converted")
+            
+            # 记录对话日志
+            if msg_type == 'voice':
+                if is_converted:
+                     dialogue_log.append(f"{sender}: [语音] {content} (已转换)")
+                else:
+                     dialogue_log.append(f"{sender}: [语音] (待转换)")
+                     # 将 center 转换为 coordinates 供后续使用
+                     msg['coordinates'] = coords 
+                     unconverted_voices.append(msg)
+            elif msg_type == 'text':
+                dialogue_log.append(f"{sender}: {content}")
+            
+            logger.info(f"VLM 识别: {sender} [{msg_type}] {content} (Converted: {is_converted})")
+
+        # 处理未转换的语音消息
+        if unconverted_voices:
+            logger.info(f"发现 {len(unconverted_voices)} 条未转换的语音消息，将仅处理最后一条...")
+            # 仅保留最后一条语音消息进行处理
+            unconverted_voices = [unconverted_voices[-1]]
+            
+            # 使用传入的 device 或创建新连接
+            d = device if device else u2.connect()
+            
+            for voice in unconverted_voices:
+                vx, vy = voice['coordinates']
                 logger.info(f"长按语音消息: ({vx}, {vy})")
                 d.long_click(vx, vy, 1.5)
                 time.sleep(1.0)
-                if d(text="转文字").exists:
-                    d(text="转文字").click()
+                
+                # 查找“转文字” (使用 OCR)
+                menu_shot_path = os.path.join(os.path.dirname(image_path), "temp_menu_shot.jpg")
+                d.screenshot(menu_shot_path)
+                
+                # OCR 识别
+                ocr_results = ocr_kit.read_text(menu_shot_path)
+                convert_btn_center = None
+                
+                for bbox, text, conf in ocr_results:
+                    if "转文字" in text or "转换为文字" in text:
+                        # bbox is [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
+                        # Calculate center
+                        c_x = int((bbox[0][0] + bbox[2][0]) / 2)
+                        c_y = int((bbox[0][1] + bbox[2][1]) / 2)
+                        convert_btn_center = (c_x, c_y)
+                        break
+                        
+                if convert_btn_center:
+                    logger.info(f"OCR 找到 '转文字' 按钮: {convert_btn_center}")
+                    d.click(convert_btn_center[0], convert_btn_center[1])
+                    should_trigger_convert = True
+                    
+                    # Dynamic wait: a 60 s voice clip takes roughly 10 s to transcribe (~1/6).
+                    duration_str = str(voice.get('content', '0')).replace('"', '').strip()
+                    try:
+                        duration = int(duration_str)
+                    except ValueError:
+                        duration = 10  # fallback when content is not a plain number of seconds
+
+                    wait_seconds = max(2, duration / 5.0)  # /5.0 rather than /6 leaves a margin
+                    logger.info(f"语音时长 {duration}s，预计等待转换 {wait_seconds:.1f}s...")
+                    time.sleep(wait_seconds)
+                    
                 else:
-                    # 截图找菜单
-                    temp_menu_path = take_debug_screenshot(d, "voice_menu_retry")
-                    menu_results = reader.read_text(temp_menu_path)
-                    for (m_bbox, m_text, m_prob) in menu_results:
-                        if "转文字" in m_text:
-                            m_tl, _, m_br, _ = m_bbox
-                            d.click(int(m_tl[0] + (m_br[0]-m_tl[0])//2), int(m_tl[1] + (m_br[1]-m_tl[1])//2))
-                            break
-                time.sleep(2.0)
-                return "VOICE_CONVERTING"
-        except Exception as e:
-            logger.error(f"转换操作失败: {e}")
+                    logger.warning("OCR 未找到 '转文字' 菜单项")
+                    # 点击空白处关闭菜单，避免遮挡
+                    d.click(vx + 200, vy) 
+
+            if should_trigger_convert:
+                # 转换完成后稍微多等一下，确保 UI 刷新
+                time.sleep(1.0)
+                return "VOICE_CONVERTING", input_field_coordinates
+
+        return dialogue_log, input_field_coordinates
 
-    # 保存结果图
-    if input_rect:
-        ix, iy, iw, ih = input_rect
-        cv2.rectangle(img, (ix, iy), (ix + iw, iy + ih), (255, 0, 0), 2)
-    try:
-        is_success, buffer = cv2.imencode(".jpg", img)
-        if is_success:
-            buffer.tofile(output_path)
     except Exception as e:
-        logger.error(f"保存失败: {e}")
+        logger.error(f"VLM 分析失败: {e}", exc_info=True)
+        return [], None
+
+
+def clean_screenshots_dir():
+    """Remove every .jpg/.jpeg/.png file from the Screenshots directory beside this module.
+
+    The directory is created if missing; files that cannot be removed are logged and skipped."""
+    screenshot_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Screenshots")
+    os.makedirs(screenshot_dir, exist_ok=True)  # race-free, unlike exists() + makedirs()
+
+    for f in os.listdir(screenshot_dir):
+        if f.lower().endswith(('.jpg', '.png', '.jpeg')):
+            try:
+                os.remove(os.path.join(screenshot_dir, f))
+            except OSError as e:
+                logger.warning(f"Failed to delete {f}: {e}")
+
+def is_in_chat_interface(d):
+    """
+    Report whether the current screen looks like a WeChat chat window.
+
+    Returns True as soon as one known chat widget exists; False otherwise or on error.
+    """
+    # Selectors are probed in this order; the first one that exists ends the search.
+    chat_markers = (
+        {"description": "切换到语音"},  # bottom bar: switch-to-voice toggle
+        {"description": "切换到键盘"},  # bottom bar: switch-to-keyboard toggle
+        {"className": "android.widget.EditText"},  # bottom input box
+        {"text": "按住说话"},  # bottom hold-to-talk button
+        {"description": "聊天信息"},  # top-right "more" button
+    )
+    try:
+        if any(d(**marker).exists for marker in chat_markers):
+            return True
+    except Exception as e:
+        logger.warning(f"is_in_chat_interface check failed: {e}")
+
+    return False
+
+def find_input_box_center(image_path):
+    """
+    Guess the input-box centre from the screenshot size alone (fallback strategy).
+    Returns ((x, y), None): x is 50% of the image width, y is 88% of its height.
+    """
+    default_result = ((540, 2100), None)  # returned whenever the image cannot be read
+    try:
+        if not os.path.exists(image_path):
+            return default_result
+
+        screenshot = cv2.imread(image_path)
+        if screenshot is None:
+            return default_result
+
+        img_height, img_width = screenshot.shape[:2]
+        # No detection is attempted: the point is purely geometric.
+        cx = int(img_width * 0.5)
+        cy = int(img_height * 0.88)
+
+        logger.info(f"find_input_box_center fallback: ({cx}, {cy})")
+        return (cx, cy), None
+
+    except Exception as e:
+        logger.error(f"find_input_box_center error: {e}")
+        return default_result
+
+def perform_input_action(d, center_point, text, auto_send=True):
+    """
+    Type `text` into the chat input and optionally send it; returns None, any error is only logged.
+    """
+    try:
+        # 1. Prefer the native EditText widget: click it, then set its text directly.
+        edit_text = d(className="android.widget.EditText")
+        input_success = False
+        
+        if edit_text.exists:
+            logger.info("Found native EditText, using set_text")
+            try:
+                edit_text.click()
+                time.sleep(0.5)
+                edit_text.set_text(text)
+                input_success = True
+            except Exception as e:
+                logger.warning(f"Native input failed: {e}")
+        
+        # 2. No EditText, or native input raised: tap center_point (x, y) and type/paste instead.
+        if not input_success:
+            cx, cy = center_point
+            logger.info(f"Using coordinate input: {center_point}")
+            d.click(cx, cy)
+            time.sleep(1.0)
+            
+            try:
+                d.send_keys(text)
+            except Exception:
+                logger.warning("send_keys failed, trying set_clipboard")
+                d.set_clipboard(text)
+                d.click(cx, cy)
+                time.sleep(0.5)
+                # Try to paste the clipboard. NOTE(review): confirm "paste" is a key name d.press() accepts.
+                d.press("paste")
+        
+        time.sleep(1.0)
+        
+        # 3. Send: click the "发送" button when it exists, otherwise press Enter.
+        if auto_send:
+            if d(text="发送").exists:
+                d(text="发送").click()
+                logger.info("Clicked '发送'")
+            else:
+                d.press("enter")
+                logger.info("Pressed Enter")
+                
+    except Exception as e:
+        logger.error(f"perform_input_action error: {e}")
 
-    return dialogue_log
 
diff --git a/WeiXin/__pycache__/WxUtil.cpython-310.pyc b/WeiXin/__pycache__/WxUtil.cpython-310.pyc
index f3562b6..fc36058 100644
Binary files a/WeiXin/__pycache__/WxUtil.cpython-310.pyc and b/WeiXin/__pycache__/WxUtil.cpython-310.pyc differ