'commit'

2026-01-25 14:40:30 +08:00
parent b4d0e56037
commit 35654ec166
10 changed files with 181 additions and 28 deletions
--- a/WeiXin/Screenshots/t6_debug_result.jpg
+++ b/WeiXin/Screenshots/t6_debug_result.jpg
--- a/WeiXin/Screenshots/t6_menu_shot_cancel.jpg
+++ b/WeiXin/Screenshots/t6_menu_shot_cancel.jpg
--- a/WeiXin/Screenshots/t6_menu_shot_convert.jpg
+++ b/WeiXin/Screenshots/t6_menu_shot_convert.jpg
--- a/WeiXin/Screenshots/t6_monitor_temp.jpg
+++ b/WeiXin/Screenshots/t6_monitor_temp.jpg
--- a/WeiXin/Screenshots/t6_ocr_shot.jpg
+++ b/WeiXin/Screenshots/t6_ocr_shot.jpg
--- a/WeiXin/T6_AutoChatMonitor.py
+++ b/WeiXin/T6_AutoChatMonitor.py
@@ -14,7 +14,7 @@ project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 if project_root not in sys.path:
    sys.path.append(project_root)

-from WeiXin.WxUtil import find_input_box_center, perform_input_action, get_vlm_analysis, clean_screenshots_dir, is_in_chat_interface, get_vlm_json, find_template_match
+from WeiXin.WxUtil import find_input_box_center, perform_input_action, get_vlm_analysis, clean_screenshots_dir, is_in_chat_interface, get_vlm_json, find_template_match, find_all_template_matches
 from Util.LlmUtil import get_llm_response
 from Util.EasyOcrKit import EasyOcrKit

@@ -80,7 +80,8 @@ class ChatBot:
            "如果涉及到校区信息，必须且只能使用以下真实数据：\n"
            "- 单位：长春市少惠林作文素养培养中心\n"
            "- 地址：南环城路与临河街交汇，TOUCH12街3楼325号\n"
-            "- 联系人：小张老师（电话：18686619970）"
+            "- 联系人：小张老师（电话：18686619970）\n"
+            "- 每学期开学招收小学三年级至六年级，初中七年级的学生入学，其它年段不招生。\n"
        )

    async def get_reply(self, history_text, is_proactive=False):
@@ -98,8 +99,9 @@ class ChatBot:
                f"【教师人设】：{self.persona}\n\n"
                f"【近期聊天记录】：\n{history_text}\n\n"
                "【任务要求】：\n"
-                "请作为大张老师回复家长。针对家长的具体问题或话语进行回复。"
-                "严禁发散，严禁编造家长没说过的情况。如果不清楚家长的意图，就温柔询问。"
+                "请作为大张老师回复家长。**必须且只能针对聊天记录中的最后一条消息进行回复！**\n"
+                "之前的聊天记录仅供参考上下文，如果之前的问题已经回答过，绝对不要重复回答。\n"
+                "严禁发散，严禁编造家长没说过的情况。如果不清楚家长的意图，就温柔询问。\n"
                "字数严格控制在 50 字以内。直接输出回复正文。"
            )
        
@@ -119,8 +121,8 @@ class ChatBot:
        logger.info(f"🎤 开始处理语音消息: {content}, 坐标: ({vx}, {vy})")
        
        # 1. 长按语音消息
-        self.d.long_click(vx, vy, 1.5)
-        time.sleep(1.0)
+        self.d.long_click(vx, vy, 0.6)
+        time.sleep(0.3)
        
        # 2. CV 模板匹配寻找 "转文字" 按钮
        menu_shot_path = os.path.join(self.screenshot_dir, "t6_menu_shot_convert.jpg")
@@ -212,8 +214,8 @@ class ChatBot:
        # 或者，我们可以点击转换出来的文字区域？
        # 让我们尝试点击原来的坐标。
        
-        self.d.long_click(vx, vy, 1.5)
-        time.sleep(1.0)
+        self.d.long_click(vx, vy, 0.6)
+        time.sleep(0.3)
        
        # 6. CV 模板匹配寻找 "取消转文字" 按钮
        menu_shot_path_cancel = os.path.join(self.screenshot_dir, "t6_menu_shot_cancel.jpg")
@@ -237,6 +239,8 @@ class ChatBot:
        # 0. 清除旧截图
        clean_screenshots_dir()
        
+        last_screen_md5 = None
+        
        while True:
            try:
                logger.info("🔍 正在扫描当前界面内容...")
@@ -246,6 +250,18 @@ class ChatBot:
                logger.info(f"📸 正在截取屏幕... ({datetime.now().strftime('%H:%M:%S')})")
                self.d.screenshot(tmp_shot)
                
+                # 计算 MD5 并去重
+                import hashlib
+                with open(tmp_shot, 'rb') as f:
+                    current_md5 = hashlib.md5(f.read()).hexdigest()
+                    
+                if last_screen_md5 and current_md5 == last_screen_md5:
+                    logger.info("😴 屏幕内容未变，跳过本次循环。")
+                    await asyncio.sleep(CHECK_INTERVAL)
+                    continue
+                    
+                last_screen_md5 = current_md5
+                
                # 2. VLM 分析
                logger.info("🧠 正在调用 VLM 分析图片...")
                result_data = await get_vlm_analysis(tmp_shot)
@@ -257,7 +273,51 @@ class ChatBot:

                # 3. 解析数据构建 dialogue_log
                messages = result_data.get("messages", [])
+                
+                # 🚨 关键修正：按 Y 坐标对消息进行排序，确保时间顺序正确
+                # VLM 返回的顺序可能不准，必须强制按屏幕位置（从上到下）排序
+                messages.sort(key=lambda m: (m.get("center") or m.get("coordinates") or [0, 0])[1])
+                
                input_center = result_data.get("input_box")
+
+                # --- 🔴 红点补救策略 (Red Point Correction) ---
+                # VLM 有时会漏掉红点，我们使用 CV 模板匹配来修正
+                red_point_template = r"d:\dsWork\aiData\WeiXin\Templates\red_point.jpg"
+                red_points = find_all_template_matches(tmp_shot, red_point_template, threshold=0.8)
+                
+                if red_points:
+                    logger.info(f"🔴 CV 检测到 {len(red_points)} 个红点，正在修正语音消息状态...")
+                    for rp in red_points:
+                        rx, ry = rp
+                        # 遍历所有消息，找到距离该红点最近的【语音消息】
+                        # 规则：红点通常在语音消息的右侧，Y轴差异不大
+                        best_match_msg = None
+                        min_dist = 9999
+                        
+                        for msg in messages:
+                            if msg.get("type") == "voice":
+                                coords = msg.get("center") or msg.get("coordinates")
+                                if coords:
+                                    mx, my = coords
+                                    # 检查 Y 轴距离 (红点应该和语音气泡在同一行，容差 50px)
+                                    if abs(my - ry) < 50:
+                                        # 检查 X 轴关系 (红点在语音气泡右侧)
+                                        if rx > mx: 
+                                            dist = ((rx - mx)**2 + (ry - my)**2)**0.5
+                                            if dist < min_dist:
+                                                min_dist = dist
+                                                best_match_msg = msg
+                        
+                        if best_match_msg:
+                            # 只有当距离合理（比如小于 300px，视气泡长度而定，但红点一般紧挨着）
+                            # 考虑到长语音气泡可能很长，中心点在中间，红点在最右边，距离可能较远
+                            # 所以主要依赖 Y 轴对齐和 X 轴方向。
+                            # 这里直接标记
+                            if not best_match_msg.get("is_unread", False):
+                                best_match_msg["is_unread"] = True
+                                logger.info(f"🔴 修正：标记语音消息 {best_match_msg.get('content')} 为未读 (红点坐标: {rp})")
+                # ---------------------------------------------
+
                
                # --- Debug Visualization ---
                try:
@@ -316,7 +376,9 @@ class ChatBot:
                        coords = msg.get("center") or msg.get("coordinates")
                        if coords:
                            msg["coordinates"] = coords
-                            voice_messages.append(msg)
+                            # 只处理“对方”的语音消息，忽略“我”发送的语音
+                            if sender_name != "我":
+                                voice_messages.append(msg)
                            
                        # 在日志中暂时标记为 [语音]，稍后如果处理了会更新
                        # 但为了日志完整性，我们这里先占位
@@ -348,10 +410,9 @@ class ChatBot:
                        # 无论是否未读，都处理
                        text = await self.process_single_voice(v_msg, next_msg, input_y)
                        if text:
-                            # 更新日志中的内容 (这比较复杂，因为 log 是 append 的)
-                            # 简单起见，我们只记录最后一条处理的内容用于回复判断
-                            # 但为了上下文准确，应该更新 dialogue_log
-                            # 这里简化处理：如果是最后一条，我们记录下来
+                            # 直接更新 dialogue_log 对应的条目
+                            dialogue_log[idx] = dialogue_log[idx].replace("[语音]", f"[语音转文字: {text}]")
+                            
                            if v_msg == voice_messages[-1]:
                                processed_voice_content = text
                    
@@ -371,23 +432,23 @@ class ChatBot:
                            except ValueError:
                                next_msg = None
                                
-                            processed_voice_content = await self.process_single_voice(last_voice, next_msg, input_y)
+                            text = await self.process_single_voice(last_voice, next_msg, input_y)
+                            if text:
+                                # 直接更新 dialogue_log 对应的条目
+                                dialogue_log[idx] = dialogue_log[idx].replace("[语音]", f"[语音转文字: {text}]")
+                                processed_voice_content = text
                        else:
-                            logger.info("⚪ 最后一条语音消息已读，跳过处理。")
+                            # 增加更多调试信息，帮助定位为何跳过
+                            sender = last_voice.get("sender", "未知")
+                            content = last_voice.get("content", "")
+                            coords = last_voice.get("coordinates", [])
+                            logger.info(f"⚪ 最后一条语音消息已读，跳过处理。[{sender}, {content}, {coords}]")
+

                # 5. LLM 回复逻辑
                # 只有当有新的语音被处理并识别出文字，或者有新的文本消息时才回复
-                # 这里简化：如果 processed_voice_content 存在，说明我们刚刚处理了一个语音，需要回复
-                # 或者，我们可以检查是否是最后一条消息是对方发的
+                # 既然 dialogue_log 已经更新，我们直接用 history_text
                
-                # 重新构建 history_text，如果有处理出的语音文本，替换掉最后的 [语音]
-                if processed_voice_content:
-                    # 找到最后一条包含 [语音] 的日志并替换
-                    for i in range(len(dialogue_log) - 1, -1, -1):
-                        if "[语音]" in dialogue_log[i]:
-                            dialogue_log[i] = dialogue_log[i].replace("[语音]", f"[语音转文字: {processed_voice_content}]")
-                            break
-                            
                history_text = "\n".join(dialogue_log)
                
                # 判断是否需要回复：
--- a/WeiXin/Templates/red_point.jpg
+++ b/WeiXin/Templates/red_point.jpg
--- a/WeiXin/Templates/wen_zi_input.jpg
+++ b/WeiXin/Templates/wen_zi_input.jpg
--- a/WeiXin/WxUtil.py
+++ b/WeiXin/WxUtil.py
@@ -82,6 +82,7 @@ async def get_vlm_analysis(image_path):

    【核心任务】
    识别图中的【语音消息气泡】和【文本消息气泡】，并区分【发送者】。
+    ⚠️ **特别注意**：必须识别屏幕上**所有**的消息，特别是位于**屏幕最底部**的消息，哪怕只有一部分，也要识别！

    【重要判别规则】
    1. 👤 **发送者 (Sender)**：
@@ -95,13 +96,19 @@ async def get_vlm_analysis(image_path):
           - **极短 (1"-2")**：气泡非常短，形状接近一个小正方形。
           - **极长 (60")**：气泡很长，宽度接近屏幕的一半。
         - **内容**：气泡内**只有一个**表示时长的数字（如 `8"`）和一个声波图标。
+         - **辅助文字**：语音气泡右侧可能会有灰色的“转文字”或“取消转文字”字样，**请忽略这些文字**，依然将该气泡识别为语音消息！
       - **绝对排除**：凡是包含汉字、长句子的气泡，**统统不是**语音消息。
    
    3. 📝 **文本消息 (Text)**：
       - **视觉特征**：气泡内包含汉字、标点符号、表情等文本内容。

-    4. 🔴 **未读状态 (Unread)**：
-       - **特征**：语音气泡右上角或附近有一个明显的**小红点**。
+    4. 🔴 **未读状态 (Unread) - 极度重要！**：
+       - **特征**：语音气泡的右上角（或紧邻右侧）有一个明显的**红色圆形小点**。
+       - **判别**：
+         - 只要看到红色小点，`is_unread` 必须为 **true**。
+         - 如果没有红色小点，`is_unread` 为 false。
+       - **注意**：红点可能很小，请仔细观察！这是判断是否处理的关键依据。即使红点旁边有灰色的“转文字”字样，只要有红点，就是未读！
+

    【坐标系统】
    **必须使用 [0-1000] 的归一化坐标系。**
@@ -133,7 +140,7 @@ async def get_vlm_analysis(image_path):
    注意：
    1. 坐标 `center` 和 `input_box` 必须是 [0-1000] 的归一化坐标。
    2. `status` 判断：如果语音气泡的正下方紧挨着一条文本消息（通常是转换出的文字），则为 `converted`，否则为 `unconverted`。
-    3. `is_unread` 判断：如果有红点则为 true，否则为 false (仅针对语音消息)。
+    3. `is_unread` 判断：务必准确识别红点！如果有红点则为 true。
    4. 请按从上到下的顺序输出所有消息。
    """
    
@@ -392,11 +399,96 @@ def find_template_match(screen_path, template_path, threshold=0.8):
        logger.error(f"Template matching failed: {e}")
        return None

+def find_all_template_matches(screen_path, template_path, threshold=0.8):
+    """
+    Find ALL template matches in a screenshot (OpenCV); returns a list of (x, y) match centers with score >= threshold, or [] on any failure.
+    """
+    try:
+        if not os.path.exists(template_path):
+            logger.error(f"Template file not found: {template_path}")
+            return []
+
+        img = cv2.imread(screen_path)
+        template = cv2.imread(template_path)
+        if img is None or template is None:
+            return []
+
+        h, w = template.shape[:2]
+        res = cv2.matchTemplate(img, template, cv2.TM_CCOEFF_NORMED)
+        
+        # Keep every position whose match score is >= threshold
+        loc = np.where(res >= threshold)
+        
+        points = []
+        for pt in zip(*loc[::-1]):  # np.where yields (rows, cols); reversed so each pt is (x, y)
+            center_x = pt[0] + w // 2
+            center_y = pt[1] + h // 2
+            points.append((center_x, center_y))
+            
+        # Simple de-duplication (a simplified stand-in for non-maximum suppression).
+        # matchTemplate tends to report several adjacent hits around a single target,
+        # so nearby hits are collapsed: a point closer than 10 px on BOTH axes to an
+        # already-kept point is dropped; the first hit seen in each cluster is kept.
+        
+        unique_points = []
+        for p in points:
+            is_close = False
+            for up in unique_points:
+                if abs(p[0] - up[0]) < 10 and abs(p[1] - up[1]) < 10:
+                    is_close = True
+                    break
+            if not is_close:
+                unique_points.append(p)
+                
+        if unique_points:
+            logger.info(f"Found {len(unique_points)} matches for {os.path.basename(template_path)}")
+            
+        return unique_points
+
+    except Exception as e:
+        logger.error(f"find_all_template_matches failed: {e}")
+        return []
+
 def perform_input_action(d, center_point, text, auto_send=True):
    """
    执行输入操作
    """
    try:
+        # --- 新增逻辑：确保处于文字输入模式 ---
+        logger.info("正在检查输入模式...")
+        tmp_check_shot = os.path.join(os.path.dirname(os.path.abspath(__file__)), "temp_input_check.jpg")
+        d.screenshot(tmp_check_shot)
+        
+        wen_zi_template = r"d:\dsWork\aiData\WeiXin\Templates\wen_zi_input.jpg"
+        input_text_template = r"d:\dsWork\aiData\WeiXin\Templates\input_text.jpg"
+        
+        # 1. 检查是否存在 '切换到文字' 图标 (表示当前是语音模式)
+        # 注意：这里假设 wen_zi_input.jpg 是那个“键盘”图标
+        wen_zi_pos = find_template_match(tmp_check_shot, wen_zi_template, threshold=0.8)
+        
+        if wen_zi_pos:
+            logger.info(f"检测到语音模式 (找到切换文字图标: {wen_zi_pos})，点击切换...")
+            d.click(wen_zi_pos[0], wen_zi_pos[1])
+            time.sleep(1.0) # 等待 UI 切换
+        else:
+            # 2. 如果没找到切换图标，假设是文字模式，尝试点击输入区域标识
+            logger.info("未检测到语音模式切换图标，尝试寻找文字输入区域...")
+            input_text_pos = find_template_match(tmp_check_shot, input_text_template, threshold=0.8)
+            if input_text_pos:
+                 logger.info(f"找到文字输入区域标识 (input_text.jpg): {input_text_pos}，点击激活...")
+                 d.click(input_text_pos[0], input_text_pos[1])
+                 time.sleep(0.5)
+            else:
+                 logger.info("未找到特定的输入区域标识，将使用默认坐标或控件查找。")
+        
+        # 清理临时文件
+        if os.path.exists(tmp_check_shot):
+            try:
+                os.remove(tmp_check_shot)
+            except:
+                pass
+        # --- 新增逻辑结束 ---
+
        # 1. 尝试找到原生输入框并输入
        edit_text = d(className="android.widget.EditText")
        input_success = False
--- a/WeiXin/pycache/WxUtil.cpython-310.pyc
+++ b/WeiXin/pycache/WxUtil.cpython-310.pyc