diff --git a/.idea/vcs.xml b/.idea/vcs.xml index 3f1db62..af70679 100644 --- a/.idea/vcs.xml +++ b/.idea/vcs.xml @@ -3,5 +3,6 @@ + \ No newline at end of file diff --git a/Apps/XinDianTu/Config/Setting.py b/Apps/XinDianTu/Config/Setting.py new file mode 100644 index 0000000..5b369e2 --- /dev/null +++ b/Apps/XinDianTu/Config/Setting.py @@ -0,0 +1,31 @@ + +# 采集配置 +# 滑动距离比例 (0.1 ~ 0.9),数值越大滑动幅度越大,建议 0.3-0.5 以避免错过中间内容 +SCROLL_DISTANCE_RATIO = 0.3 +# 最大滑动/翻页次数,达到此次数后停止采集 +MAX_SCROLLS = 100 +# 默认抓取半径(公里),当检测到场站距离超过此值时停止采集 +MAX_CRAWL_DISTANCE = 50 +# 场站去重过期时间(秒),在此时间内重复出现的场站不会再次点击进入详情页 +REDIS_STATION_EXPIRE = 120 + +# 数据库数据保留时长(天),超过此时长的历史数据(is_current=0)将被删除 +DATA_RETENTION_DAYS = 365 + +# 等待时间配置 (秒) +# 点击进入详情页后等待加载的时间 +WAIT_DETAIL_PAGE_LOAD = 2.5 +# 从详情页返回列表页后等待页面刷新的时间 +WAIT_BACK_TO_LIST = 1.5 +# 执行滑动操作后等待页面内容加载和稳定的时间 +WAIT_AFTER_SCROLL = 3.0 + +# 坐标计算与安全防护 +# 屏幕顶部安全排除比例 (0.0~1.0),此比例区域内不进行点击(避开状态栏、筛选栏等) +SAFE_EXCLUDE_RATIO = 0.20 +# 屏幕底部安全排除比例 (0.0~1.0),此比例区域内不进行点击(避开底部导航栏、功能按钮等) +BOTTOM_SAFE_EXCLUDE_RATIO = 0.1 +# 默认回退屏幕宽度,当无法自动获取设备信息时使用 +FALLBACK_WIDTH = 1080 +# 默认回退屏幕高度,当无法自动获取设备信息时使用 +FALLBACK_HEIGHT = 2400 \ No newline at end of file diff --git a/Apps/XinDianTu/Crawler.py b/Apps/XinDianTu/Crawler.py index 96550b5..2d6e428 100644 --- a/Apps/XinDianTu/Crawler.py +++ b/Apps/XinDianTu/Crawler.py @@ -27,7 +27,7 @@ from Config.Config import ( OBS_TMP_PREFIX, CDN_DOMAIN, SCROLL_DISTANCE_RATIO, MAX_SCROLLS, REDIS_STATION_EXPIRE, WAIT_DETAIL_PAGE_LOAD, WAIT_BACK_TO_LIST, WAIT_AFTER_SCROLL, - MAX_CRAWL_DISTANCE + MAX_CRAWL_DISTANCE, TEMP_IMAGE_DIR ) # --- 用户配置区域 --- @@ -128,7 +128,7 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS): image_uuid = str(uuid.uuid4()) # 使用相对路径: 基于当前脚本目录下的 Images 文件夹 base_dir = os.path.dirname(os.path.abspath(__file__)) - save_dir = os.path.join(base_dir, "./Images") + save_dir = TEMP_IMAGE_DIR screenshot_path = take_screenshot(d, image_uuid, save_dir=save_dir) logger.info(f"Step [1/6] 列表页截图已完成: {screenshot_path} (耗时: {time.time() - t_shot:.2f}s)") @@ -383,7 +383,7 @@ async def get_station_list(d, service, uploader, max_scrolls=MAX_SCROLLS): # 使用几何特征识别 "全部时段" 按钮 # 临时截图 temp_uuid = "temp_find_expand" - screenshot_path = take_screenshot(d, temp_uuid, save_dir="Temp") + screenshot_path = take_screenshot(d, temp_uuid, save_dir=TEMP_IMAGE_DIR) # 尝试识别,将调试图片保存到 Images 目录 t_find = time.time() diff --git a/Util/Kit.py b/Apps/XinDianTu/Kit.py similarity index 96% rename from Util/Kit.py rename to Apps/XinDianTu/Kit.py index c7017aa..67aa0cd 100644 --- a/Util/Kit.py +++ b/Apps/XinDianTu/Kit.py @@ -3,7 +3,7 @@ import os import cv2 import numpy as np import time -from Config.Config import BOTTOM_SAFE_EXCLUDE_RATIO +from Config.Config import BOTTOM_SAFE_EXCLUDE_RATIO, TEMP_IMAGE_DIR logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) @@ -29,7 +29,7 @@ def save_image(path, img): return False # 截图 -def take_screenshot(d, image_uuid, save_dir="Screenshot"): +def take_screenshot(d, image_uuid, save_dir=TEMP_IMAGE_DIR): path = f"{save_dir}/{image_uuid}.jpg" os.makedirs(save_dir, exist_ok=True) d.screenshot(path) @@ -61,7 +61,7 @@ def click_image_template(d, template_path, timeout=5.0, threshold=0.8): while time.time() - start_time < timeout: # 临时截图 temp_uuid = "temp_click_check" - screenshot_path = take_screenshot(d, temp_uuid, save_dir="Temp") + screenshot_path = take_screenshot(d, temp_uuid, save_dir=TEMP_IMAGE_DIR) target = read_image(screenshot_path) if target is None: diff --git a/Apps/XinDianTu/Opener.py b/Apps/XinDianTu/Opener.py index 05ee8d7..1656b28 100644 --- a/Apps/XinDianTu/Opener.py +++ b/Apps/XinDianTu/Opener.py @@ -8,7 +8,7 @@ import uiautomator2 as u2 from Util.Kit import take_screenshot, detect_black_agree_button, click_image_template, detect_ad_close_x, detect_any_ad_close, detect_bottom_close_circle from Util.ObsUtil import ObsUploader from Util.XinDianTuReadImageKit import XinDianTuReadImageKit -from Config.Config import OBS_TMP_PREFIX, CDN_DOMAIN +from Config.Config import OBS_TMP_PREFIX, CDN_DOMAIN, TEMP_IMAGE_DIR # pip install adbutils # 配置日志输出,方便调试和监控 @@ -29,7 +29,7 @@ async def check_and_close_ad(d): image_uuid = str(uuid.uuid4()) # 使用相对路径: 基于当前脚本目录下的 Images 文件夹 base_dir = os.path.dirname(os.path.abspath(__file__)) - save_dir = os.path.join(base_dir, "./Images") + save_dir = TEMP_IMAGE_DIR screenshot_path = take_screenshot(d, image_uuid, save_dir=save_dir) logger.info(f"Step [广告检测截图] 耗时: {time.time() - t1:.4f}s") diff --git a/Util/XinDianTuReadImageKit.py b/Apps/XinDianTu/XinDianTuReadImageKit.py similarity index 100% rename from Util/XinDianTuReadImageKit.py rename to Apps/XinDianTu/XinDianTuReadImageKit.py diff --git a/Config/Config.py b/Config/Config.py index 46974e0..db0ed57 100644 --- a/Config/Config.py +++ b/Config/Config.py @@ -1,52 +1,21 @@ -# 开发环境不同,配置信息不同 -# Doris V4 -# DORIS_HOST = "10.10.14.204" -# DORIS_PORT = 9030 -# DORIS_FENODES = "10.10.14.204:8030" -# REDIS_HOST = '10.10.14.14' -# REDIS_PASSWORD = None # 如果没有密码则设为 None -DORIS_HOST = "www.hzkjai.com" -DORIS_PORT = 27025 -DORIS_FENODES = "www.hzkjai.com:27024" -REDIS_HOST = '127.0.0.1' -REDIS_PASSWORD = "DsideaL147258369" +# 黄海在公司内网开发时的配置信息 +DORIS_HOST = "10.10.14.204" +DORIS_PORT = 9030 +DORIS_FENODES = "10.10.14.204:8030" +REDIS_HOST = '10.10.14.14' +REDIS_PASSWORD = None # 如果没有密码则设为 None -# 采集配置 -# 滑动距离比例 (0.1 ~ 0.9),数值越大滑动幅度越大,建议 0.3-0.5 以避免错过中间内容 -SCROLL_DISTANCE_RATIO = 0.3 -# 最大滑动/翻页次数,达到此次数后停止采集 -MAX_SCROLLS = 100 -# 默认抓取半径(公里),当检测到场站距离超过此值时停止采集 -MAX_CRAWL_DISTANCE = 50 -# 场站去重过期时间(秒),在此时间内重复出现的场站不会再次点击进入详情页 -REDIS_STATION_EXPIRE = 120 - -# 数据库数据保留时长(天),超过此时长的历史数据(is_current=0)将被删除 -DATA_RETENTION_DAYS = 365 - -# 等待时间配置 (秒) -# 点击进入详情页后等待加载的时间 -WAIT_DETAIL_PAGE_LOAD = 2.5 -# 从详情页返回列表页后等待页面刷新的时间 -WAIT_BACK_TO_LIST = 1.5 -# 执行滑动操作后等待页面内容加载和稳定的时间 -WAIT_AFTER_SCROLL = 3.0 - -# 坐标计算与安全防护 -# 屏幕顶部安全排除比例 (0.0~1.0),此比例区域内不进行点击(避开状态栏、筛选栏等) -SAFE_EXCLUDE_RATIO = 0.20 -# 屏幕底部安全排除比例 (0.0~1.0),此比例区域内不进行点击(避开底部导航栏、功能按钮等) -BOTTOM_SAFE_EXCLUDE_RATIO = 0.1 -# 默认回退屏幕宽度,当无法自动获取设备信息时使用 -FALLBACK_WIDTH = 1080 -# 默认回退屏幕高度,当无法自动获取设备信息时使用 -FALLBACK_HEIGHT = 2400 +# 黄海在家开发时的配置信息 +#DORIS_HOST = "www.hzkjai.com" +#DORIS_PORT = 27025 +#DORIS_FENODES = "www.hzkjai.com:27024" +#REDIS_HOST = '127.0.0.1' +#REDIS_PASSWORD = "DsideaL147258369" # 视觉模型配置 VL_MODEL_NAME = "qwen3-vl-flash" VL_MODEL_NAME_AD = "qwen-vl-max" - # 华为云配置 OBS_AK = "WAFBGJACKDOQZDH1MKZ1" OBS_SK = "dlWTUbqgCICaYJG3n0Rot4jXaen2HnfFtMVxiPEo" @@ -75,10 +44,12 @@ doris = { "database": DORIS_DATABASE } - # REDIS 配置 REDIS_DB = 2 REDIS_DECODE_RESPONSES = True REDIS_PORT = 18890 REDIS_MAX_CONNECTIONS = 200 +# 临时图片存储路径 +TEMP_IMAGE_DIR = r"d:\dsWork\aiData\Output" + diff --git a/Config/__pycache__/Config.cpython-310.pyc b/Config/__pycache__/Config.cpython-310.pyc index 322a907..afc66d1 100644 Binary files a/Config/__pycache__/Config.cpython-310.pyc and b/Config/__pycache__/Config.cpython-310.pyc differ diff --git a/Apps/Json/Ref/T6_PaChongGaoDe.py b/Json/Ref/T6_PaChongGaoDe.py similarity index 100% rename from Apps/Json/Ref/T6_PaChongGaoDe.py rename to Json/Ref/T6_PaChongGaoDe.py diff --git a/Apps/Json/Ref/T7_PaChongGaoDePatch.py b/Json/Ref/T7_PaChongGaoDePatch.py similarity index 100% rename from Apps/Json/Ref/T7_PaChongGaoDePatch.py rename to Json/Ref/T7_PaChongGaoDePatch.py diff --git a/Apps/Json/amap_cookies.json b/Json/amap_cookies.json similarity index 100% rename from Apps/Json/amap_cookies.json rename to Json/amap_cookies.json diff --git a/Output/current_20260112_080855.jpg b/Output/current_20260112_080855.jpg new file mode 100644 index 0000000..5a2c816 Binary files /dev/null and b/Output/current_20260112_080855.jpg differ diff --git a/run.py b/T2_XinDianTu.py similarity index 100% rename from run.py rename to T2_XinDianTu.py diff --git a/Temp/DevTools/current_20260111_170841.jpg b/Temp/DevTools/current_20260111_170841.jpg deleted file mode 100644 index cbf7c07..0000000 Binary files a/Temp/DevTools/current_20260111_170841.jpg and /dev/null differ diff --git a/Temp/DevTools/current_20260111_171025.jpg b/Temp/DevTools/current_20260111_171025.jpg deleted file mode 100644 index 1b8cfba..0000000 Binary files a/Temp/DevTools/current_20260111_171025.jpg and /dev/null differ diff --git a/Test/1.jpg b/Test/1.jpg deleted file mode 100644 index b5b33da..0000000 Binary files a/Test/1.jpg and /dev/null differ diff --git a/Test/1_bw.jpg b/Test/1_bw.jpg deleted file mode 100644 index 6157cf3..0000000 Binary files a/Test/1_bw.jpg and /dev/null differ diff --git a/Test/1_bw_bytes.jpg b/Test/1_bw_bytes.jpg deleted file mode 100644 index 6157cf3..0000000 Binary files a/Test/1_bw_bytes.jpg and /dev/null differ diff --git a/Test/2.jpg b/Test/2.jpg deleted file mode 100644 index b80d77b..0000000 Binary files a/Test/2.jpg and /dev/null differ diff --git a/Test/TestConflict.py b/Test/TestConflict.py deleted file mode 100644 index f64cbca..0000000 --- a/Test/TestConflict.py +++ /dev/null @@ -1,29 +0,0 @@ -# coding=utf-8 -import os -import sys -import time -import numpy as np - -# Add project root to sys.path -current_dir = os.path.dirname(os.path.abspath(__file__)) -# project_root = os.path.dirname(current_dir) -# if project_root not in sys.path: -# sys.path.append(project_root) - -print("Importing PaddleOCR...") -from paddleocr import PaddleOCR -print("Importing LlmUtil...") -# from Util.LlmUtil import get_llm_response -print("Imports done.") - -def main(): - image_path = os.path.join(current_dir, "2.jpg") - print(f"Initializing OCR...") - ocr = PaddleOCR(use_textline_orientation=True, lang="ch") - print("Running OCR...") - result = ocr.ocr(image_path) - print("OCR Done.") - print(result) - -if __name__ == "__main__": - main() diff --git a/Test/TestOCR.py b/Test/TestOCR.py deleted file mode 100644 index 2dfec1f..0000000 --- a/Test/TestOCR.py +++ /dev/null @@ -1,119 +0,0 @@ -# coding=utf-8 -import os -import sys -from paddleocr import PaddleOCR -import numpy as np - -def test_ocr(): - # 1. 初始化 PaddleOCR - print("正在初始化 PaddleOCR 模型...") - try: - ocr = PaddleOCR(use_textline_orientation=True, lang="ch") - except Exception as e: - print(f"初始化失败: {e}") - return - - # 2. 准备测试图片 - current_dir = os.path.dirname(os.path.abspath(__file__)) - # 优先查找 2.jpg,如果不存在则查找 1.jpg - image_path = os.path.join(current_dir, "2.jpg") - if not os.path.exists(image_path): - image_path = os.path.join(current_dir, "1.jpg") - - if not os.path.exists(image_path): - print(f"错误: 未找到测试图片: {image_path}") - print("请将测试图片命名为 2.jpg 或 1.jpg 并放置在 Test 目录下。") - return - - print(f"正在识别图片: {image_path}") - - # 3. 执行识别 - try: - # result 是一个列表,通常包含一个 OCRResult 对象 (新版) 或列表 (旧版) - result = ocr.ocr(image_path) - - except Exception as e: - print(f"识别过程发生异常: {e}") - return - - # 4. 输出结果 - print("\n" + "="*20 + " 识别结果 " + "="*20) - sys.stdout.flush() - - # 同时输出到文件,方便查看 - output_file = os.path.join(current_dir, "ocr_output.txt") - with open(output_file, "w", encoding="utf-8") as f_out: - if not result: - msg = "未识别到任何文字 (Result is empty)。" - print(msg) - f_out.write(msg + "\n") - else: - # 取出第一个结果(通常是单张图片的结果) - res = result[0] - - # 检查是否为 None - if res is None: - msg = "未识别到任何文字 (Result[0] is None)。" - print(msg) - f_out.write(msg + "\n") - - # 情况 A: 新版 PaddleX OCRResult 对象 (表现为字典或对象) - elif hasattr(res, 'get') and 'rec_texts' in res: - texts = res.get('rec_texts', []) - scores = res.get('rec_scores', []) - - if not texts: - msg = "未识别到任何文字 (rec_texts is empty)。" - print(msg) - f_out.write(msg + "\n") - else: - for i, text in enumerate(texts): - score = scores[i] if i < len(scores) else 0.0 - msg = f"行 {i+1}: {text} (置信度: {score:.4f})" - print(msg) - f_out.write(msg + "\n") - - # 情况 B: 对象属性访问 - elif hasattr(res, 'rec_texts'): - texts = res.rec_texts - scores = res.rec_scores - - if not texts: - msg = "未识别到任何文字 (rec_texts is empty)。" - print(msg) - f_out.write(msg + "\n") - else: - for i, text in enumerate(texts): - score = scores[i] if i < len(scores) else 0.0 - msg = f"行 {i+1}: {text} (置信度: {score:.4f})" - print(msg) - f_out.write(msg + "\n") - - # 情况 C: 旧版 list of lists 结构 - elif isinstance(res, list): - for idx, line in enumerate(res): - try: - if len(line) >= 2 and isinstance(line[1], (tuple, list)): - text, score = line[1] - msg = f"行 {idx+1}: {text} (置信度: {score:.4f})" - print(msg) - f_out.write(msg + "\n") - else: - msg = f"行 {idx+1}: {line} (格式未知)" - print(msg) - f_out.write(msg + "\n") - except Exception as e: - print(f"行 {idx+1} 解析失败: {e}") - - else: - msg = f"无法解析结果结构: {type(res)}" - print(msg) - print(f"Result content: {res}") - f_out.write(msg + "\n") - f_out.write(f"Result content: {res}\n") - - print("="*50) - sys.stdout.flush() - -if __name__ == "__main__": - test_ocr() diff --git a/Test/TestOcrLlm.py b/Test/TestOcrLlm.py deleted file mode 100644 index 1d5d4d3..0000000 --- a/Test/TestOcrLlm.py +++ /dev/null @@ -1,173 +0,0 @@ -# coding=utf-8 -import os -import sys -import time -import asyncio -import json - -# Add project root to sys.path -current_dir = os.path.dirname(os.path.abspath(__file__)) -project_root = os.path.dirname(current_dir) -if project_root not in sys.path: - sys.path.append(project_root) - -from paddleocr import PaddleOCR -from Util.LlmUtil import get_llm_response -from Util.OcrParser import OcrParser -import re - -LOG_FILE = os.path.join(current_dir, "ocr_llm_debug.txt") - -def log(msg): - print(msg) - sys.stdout.flush() - with open(LOG_FILE, "a", encoding="utf-8") as f: - f.write(msg + "\n") - -# ... imports ... -def run_ocr_sync(): - image_path = os.path.join(current_dir, "2.jpg") - if not os.path.exists(image_path): - image_path = os.path.join(current_dir, "1.jpg") - - log(f"Testing OCR + LLM Pipeline on: {image_path}") - log("-" * 50) - - # --- Step 1: PaddleOCR --- - t_start = time.time() - - log("Initializing PaddleOCR...") - t_init_start = time.time() - try: - # 尝试使用轻量级模型 (Mobile) 以提升速度 - # ocr_version='PP-OCRv4' 通常默认是 mobile - ocr = PaddleOCR(use_textline_orientation=True, lang="ch", ocr_version='PP-OCRv4') - except Exception as e: - log(f"PaddleOCR Init Failed: {e}") - return None, None - - t_init_end = time.time() - log(f"PaddleOCR Init Time: {t_init_end - t_init_start:.4f}s") - - log("Running OCR Inference...") - t_ocr_start = time.time() - try: - result = ocr.ocr(image_path) - except Exception as e: - log(f"OCR Inference Failed: {e}") - return None, None - t_ocr_end = time.time() - - ocr_text_lines = [] - - # Handle different result structures - if not result: - log("OCR returned empty result.") - else: - res = result[0] - if res is None: - log("OCR result[0] is None.") - elif hasattr(res, 'get') and 'rec_texts' in res: - ocr_text_lines = res.get('rec_texts', []) - elif hasattr(res, 'rec_texts'): - ocr_text_lines = res.rec_texts - elif isinstance(res, list): - for line in res: - if len(line) >= 2 and isinstance(line[1], (tuple, list)): - ocr_text_lines.append(line[1][0]) - - ocr_text_block = "\n".join(ocr_text_lines) - log(f"OCR Result ({t_ocr_end - t_ocr_start:.4f}s):") - log(ocr_text_block) - log("-" * 50) - - return ocr_text_lines, ocr_text_block, (t_ocr_start, t_ocr_end) - -async def run_parsing_comparison(ocr_text_lines, ocr_text_block, timing_ocr): - t_ocr_start, t_ocr_end = timing_ocr - ocr_duration = t_ocr_end - t_ocr_start - - # --- Mode 1: Regex Parsing --- - log("Running Regex Parsing...") - t_regex_start = time.time() - try: - regex_data = OcrParser.parse(ocr_text_lines) - log("\nParsed Data (Regex):") - log(json.dumps(regex_data, indent=2, ensure_ascii=False)) - except Exception as e: - log(f"Regex Parsing Failed: {e}") - t_regex_end = time.time() - regex_duration = t_regex_end - t_regex_start - log(f"Regex Parsing Time: {regex_duration:.4f}s") - log("-" * 50) - - # --- Mode 2: LLM Parsing --- - log("Running LLM Parsing...") - - prompt = f""" - You are a data extraction assistant. Below is the OCR text recognized from a charging station list card. - Please extract the structured data and return it ONLY as a JSON object (no markdown, no extra text). - - Fields to extract: - - station_name: (String) Name of the charging station. - - distance: (String) Distance info (e.g., "7.4km"). - - price: (String) Price info (e.g., "0.7111/度"). - - tags: (List[String]) Any tags like "快", "闲3/4", "组团", "2倍积分", "P", etc. - - parking_info: (String) Parking related info. - - OCR Text: - {ocr_text_block} - """ - - t_llm_start = time.time() - response_text = "" - try: - log("Starting LLM request...") - async for chunk in get_llm_response(prompt, stream=True): - print(chunk, end='', flush=True) - response_text += chunk - - print("\n") - log("LLM request finished.") - - t_llm_end = time.time() - - log(f"\nLLM Response ({t_llm_end - t_llm_start:.4f}s):") - log(response_text) - - try: - clean_text = response_text.replace("```json", "").replace("```", "").strip() - data = json.loads(clean_text) - log("\nParsed JSON Data:") - log(json.dumps(data, indent=2, ensure_ascii=False)) - except json.JSONDecodeError: - log("\nFailed to parse JSON directly.") - - except Exception as e: - log(f"LLM Error: {e}") - t_llm_end = time.time() - - log("-" * 50) - log(f"Summary:") - log(f"OCR Time: {ocr_duration:.4f}s") - log(f"Regex Parsing Time: {regex_duration:.4f}s") - log(f"LLM Parsing Time: {t_llm_end - t_llm_start:.4f}s") - - total_regex = ocr_duration + regex_duration - total_llm = ocr_duration + (t_llm_end - t_llm_start) - - log(f"Total Pipeline (OCR+Regex): {total_regex:.4f}s") - log(f"Total Pipeline (OCR+LLM): {total_llm:.4f}s") - -def main(): - # Clear log file - with open(LOG_FILE, "w", encoding="utf-8") as f: - f.write("Starting TestOcrLlm...\n") - - ocr_lines, ocr_text, timing = run_ocr_sync() - if ocr_lines: - asyncio.run(run_parsing_comparison(ocr_lines, ocr_text, timing)) - -if __name__ == "__main__": - main() - diff --git a/Test/TestOcrRegex.py b/Test/TestOcrRegex.py deleted file mode 100644 index fc2bcca..0000000 --- a/Test/TestOcrRegex.py +++ /dev/null @@ -1,107 +0,0 @@ -# coding=utf-8 -import re -import json - -def parse_ocr_lines(lines): - result = { - "station_name": "", - "distance": "", - "price": None, - "piles": [], - "parking": "", - "tags": [] - } - - # Pre-process lines: remove confidence scores for parsing - clean_lines = [] - for line in lines: - # Remove prefix "行 X: " - text = re.sub(r"^行\s*\d+:\s*", "", line) - # Remove suffix "(置信度: ...)" - text = re.sub(r"\s*\(置信度:.*?\)\s*$", "", text) - clean_lines.append(text.strip()) - - # 1. Station Name - for line in clean_lines: - # Skip empty or short noise - if len(line) < 2: continue - # Skip if starts with special chars - if line.startswith("(") or line.startswith("("): continue - # Skip if contains typical attribute keywords - if "km" in line.lower() or "/度" in line or "¥" in line: continue - # Skip if strictly numeric (unlikely for name) - if re.match(r"^\d+$", line): continue - - result["station_name"] = line - break - - # 2. Distance - for line in clean_lines: - m = re.search(r"(\d+(\.\d+)?)\s*km", line, re.IGNORECASE) - if m: - result["distance"] = m.group(0) - break - - # 3. Price (Standard) - # Look for "0.xxxx/度" - for line in clean_lines: - m = re.search(r"(\d+\.\d+)(?=/度)", line) - if m: - result["price"] = float(m.group(1)) - break - - # 4. Piles - current_type = "未知" - for line in clean_lines: - if "快" in line: current_type = "快" - elif "慢" in line: current_type = "慢" - elif "超" in line: current_type = "超" - - # Match "闲3/4" or "3/4" - # Regex: optional "闲", int, /, int - m = re.search(r"(?:闲)?(\d+)/(\d+)", line) - if m: - # Check if it looks like a price (contains dot) - if "." in line: continue - - free = int(m.group(1)) - total = int(m.group(2)) - result["piles"].append({ - "type": current_type, - "free": free, - "total": total - }) - - # 5. Parking - for line in clean_lines: - if "停车" in line: - # Clean up leading punctuation - cleaned = re.sub(r"^[·\.\sP]+", "", line) - result["parking"] = cleaned - break - - return result - -if __name__ == "__main__": - # User provided sample data - sample_input = [ - "行 1: 长春市绿园区雁鸣湖公共充电站 (置信度: 0.9963)", - "行 2: (… (置信度: 0.6244)", - "行 3: 7.4km (置信度: 0.9975)", - "行 4: 0.7111/度 (置信度: 0.9450)", - "行 5: 快 (置信度: 0.9987)", - "行 6: 闲3/4 (置信度: 0.9941)", - "行 7: ¥ (置信度: 0.8734)", - "行 8: 组团 (置信度: 0.9995)", - "行 9: 2倍积分 (置信度: 0.9997)", - "行 10: P (置信度: 0.9929)", - "行 11: ·收费停车:以场地实际收费为准 (置信度: 0.9736)" - ] - - print("--- Input Data ---") - for l in sample_input: - print(l) - - parsed = parse_ocr_lines(sample_input) - print("\n--- Parsed Result (Regex) ---") - print(json.dumps(parsed, ensure_ascii=False, indent=2)) diff --git a/Test/TestPaddleOCRKit.py b/Test/TestPaddleOCRKit.py deleted file mode 100644 index a933f2b..0000000 --- a/Test/TestPaddleOCRKit.py +++ /dev/null @@ -1,118 +0,0 @@ -# coding=utf-8 -import os -import sys -import cv2 -import json -import time - -# Add project root to sys.path -current_dir = os.path.dirname(os.path.abspath(__file__)) -project_root = os.path.dirname(current_dir) -if project_root not in sys.path: - sys.path.append(project_root) - -from Util import Kit -from Util.PaddleOCRKit import get_ocr_kit - -def test_integration(): - image_path = os.path.join(current_dir, "2.jpg") - if not os.path.exists(image_path): - print(f"Image not found: {image_path}") - return - - print(f"Testing integration on: {image_path}") - - # 1. Generate Metadata using Kit - print("Running Kit.crop_cards_from_image...") - Kit.crop_cards_from_image(image_path, output_dir=current_dir) - - json_path = image_path.replace(".jpg", ".json") - json_metadata = {} - - if os.path.exists(json_path): - with open(json_path, 'r', encoding='utf-8') as f: - json_metadata = json.load(f) - - if not json_metadata.get("cards"): - print("Kit failed to find cards (expected for single card image). Mocking metadata.") - img = cv2.imread(image_path) - h, w = img.shape[:2] - json_metadata = { - "cards": [ - { - "id": 1, - "rect": [0, 0, w, h], - "click_point": [w//2, h//2] - } - ] - } - - print(f"Loaded metadata with {len(json_metadata.get('cards', []))} cards.") - - # 2. Run OCR Logic (Simulating Crawler.py) - print("Running OCR Logic...") - - ocr_kit = get_ocr_kit() - - original_img = cv2.imread(image_path) - stations = [] - - t_start = time.time() - - if "cards" in json_metadata and original_img is not None: - h_img, w_img = original_img.shape[:2] - - for card in json_metadata["cards"]: - rect = card.get("rect") - if not rect: continue - x1, y1, x2, y2 = rect - - # 边界检查 - x1 = max(0, min(x1, w_img)) - x2 = max(0, min(x2, w_img)) - y1 = max(0, min(y1, h_img)) - y2 = max(0, min(y2, h_img)) - - if x2 <= x1 or y2 <= y1: continue - - # 裁剪卡片 - cropped_card = original_img[y1:y2, x1:x2] - - # 识别 - parsed_data = ocr_kit.recognize(cropped_card) - print(f"Parsed Data: {json.dumps(parsed_data, indent=2, ensure_ascii=False)}") - - if parsed_data and parsed_data.get("station_name"): - # 格式化数据 - piles_list = parsed_data.get("piles", []) - piles_str_parts = [] - for p in piles_list: - p_type = p.get("type", "") - p_free = p.get("free", 0) - p_total = p.get("total", 0) - piles_str_parts.append(f"{p_type}:{p_free}/{p_total}") - - piles_str = " ".join(piles_str_parts) - - station_info = { - "station_name": parsed_data.get("station_name"), - "price": str(parsed_data.get("price")) if parsed_data.get("price") is not None else "", - "piles": piles_str, - "distance": parsed_data.get("distance", ""), - "uia_center_x": card["click_point"][0], - "uia_center_y": card["click_point"][1], - "tags": parsed_data.get("tags", []), - "parking_info": parsed_data.get("parking", "") - } - stations.append(station_info) - - t_end = time.time() - - print("-" * 50) - print(f"Total Processing Time: {t_end - t_start:.4f}s") - print(f"Found {len(stations)} stations:") - for s in stations: - print(json.dumps(s, indent=2, ensure_ascii=False)) - -if __name__ == "__main__": - test_integration() diff --git a/Test/__init__.py b/Test/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/Test/__pycache__/__init__.cpython-310.pyc b/Test/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index d573e7a..0000000 Binary files a/Test/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/Test/home.jpg b/Test/home.jpg deleted file mode 100644 index b0ab130..0000000 Binary files a/Test/home.jpg and /dev/null differ diff --git a/Test/ocr_llm_debug.txt b/Test/ocr_llm_debug.txt deleted file mode 100644 index 6ffccbe..0000000 --- a/Test/ocr_llm_debug.txt +++ /dev/null @@ -1,78 +0,0 @@ -Starting TestOcrLlm... -Testing OCR + LLM Pipeline on: D:\dsWork\YltProject\dsCrawler\Test\2.jpg --------------------------------------------------- -Initializing PaddleOCR... -PaddleOCR Init Time: 1.7990s -Running OCR Inference... -OCR Result (1.7636s): -长春市绿园区雁鸣湖公共充电站 -.. -17.4km -0.7111/度 - -快 -闲3/4 -组团 -2倍积分 -P -收费停车:以场地实际收费为准 --------------------------------------------------- -Running Regex Parsing... - -Parsed Data (Regex): -{ - "station_name": "长春市绿园区雁鸣湖公共充电站", - "distance": "17.4km", - "price": 0.7111, - "piles": [ - { - "type": "快", - "free": 3, - "total": 4 - } - ], - "parking": "收费停车:以场地实际收费为准", - "tags": [ - "组团", - "2倍积分", - "P" - ] -} -Regex Parsing Time: 0.0015s --------------------------------------------------- -Running LLM Parsing... -Starting LLM request... -LLM request finished. - -LLM Response (3.6250s): -```json -{ - "station_name": "长春市绿园区雁鸣湖公共充电站", - "distance": "17.4km", - "price": "0.7111/度", - "tags": ["快", "闲3/4", "组团", "2倍积分", "P"], - "parking_info": "收费停车:以场地实际收费为准" -} -``` - -Parsed JSON Data: -{ - "station_name": "长春市绿园区雁鸣湖公共充电站", - "distance": "17.4km", - "price": "0.7111/度", - "tags": [ - "快", - "闲3/4", - "组团", - "2倍积分", - "P" - ], - "parking_info": "收费停车:以场地实际收费为准" -} --------------------------------------------------- -Summary: -OCR Time: 1.7636s -Regex Parsing Time: 0.0015s -LLM Parsing Time: 3.6250s -Total Pipeline (OCR+Regex): 1.7652s -Total Pipeline (OCR+LLM): 5.3886s diff --git a/Test/ocr_llm_output.txt b/Test/ocr_llm_output.txt deleted file mode 100644 index 6206456..0000000 Binary files a/Test/ocr_llm_output.txt and /dev/null differ diff --git a/Test/ocr_llm_result.txt b/Test/ocr_llm_result.txt deleted file mode 100644 index dad011f..0000000 --- a/Test/ocr_llm_result.txt +++ /dev/null @@ -1,27 +0,0 @@ -OCR Result: -长春市绿园区雁鸣湖公共充电站 -(… -7.4km -0.7111/度 -快 -闲3/4 -¥ -组团 -2倍积分 -P -·收费停车:以场地实际收费为准 - -LLM Response: -{ - "station_name": "长春市绿园区雁鸣湖公共充电站", - "distance": "7.4km", - "price": "0.7111/度", - "tags": ["快", "闲3/4", "组团", "2倍积分", "P"], - "parking_info": "收费停车:以场地实际收费为准" -} - --------------------------------------------------- -Summary: -OCR Time: 15.7442s -LLM Time: 3.1676s -Total Pipeline Time (excluding init): 18.9118s \ No newline at end of file diff --git a/Test/ocr_output.txt b/Test/ocr_output.txt deleted file mode 100644 index b7c5740..0000000 --- a/Test/ocr_output.txt +++ /dev/null @@ -1,11 +0,0 @@ -行 1: 长春市绿园区雁鸣湖公共充电站 (置信度: 0.9963) -行 2: (… (置信度: 0.6244) -行 3: 7.4km (置信度: 0.9975) -行 4: 0.7111/度 (置信度: 0.9450) -行 5: 快 (置信度: 0.9987) -行 6: 闲3/4 (置信度: 0.9941) -行 7: ¥ (置信度: 0.8734) -行 8: 组团 (置信度: 0.9995) -行 9: 2倍积分 (置信度: 0.9997) -行 10: P (置信度: 0.9929) -行 11: ·收费停车:以场地实际收费为准 (置信度: 0.9736) diff --git a/Test/qbsd_bw.jpg b/Test/qbsd_bw.jpg deleted file mode 100644 index 0a0ecf7..0000000 Binary files a/Test/qbsd_bw.jpg and /dev/null differ diff --git a/Test/test02.py b/Test/test02.py deleted file mode 100644 index 9220fd5..0000000 --- a/Test/test02.py +++ /dev/null @@ -1,9 +0,0 @@ -import uiautomator2 as u2 - -d = u2.connect() - -d.reset_uiautomator() -d.sleep(3) -xml = d.dump_hierarchy() # 导出当前UI结构(XML) -with open("ui_hierarchy5_2.xml", "w", encoding="utf-8") as f: - f.write(xml) \ No newline at end of file diff --git a/Test/test03.py b/Test/test03.py deleted file mode 100644 index 56ee179..0000000 --- a/Test/test03.py +++ /dev/null @@ -1,14 +0,0 @@ -from Util.ObsUtil import ObsUploader - -from Config.Config import OBS_TMP_PREFIX - -uploader = ObsUploader() - -object_key = OBS_TMP_PREFIX + "/1d3eb56c-942e-42d3-8993-f1ea8ad7d97b.jpg" -success, result = uploader.upload_file( - object_key=object_key, - file_path="Screenshot/1d3eb56c-942e-42d3-8993-f1ea8ad7d97b.jpg" -) - -print(success) -print(result) diff --git a/Test/test04.py b/Test/test04.py deleted file mode 100644 index dd4bd0d..0000000 --- a/Test/test04.py +++ /dev/null @@ -1,12 +0,0 @@ -import uiautomator2 as u2 - -d = u2.connect() - -# 启动应用(以微信为例) -d.app_start("com.tencent.mm") - -# 截微信图保存 -d.screenshot("home.jpg") - -# 关闭应用 -d.app_stop("com.tencent.mm") \ No newline at end of file diff --git a/Test/ui_hierarchy5_2.xml b/Test/ui_hierarchy5_2.xml deleted file mode 100644 index 96a6655..0000000 --- a/Test/ui_hierarchy5_2.xml +++ /dev/null @@ -1,45 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/Sql/doris_ddl.sql b/Tools/Sql/doris_ddl.sql similarity index 100% rename from Sql/doris_ddl.sql rename to Tools/Sql/doris_ddl.sql diff --git a/Sql/doris_ddl.py b/Tools/T1_CreateTable.py similarity index 93% rename from Sql/doris_ddl.py rename to Tools/T1_CreateTable.py index 6fcf6b3..11907fa 100644 --- a/Sql/doris_ddl.py +++ b/Tools/T1_CreateTable.py @@ -8,7 +8,6 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(level logger = logging.getLogger(__name__) # 将项目根目录添加到系统路径,以便能够导入 DbKit 和 Config 等模块 -# DbKit/doris_ddl.py -> DbKit -> Root (2 levels up) project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) if project_root not in sys.path: sys.path.append(project_root) @@ -31,7 +30,7 @@ async def init_tables(): logger.info("正在读取 SQL 文件...") # SQL 文件路径:相对于当前脚本所在目录 (DbKit/) - sql_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '', 'doris_ddl.sql') + sql_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '', 'Sql/doris_ddl.sql') logger.info(f"使用 SQL 文件: {sql_path}") diff --git a/Util/DevTools/screenshot_test.py b/Tools/T2_ScreenShot.py similarity index 74% rename from Util/DevTools/screenshot_test.py rename to Tools/T2_ScreenShot.py index 3c26658..46451b8 100644 --- a/Util/DevTools/screenshot_test.py +++ b/Tools/T2_ScreenShot.py @@ -5,6 +5,8 @@ import os from datetime import datetime import uiautomator2 as u2 +from Config.Config import TEMP_IMAGE_DIR + # 配置日志输出 logging.basicConfig( level=logging.INFO, @@ -22,14 +24,8 @@ async def take_screenshot(): # 执行截图 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - filename = f"current_{timestamp}.jpg" - - # 获取目标目录: 项目根目录下的 Temp/DevTools - project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - target_dir = os.path.join(project_root, "Temp", "DevTools") - os.makedirs(target_dir, exist_ok=True) - filepath = os.path.join(target_dir, filename) - + filename = f"Screenshot_{timestamp}.jpg" + filepath = os.path.join(TEMP_IMAGE_DIR, filename) logger.info(f"正在拍照(截图)并保存至: {filepath}") d.screenshot(filepath) diff --git a/Util/DevTools/crop_test.py b/Util/DevTools/crop_test.py deleted file mode 100644 index 7c57f6d..0000000 --- a/Util/DevTools/crop_test.py +++ /dev/null @@ -1,73 +0,0 @@ -# -*- coding: utf-8 -*- -""" -基于行扫描和统计特征的卡片截取工具 -""" -import sys -import os - -# 添加项目根目录到系统路径 -project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -sys.path.append(project_root) -from Util import Kit - -def clean_directory(dir_path): - """ - 清理现场:删除目录下除纯数字.jpg以外的所有文件 - """ - print(f"Cleaning directory: {dir_path}") - if not os.path.exists(dir_path): - return - - for filename in os.listdir(dir_path): - path = os.path.join(dir_path, filename) - if not os.path.isfile(path): - continue - - name, ext = os.path.splitext(filename) - # 保留条件:后缀是 .jpg 且文件名是纯数字 - # [修改] 同时保留 _flag.jpg, _vl.jpg 供查看 - # [修改] 保留 .json 文件 - if ext.lower() == ".json": - continue - - if ext.lower() == ".jpg" and (name.isdigit() or name.endswith("_flag") or name.endswith("_vl")): - continue - - # 清理旧的 _for_vl.jpg - if name.endswith("_for_vl"): - pass # Let it fall through to delete - else: - # 其他非生成的文件,可能需要保留吗? - # 这里的逻辑是清理 output 目录,假设该目录下只有生成的图片 - # 如果是原始图片(比如 1.jpg),不能删! - pass - - # 原始图片保护 (文件名比较短,通常是 1.jpg, 2.jpg 等) - if len(name) <= 2 and name.isdigit(): - continue - - try: - os.remove(path) - print(f" Deleted: {filename}") - except Exception as e: - print(f" Error deleting {filename}: {e}") - -def crop_cards(img_path): - Kit.crop_cards_from_image(img_path) - -if __name__ == "__main__": - test_files = [ - r"d:\dsWork\dsProject\dsCrawler\Tools\Images\1.jpg", - r"d:\dsWork\dsProject\dsCrawler\Tools\Images\2.jpg", - r"d:\dsWork\dsProject\dsCrawler\Tools\Images\3.jpg", - r"d:\dsWork\dsProject\dsCrawler\Tools\Images\4.jpg" - ] - - # 在测试前清理现场 - if test_files: - # 假设所有图片都在同一个文件夹 - target_dir = os.path.dirname(test_files[0]) - clean_directory(target_dir) - - for f in test_files: - crop_cards(f) diff --git a/Util/__pycache__/Kit.cpython-310.pyc b/Util/__pycache__/Kit.cpython-310.pyc deleted file mode 100644 index 4cb929d..0000000 Binary files a/Util/__pycache__/Kit.cpython-310.pyc and /dev/null differ diff --git a/Util/__pycache__/Win32Patch.cpython-310.pyc b/Util/__pycache__/Win32Patch.cpython-310.pyc index 06616be..c379586 100644 Binary files a/Util/__pycache__/Win32Patch.cpython-310.pyc and b/Util/__pycache__/Win32Patch.cpython-310.pyc differ diff --git a/Util/__pycache__/XinDianTuReadImageKit.cpython-310.pyc b/Util/__pycache__/XinDianTuReadImageKit.cpython-310.pyc deleted file mode 100644 index fb57996..0000000 Binary files a/Util/__pycache__/XinDianTuReadImageKit.cpython-310.pyc and /dev/null differ diff --git a/Util/__pycache__/XinDianTuReadImageKit.cpython-313.pyc b/Util/__pycache__/XinDianTuReadImageKit.cpython-313.pyc deleted file mode 100644 index feea2ea..0000000 Binary files a/Util/__pycache__/XinDianTuReadImageKit.cpython-313.pyc and /dev/null differ