Files
aiData/DataX/tool/LoadCsvToMysql.py

170 lines
5.7 KiB
Python
Raw Normal View History

2026-02-06 07:42:45 +08:00
import pymysql
import sys
import os
import glob
import json
import time
from urllib.parse import urlparse, parse_qs
def parse_jdbc_url(url):
# jdbc:mysql://host:port/db?params
# remove jdbc:mysql://
if url.startswith("jdbc:mysql://"):
url = url[13:]
# split host:port and db
if "/" in url:
address, remainder = url.split("/", 1)
if "?" in remainder:
db, params = remainder.split("?", 1)
else:
db = remainder
params = ""
else:
address = url
db = ""
params = ""
if ":" in address:
host, port = address.split(":")
port = int(port)
else:
host = address
port = 3306
return host, port, db
def load_csv(jdbc_url, user, password, table, csv_dir, columns=None):
host, port, db = parse_jdbc_url(jdbc_url)
print(f"Connecting to MySQL {host}:{port}/{db} as {user}...")
try:
conn = pymysql.connect(
host=host,
port=port,
user=user,
password=password,
database=db,
local_infile=True,
charset='utf8mb4',
cursorclass=pymysql.cursors.DictCursor
)
except Exception as e:
print(f"Connection failed: {e}")
sys.exit(1)
try:
with conn.cursor() as cursor:
# Optimization settings
print("Setting session parameters for speed...")
cursor.execute("SET NAMES utf8mb4")
2026-02-06 08:21:47 +08:00
# 逐个尝试设置优化参数,避免因单个参数(如 SQL_LOG_BIN权限不足导致整体失败
opts = [
("SET FOREIGN_KEY_CHECKS = 0", "Foreign Key Checks Disabled"),
("SET UNIQUE_CHECKS = 0", "Unique Checks Disabled"),
("SET SQL_LOG_BIN = 0", "Binary Logging Disabled (Requires SUPER privilege)")
]
for sql, desc in opts:
try:
cursor.execute(sql)
print(f" - {desc}: Success")
except Exception as e:
# 如果是权限问题 (1227),打印更友好的信息
if "1227" in str(e):
print(f" - {desc}: Skipped (Insufficient privileges, but that's okay)")
else:
print(f" - {desc}: Failed ({e})")
2026-02-06 07:42:45 +08:00
# Truncate table
print(f"Truncating table {table}...")
cursor.execute(f"TRUNCATE TABLE `{table}`")
# Find files
files = glob.glob(os.path.join(csv_dir, "*"))
if not files:
print(f"No files found in {csv_dir}")
return
total_rows = 0
start_time = time.time()
for file_path in files:
file_path = os.path.abspath(file_path).replace('\\', '/')
print(f"Loading file: {file_path}")
# Build SQL
# Assuming DataX txtfilewriter defaults:
# separator: ,
# quoteChar: "
# escapeChar: \
# nullFormat: \N
col_sql = ""
if columns:
col_list = [f"`{c}`" for c in columns]
col_sql = "(" + ", ".join(col_list) + ")"
sql = f"""
LOAD DATA LOCAL INFILE '{file_path}'
INTO TABLE `{table}`
CHARACTER SET utf8mb4
FIELDS TERMINATED BY ','
OPTIONALLY ENCLOSED BY '"'
ESCAPED BY '\\\\'
LINES TERMINATED BY '\\n'
{col_sql}
"""
cursor.execute(sql)
rows = cursor.rowcount
total_rows += rows
print(f" -> Loaded {rows} rows")
2026-02-06 08:21:47 +08:00
# 显示 MySQL 警告SHOW WARNINGS的功能用于排查导入差异
try:
cursor.execute("SHOW WARNINGS")
warnings = cursor.fetchall()
if warnings:
print(f" - MySQL Warnings ({len(warnings)}):")
# 最多显示前 5 条警告,避免日志过多
for i, warn in enumerate(warnings[:5]):
print(f" - {warn.get('Level', 'Warning')}: {warn.get('Message', 'Unknown error')}")
if len(warnings) > 5:
print(f" - ... and {len(warnings) - 5} more warnings")
except Exception as warn_e:
print(f" - Could not fetch warnings: {warn_e}")
2026-02-06 07:42:45 +08:00
conn.commit()
duration = time.time() - start_time
print(f"Total loaded: {total_rows} rows in {duration:.2f}s ({total_rows/duration if duration > 0 else 0:.2f} rows/s)")
except Exception as e:
print(f"Error during load: {e}")
sys.exit(1)
finally:
conn.close()
if __name__ == "__main__":
if len(sys.argv) < 6:
print("Usage: python LoadCsvToMysql.py <jdbc_url> <user> <password> <table> <csv_dir> [columns_json]")
sys.exit(1)
jdbc_url = sys.argv[1]
user = sys.argv[2]
password = sys.argv[3]
table = sys.argv[4]
csv_dir = sys.argv[5]
columns = None
if len(sys.argv) > 6:
try:
columns = json.loads(sys.argv[6])
except:
print("Warning: Could not parse columns JSON")
load_csv(jdbc_url, user, password, table, csv_dir, columns)