2026-02-06 07:42:45 +08:00
|
|
|
|
import pymysql
|
|
|
|
|
|
import sys
|
|
|
|
|
|
import os
|
|
|
|
|
|
import glob
|
|
|
|
|
|
import json
|
|
|
|
|
|
import time
|
|
|
|
|
|
from urllib.parse import urlparse, parse_qs
|
|
|
|
|
|
|
|
|
|
|
|
def parse_jdbc_url(url):
    """Split a MySQL JDBC URL into ``(host, port, database)``.

    Accepted shape: ``jdbc:mysql://host[:port][/database][?params]``. The
    ``jdbc:mysql://`` prefix is optional, and any ``?params`` suffix is
    discarded because the loader does not use connection parameters.

    Args:
        url: JDBC-style connection string.

    Returns:
        Tuple ``(host, port, db)`` where ``port`` is an ``int`` (3306 when
        the URL has no port) and ``db`` is ``""`` when no database is given.

    Raises:
        ValueError: If the port is not an integer, or the address contains
            several colons without IPv6 brackets (for example a multi-host
            URL), or an IPv6 bracket is malformed.
    """
    default_port = 3306
    prefix = "jdbc:mysql://"
    if url.startswith(prefix):
        url = url[len(prefix):]

    # Drop the query string first so that "host:3306?x=y" (no database path)
    # does not leave "?x=y" glued to the port.
    url = url.partition("?")[0]

    # Everything before the first "/" is the address, the rest is the database.
    address, _, db = url.partition("/")

    if address.startswith("["):
        # Bracketed IPv6 literal such as "[::1]:3306"; the colons inside the
        # brackets belong to the host, not to the port separator.
        host, closing, tail = address[1:].partition("]")
        if not closing or (tail and not tail.startswith(":")):
            raise ValueError(f"Malformed IPv6 address in JDBC URL: {address!r}")
        port_text = tail[1:]
    elif address.count(":") > 1:
        raise ValueError(
            f"Cannot split host and port in JDBC URL address: {address!r}"
        )
    else:
        host, _, port_text = address.partition(":")

    # A missing or empty port falls back to the MySQL default.
    port = int(port_text) if port_text else default_port
    return host, port, db
|
|
|
|
|
|
|
|
|
|
|
|
def _quote_identifier(name):
    """Return *name* as a backtick-quoted MySQL identifier, doubling embedded backticks."""
    return "`" + str(name).replace("`", "``") + "`"


def _build_load_statement(table, columns=None):
    """Build the LOAD DATA statement, with a ``%s`` placeholder for the file path."""
    table_sql = _quote_identifier(table)
    col_sql = ""
    if columns:
        col_sql = "(" + ", ".join(_quote_identifier(c) for c in columns) + ")"

    # The statement is executed with driver-side parameter substitution
    # (query % args), so literal percent signs in identifiers must be doubled.
    table_sql = table_sql.replace("%", "%%")
    col_sql = col_sql.replace("%", "%%")

    # File format: comma separator, double-quote quoting, backslash escape,
    # newline-terminated rows (the comments in the original code attribute
    # these to the DataX txtfilewriter defaults).
    return f"""
        LOAD DATA LOCAL INFILE %s
        INTO TABLE {table_sql}
        CHARACTER SET utf8mb4
        FIELDS TERMINATED BY ','
        OPTIONALLY ENCLOSED BY '"'
        ESCAPED BY '\\\\'
        LINES TERMINATED BY '\\n'
        {col_sql}
        """


def _apply_session_options(cursor):
    """Try each bulk-load session setting on its own, reporting the outcome of each."""
    # Each option is attempted separately so that one failure (for example
    # SQL_LOG_BIN without sufficient privileges) does not abort the others.
    opts = [
        ("SET FOREIGN_KEY_CHECKS = 0", "Foreign Key Checks Disabled"),
        ("SET UNIQUE_CHECKS = 0", "Unique Checks Disabled"),
        ("SET SQL_LOG_BIN = 0", "Binary Logging Disabled (Requires SUPER privilege)"),
    ]
    for sql, desc in opts:
        try:
            cursor.execute(sql)
            print(f" - {desc}: Success")
        except Exception as e:
            # Error 1227 is a privilege problem; print a friendlier message.
            if "1227" in str(e):
                print(f" - {desc}: Skipped (Insufficient privileges, but that's okay)")
            else:
                print(f" - {desc}: Failed ({e})")


def _print_warnings(cursor, limit=5):
    """Print up to *limit* rows of SHOW WARNINGS for the statement that just ran."""
    try:
        cursor.execute("SHOW WARNINGS")
        warnings = cursor.fetchall()
        if not warnings:
            return
        print(f" - MySQL Warnings ({len(warnings)}):")
        # Show only the first few warnings to keep the log readable.
        for warn in warnings[:limit]:
            print(f" - {warn.get('Level', 'Warning')}: {warn.get('Message', 'Unknown error')}")
        if len(warnings) > limit:
            print(f" - ... and {len(warnings) - limit} more warnings")
    except Exception as warn_e:
        # Warning output is diagnostic only; never fail the load because of it.
        print(f" - Could not fetch warnings: {warn_e}")


def load_csv(jdbc_url, user, password, table, csv_dir, columns=None):
    """Truncate *table* and bulk-load every file in *csv_dir* into it.

    Connects with PyMySQL, relaxes a few session checks for speed, truncates
    the target table, then runs ``LOAD DATA LOCAL INFILE`` once per regular
    file found directly inside ``csv_dir`` (in sorted order). Progress, MySQL
    warnings and a final throughput summary are printed to stdout.

    Args:
        jdbc_url: JDBC-style URL, parsed with ``parse_jdbc_url``.
        user: MySQL user name.
        password: MySQL password.
        table: Target table name; it is truncated before loading.
        csv_dir: Directory whose files are loaded.
        columns: Optional iterable of column names for the LOAD DATA column
            list. When empty or ``None`` no column list is sent.

    Returns:
        None.

    Raises:
        SystemExit: With code 1 when the connection or any statement fails.
    """
    host, port, db = parse_jdbc_url(jdbc_url)

    print(f"Connecting to MySQL {host}:{port}/{db} as {user}...")

    try:
        conn = pymysql.connect(
            host=host,
            port=port,
            user=user,
            password=password,
            database=db,
            local_infile=True,
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor,
        )
    except Exception as e:
        print(f"Connection failed: {e}")
        sys.exit(1)

    try:
        with conn.cursor() as cursor:
            print("Setting session parameters for speed...")
            cursor.execute("SET NAMES utf8mb4")
            _apply_session_options(cursor)

            # NOTE: the table is truncated before the directory is inspected,
            # so it is emptied even when no files are found below.
            print(f"Truncating table {table}...")
            cursor.execute(f"TRUNCATE TABLE {_quote_identifier(table)}")

            # glob.escape keeps "[", "]", "*" and "?" in csv_dir literal;
            # sub-directories are skipped because LOAD DATA needs regular files.
            pattern = os.path.join(glob.escape(csv_dir), "*")
            files = sorted(p for p in glob.glob(pattern) if os.path.isfile(p))
            if not files:
                print(f"No files found in {csv_dir}")
                return

            # The statement text does not depend on the file, so build it once.
            load_sql = _build_load_statement(table, columns)

            total_rows = 0
            start_time = time.time()

            for file_path in files:
                file_path = os.path.abspath(file_path)
                if os.sep == "\\":
                    # Windows only: use forward slashes in the path sent to
                    # MySQL. On POSIX a backslash is a legal file-name character.
                    file_path = file_path.replace("\\", "/")
                print(f"Loading file: {file_path}")

                # The path is passed as a bound parameter so quotes or
                # backslashes in a file name cannot break the statement.
                cursor.execute(load_sql, (file_path,))
                rows = cursor.rowcount
                total_rows += rows
                print(f" -> Loaded {rows} rows")

                # Surface MySQL warnings to help diagnose row-count differences.
                _print_warnings(cursor)

            # Commit once, after every file has been loaded.
            conn.commit()

            duration = time.time() - start_time
            print(f"Total loaded: {total_rows} rows in {duration:.2f}s ({total_rows/duration if duration > 0 else 0:.2f} rows/s)")

    except Exception as e:
        print(f"Error during load: {e}")
        sys.exit(1)
    finally:
        conn.close()
|
|
|
|
|
|
|
|
|
|
|
|
def parse_columns_arg(raw):
    """Parse the optional ``columns_json`` command-line argument.

    Args:
        raw: JSON text expected to be an array of column-name strings.

    Returns:
        The list of column names, or ``None`` (after printing a warning) when
        the text is not valid JSON or is not an array of strings. ``None``
        makes the loader run without an explicit column list.
    """
    try:
        parsed = json.loads(raw)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass. Catching only this
        # keeps KeyboardInterrupt/SystemExit from being swallowed.
        print("Warning: Could not parse columns JSON")
        return None

    # Valid JSON of the wrong shape (an object, a bare string, numbers) would
    # otherwise be iterated as if it were a list of column names.
    if not isinstance(parsed, list) or not all(isinstance(c, str) for c in parsed):
        print("Warning: columns JSON must be an array of strings; ignoring it")
        return None
    return parsed


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        Process exit code: 1 when required arguments are missing, 0 after
        ``load_csv`` returns.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 6:
        print("Usage: python LoadCsvToMysql.py <jdbc_url> <user> <password> <table> <csv_dir> [columns_json]")
        return 1

    jdbc_url, user, password, table, csv_dir = args[1:6]
    columns = parse_columns_arg(args[6]) if len(args) > 6 else None

    load_csv(jdbc_url, user, password, table, csv_dir, columns)
    return 0


if __name__ == "__main__":
    sys.exit(main())
|