Files
aiData/DataX/tool/LoadCsvToMysql.py
HuangHai e08b7af675 'commit'
2026-02-06 07:42:45 +08:00

144 lines
4.3 KiB
Python

import pymysql
import sys
import os
import glob
import json
import time
from urllib.parse import urlparse, parse_qs
def parse_jdbc_url(url):
    """Split a MySQL JDBC URL into ``(host, port, db)``.

    Accepts ``jdbc:mysql://host[:port][/db][?params]``; the ``jdbc:mysql://``
    prefix is optional. Query parameters are discarded.

    Args:
        url: The JDBC connection URL.

    Returns:
        A ``(host, port, db)`` tuple. ``port`` is an ``int`` and defaults to
        3306 when absent; ``db`` is ``""`` when the URL has no database path.
        A bracketed IPv6 host (``[::1]:3306``) is returned without brackets.

    Raises:
        ValueError: If a port is present but is not an integer.
    """
    prefix = "jdbc:mysql://"
    if url.startswith(prefix):
        url = url[len(prefix):]

    # Drop the query string first so it can never leak into the host/port
    # part when the URL has no "/db" component (e.g. "host:3306?useSSL=false").
    location = url.partition("?")[0]
    address, _, db = location.partition("/")

    if address.startswith("["):
        # Bracketed IPv6 literal: "[::1]" or "[::1]:3306".
        host, _, port_text = address[1:].partition("]")
        port_text = port_text.lstrip(":")
    elif address.count(":") == 1:
        host, _, port_text = address.partition(":")
    else:
        # No port, or an unbracketed address with several colons: treat the
        # whole thing as the host instead of failing on tuple unpacking.
        host, port_text = address, ""

    port = int(port_text) if port_text else 3306
    return host, port, db
def _quote_identifier(name):
    """Return *name* as a backtick-quoted MySQL identifier, doubling embedded backticks."""
    return "`" + str(name).replace("`", "``") + "`"


def load_csv(jdbc_url, user, password, table, csv_dir, columns=None):
    """Truncate *table* and bulk-load every file in *csv_dir* into it.

    Each regular file directly inside ``csv_dir`` is loaded with
    ``LOAD DATA LOCAL INFILE`` using comma-separated fields, optional
    double-quote enclosure, backslash escaping and ``\\n`` line endings.
    All files are loaded on one connection and committed once at the end.

    Args:
        jdbc_url: MySQL JDBC URL; parsed with ``parse_jdbc_url``.
        user: MySQL user name.
        password: MySQL password.
        table: Target table name. It is truncated before loading.
        csv_dir: Directory whose files are loaded.
        columns: Optional iterable of column names giving the field order of
            the files. When falsy, no column list is sent.

    Returns:
        None. Progress is reported on stdout.

    Side effects:
        Calls ``sys.exit(1)`` if the connection cannot be opened or if any
        statement fails. The table is truncated before the directory is
        scanned, so it is left empty when no files are found.
    """
    host, port, db = parse_jdbc_url(jdbc_url)
    print(f"Connecting to MySQL {host}:{port}/{db} as {user}...")
    try:
        conn = pymysql.connect(
            host=host,
            port=port,
            user=user,
            password=password,
            database=db,
            local_infile=True,
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor
        )
    except Exception as e:
        print(f"Connection failed: {e}")
        sys.exit(1)

    try:
        table_sql = _quote_identifier(table)
        col_sql = ""
        if columns:
            col_sql = "(" + ", ".join(_quote_identifier(c) for c in columns) + ")"

        # The file name is passed as a query parameter so that quotes in the
        # path are escaped by the driver. Because the driver applies
        # %-formatting when parameters are given, literal "%" characters in
        # the identifiers must be doubled for this statement only.
        # Assuming DataX txtfilewriter defaults:
        # separator: ,
        # quoteChar: "
        # escapeChar: \
        # nullFormat: \N
        table_param_sql = table_sql.replace("%", "%%")
        col_param_sql = col_sql.replace("%", "%%")
        load_sql = f"""
        LOAD DATA LOCAL INFILE %s
        INTO TABLE {table_param_sql}
        CHARACTER SET utf8mb4
        FIELDS TERMINATED BY ','
        OPTIONALLY ENCLOSED BY '"'
        ESCAPED BY '\\\\'
        LINES TERMINATED BY '\\n'
        {col_param_sql}
        """

        with conn.cursor() as cursor:
            # Optimization settings
            print("Setting session parameters for speed...")
            cursor.execute("SET NAMES utf8mb4")
            try:
                cursor.execute("SET FOREIGN_KEY_CHECKS = 0")
                cursor.execute("SET UNIQUE_CHECKS = 0")
                cursor.execute("SET SQL_LOG_BIN = 0")
            except Exception as e:
                # Best effort: some flags may need privileges the user lacks.
                print(f"Warning: Could not set some optimization flags: {e}")

            # Truncate table
            print(f"Truncating table {table}...")
            cursor.execute(f"TRUNCATE TABLE {table_sql}")

            # Find files: skip sub-directories (LOAD DATA cannot read them)
            # and sort so the load order is deterministic.
            files = sorted(
                path
                for path in glob.glob(os.path.join(csv_dir, "*"))
                if os.path.isfile(path)
            )
            if not files:
                print(f"No files found in {csv_dir}")
                return

            total_rows = 0
            start_time = time.time()
            for file_path in files:
                # Forward slashes keep Windows paths free of backslashes.
                file_path = os.path.abspath(file_path).replace('\\', '/')
                print(f"Loading file: {file_path}")
                cursor.execute(load_sql, (file_path,))
                rows = cursor.rowcount
                total_rows += rows
                print(f" -> Loaded {rows} rows")

            conn.commit()
            duration = time.time() - start_time
            print(f"Total loaded: {total_rows} rows in {duration:.2f}s ({total_rows/duration if duration > 0 else 0:.2f} rows/s)")
    except Exception as e:
        print(f"Error during load: {e}")
        sys.exit(1)
    finally:
        conn.close()
if __name__ == "__main__":
    # Command-line entry point: five required positional arguments plus an
    # optional JSON array of column names.
    if len(sys.argv) < 6:
        print("Usage: python LoadCsvToMysql.py <jdbc_url> <user> <password> <table> <csv_dir> [columns_json]")
        sys.exit(1)

    jdbc_url, user, password, table, csv_dir = sys.argv[1:6]

    columns = None
    if len(sys.argv) > 6:
        try:
            columns = json.loads(sys.argv[6])
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass. Catching only
            # this keeps Ctrl-C and SystemExit from being swallowed.
            print("Warning: Could not parse columns JSON")
        else:
            # Anything other than a list of strings (e.g. a JSON string or
            # object) would be iterated into bogus column names, so it is
            # ignored and the load proceeds without a column list.
            is_name_list = isinstance(columns, list) and all(
                isinstance(c, str) for c in columns
            )
            if columns is not None and not is_name_list:
                print("Warning: columns JSON must be an array of column names; ignoring it")
                columns = None

    load_csv(jdbc_url, user, password, table, csv_dir, columns)