298 lines
9.8 KiB
Bash
298 lines
9.8 KiB
Bash
#!/bin/bash
|
||
export PATH=$PATH:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin
|
||
[ -f /etc/profile ] && . /etc/profile
|
||
[ -f ~/.bash_profile ] && . ~/.bash_profile
|
||
[ -f ~/.profile ] && . ~/.profile
|
||
|
||
# Sync target, taken from the first positional argument.
TARGET=${1:-}

# No target given: print the usage text (supported targets) and abort.
if [ -z "$TARGET" ]; then
  echo "用法: $0 <target>"
  echo "请指定同步目标:"
  echo " doris - 全量同步到 Doris (默认端口 9030)"
  echo " mysql - 全量同步到 MySQL (CSV Load 模式,高性能)"
  # Disabled target, kept for reference:
  # echo " mysql_jdbc - 全量同步到 MySQL (JDBC Batch 模式,速度较慢)"
  exit 1
fi

# Filesystem layout.
#   DATAX_HOME - DataX installation root
#   SCRIPT_DIR - absolute directory containing this script
#   CONF_DIR   - job JSON files (updated to the unified "json" directory)
#   BIN_DIR    - DataX bin directory
#   DATAX_PY   - path of the DataX Python launcher
#   TOOL_DIR   - helper tools shipped next to this script
DATAX_HOME="/usr/local/datax"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
CONF_DIR="$SCRIPT_DIR/json"
BIN_DIR="$DATAX_HOME/bin"
DATAX_PY="$BIN_DIR/datax.py"
TOOL_DIR="$SCRIPT_DIR/tool"

# Job definition files to run, in this order. Each entry is a file name that
# is looked up relative to the job configuration directory.
JOBS=(
  "t_equipment_charge_order.json"
  "t_equipment_charge_order_detail.json"
  "t_account_recharge.json"
  "t_account_water.json"
  "t_car.json"
  "t_company.json"
  "t_company_info_value.json"
  "t_connector.json"
  "t_equipment.json"
  "t_station.json"
  "t_ext_hurry_quit.json"
  "t_time_day.json"
  "t_user.json"
  "t_user_account.json"
  "t_user_upload_fault.json"
)

# Source-side -D parameters shared by every target.
# NOTE(review): credentials are hard-coded in this script; consider reading
# them from the environment or a protected file instead.
SRC_PARAMS="-Dsrc_user=ylt -Dsrc_pwd=Ycharge666 -Dsrc_jdbc=jdbc:mysql://rm-bp1ux6tuk49er80t9xo.mysql.rds.aliyuncs.com:3306/yltcharge?useSSL=false&useUnicode=true&characterEncoding=UTF-8&serverTimezone=Asia/Shanghai"

# Pick the destination -D parameters for the requested target and combine
# them with the source parameters into PARAMS. Unknown targets abort.
case "$TARGET" in
  doris)
    echo "模式: Doris 全量同步"
    DEST_PARAMS="-Ddest_user=root -Ddest_pwd=DsideaL147258369 -Ddest_load_url=10.10.14.204:8030 -Ddest_jdbc=jdbc:mysql://10.10.14.204:9030/yltcharge?useSSL=false"
    PARAMS="$SRC_PARAMS $DEST_PARAMS"
    ;;
  mysql)
    echo "模式: MySQL 全量同步 (CSV Load)"
    # Note: in CSV Load mode DataX only writes the files and Python performs
    # the load. The JDBC parameters are used for the Python connection.
    DEST_PARAMS="-Ddest_mysql_user=ylt -Ddest_mysql_pwd=Ycharge666 -Ddest_mysql_jdbc=jdbc:mysql://10.10.14.210:22066/yltcharge?useUnicode=true&characterEncoding=UTF-8&useSSL=false&allowLoadLocalInfile=true"
    PARAMS="$SRC_PARAMS $DEST_PARAMS"
    ;;
  # Disabled target (JDBC batch mode), kept for reference:
  # mysql_jdbc)
  #   echo "模式: MySQL 全量同步 (JDBC Batch)"
  #   DEST_PARAMS="-Ddest_mysql_user=ylt -Ddest_mysql_pwd=Ycharge666 -Ddest_mysql_jdbc=jdbc:mysql://10.10.14.210:22066/yltcharge?useUnicode=true&characterEncoding=UTF-8&useSSL=false&rewriteBatchedStatements=true&autoReconnect=true&failOverReadOnly=false"
  #   PARAMS="$SRC_PARAMS $DEST_PARAMS"
  #   ;;
  *)
    echo "错误: 未知目标 '$TARGET'。请使用 'doris' 或 'mysql'。"
    exit 1
    ;;
esac

# Record the start time twice: formatted for the banner, and as epoch seconds
# for the elapsed-time calculation.
SCRIPT_START_TIME=$(date '+%Y-%m-%d %H:%M:%S')
SCRIPT_START_TIMESTAMP=$(date +%s)

echo "====================================="
echo "DataX 全量同步脚本 (Target: $TARGET)"
echo "====================================="
echo "配置文件目录: $CONF_DIR"
echo "任务数量: ${#JOBS[@]}"
echo "脚本开始时间: $SCRIPT_START_TIME"
echo "====================================="

# Job files are referenced by bare file name, so run from the config directory.
cd "$CONF_DIR" || { echo "错误: 无法进入配置文件目录 $CONF_DIR"; exit 1; }

# Progress counters.
SUCCESS_COUNT=0
FAIL_COUNT=0
TOTAL=${#JOBS[@]}
CURRENT=0

# Remove the old temporary CSV directory. ${SCRIPT_DIR:?} aborts instead of
# deleting "/datax_tmp_csv" should SCRIPT_DIR ever be empty.
if [ "$TARGET" == "mysql" ]; then
  rm -rf -- "${SCRIPT_DIR:?}/datax_tmp_csv"
fi

# Main loop: for every job file
#   1. rewrite the job JSON with an inline Python script (tuning settings and,
#      for the mysql target, a CSV file writer),
#   2. run the DataX engine on the rewritten file,
#   3. for the mysql target, load the produced CSV via LoadCsvToMysql.py,
#   4. update SUCCESS_COUNT / FAIL_COUNT.
for JOB in "${JOBS[@]}"; do
  CURRENT=$((CURRENT + 1))
  echo "任务 [$CURRENT/$TOTAL] $JOB"
  echo "----------------------------------------"

  if [ ! -f "$JOB" ]; then
    echo "✗ 错误: 文件不存在 - $JOB"
    FAIL_COUNT=$((FAIL_COUNT + 1))
    continue
  fi

  START_TIME=$(date +%s)
  JOB_FILE="$JOB"

  # Metadata reported by the transform step (consumed by the mysql load step).
  TABLE_NAME=""
  COLUMNS_JSON=""
  CSV_DIR=""

  # ---------------------------------------------------------
  # Configuration transform
  # ---------------------------------------------------------

  # NOTE(review): predictable name under /tmp and never removed afterwards.
  TMP_FILE="/tmp/datax_optimized_${TARGET}_${JOB}"

  # Hand the target type and the CSV output directory to Python via the
  # environment.
  export DATAX_TARGET="$TARGET"
  if [ "$TARGET" == "mysql" ]; then
    CSV_DIR="$SCRIPT_DIR/datax_tmp_csv/${JOB%.json}"
    export DATAX_CSV_DIR="$CSV_DIR"
  fi

  # Run the Python transform and capture its stdout (the metadata lines).
  # NOTE(review): open(..., encoding=...) needs Python 3 -- confirm that
  # /usr/bin/python is Python 3 on the target hosts.
  TRANSFORM_OUTPUT=$(/usr/bin/python - <<'PY' "$JOB" "$TMP_FILE"
import json, sys, re, os

src_path = sys.argv[1]
dst_path = sys.argv[2]
target = os.environ.get('DATAX_TARGET', 'doris')
csv_dir = os.environ.get('DATAX_CSV_DIR', '')

with open(src_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# 1. Basic performance tuning (concurrency and error limit)
if 'setting' not in data['job']:
    data['job']['setting'] = {}
data['job']['setting']['speed'] = {
    "channel": 8,  # raise concurrency to 8 to speed up the reader
}
data['job']['setting']['errorLimit'] = {
    "record": 0
}

unit = data['job']['content'][0]
reader = unit.get('reader', {})
writer = unit.get('writer', {})
rp = reader.get('parameter', {})
wp = writer.get('parameter', {})

# 2. Reader tuning (fetchSize)
rp['fetchSize'] = 10000  # much larger fetch buffer, fewer network round trips

# Determine the table name
table_name = None
try:
    # First choice: writer.parameter.connection[0].table[0]
    table_name = wp.get('connection', [{}])[0].get('table', [None])[0]
except Exception:
    pass
if not table_name:
    try:
        # Second choice: reader.parameter.connection[0].table[0]
        table_name = rp.get('connection', [{}])[0].get('table', [None])[0]
    except Exception:
        pass
if not table_name:
    # Last attempt: parse it out of querySql
    q = rp.get('connection', [{}])[0].get('querySql', [None])[0]
    if q:
        m = re.search(r'FROM\s+`?(\w+)`?', q, re.IGNORECASE)
        if m:
            table_name = m.group(1)

if not table_name:
    # Still nothing: fall back to the file name without .json
    table_name = os.path.basename(src_path).replace('.json', '')

# Determine the columns
columns = wp.get('column') or rp.get('column')
if not columns:
    # Default to * (not recommended by DataX, used only as a fallback)
    columns = ["*"]

# 3. Replace the writer depending on the target
if target == 'mysql':
    if not os.path.exists(csv_dir):
        os.makedirs(csv_dir)

    new_writer = {
        "name": "txtfilewriter",
        "parameter": {
            "path": csv_dir,
            "fileName": table_name,
            "writeMode": "truncate",
            "fileFormat": "csv",
            "separator": ",",
            "quoteChar": "\"",
            "escapeChar": "\\",
            "nullFormat": "\\N",
            "header": [],
            "column": columns
        }
    }
    unit['writer'] = new_writer

# Emit metadata for the shell: line 1 = table name, line 2 = columns as JSON
print(table_name)
print(json.dumps(columns))

with open(dst_path, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)
PY
)

  if [ $? -ne 0 ]; then
    echo "✗ 错误: 生成配置文件失败 - $JOB"
    echo "$TRANSFORM_OUTPUT"
    FAIL_COUNT=$((FAIL_COUNT + 1))
    continue
  fi

  JOB_FILE="$TMP_FILE"

  # Parse the metadata: first output line is the table name, second the
  # column list as JSON.
  TABLE_NAME=$(echo "$TRANSFORM_OUTPUT" | sed -n '1p' | tr -d '\r')
  COLUMNS_JSON=$(echo "$TRANSFORM_OUTPUT" | sed -n '2p' | tr -d '\r')

  # ---------------------------------------------------------
  # Run DataX
  # ---------------------------------------------------------

  CLASS_PATH="$DATAX_HOME/lib/*:$DATAX_HOME/conf:."
  # JVM tuning: larger heap, G1 collector for throughput, more Metaspace.
  JVM_OPTS="-server -Xms2g -Xmx2g -XX:MaxMetaspaceSize=256m -XX:+UseG1GC -XX:MaxGCPauseMillis=200 -XX:+HeapDumpOnOutOfMemoryError"

  # Split the option strings into words WITHOUT pathname expansion. The JDBC
  # URLs in PARAMS contain '?', which an unquoted $PARAMS would treat as a
  # glob pattern (and drop or reject under nullglob/failglob, which a sourced
  # profile could have enabled).
  read -r -a JVM_ARGS <<<"$JVM_OPTS"
  read -r -a PARAM_ARGS <<<"$PARAMS"

  java "${JVM_ARGS[@]}" -classpath "$CLASS_PATH" \
    -Dfile.encoding=UTF-8 \
    -Dlogback.statusListenerClass=ch.qos.logback.core.status.NopStatusListener \
    -Djava.security.egd=file:///dev/urandom \
    -Ddatax.home="$DATAX_HOME" \
    -Dlogback.configurationFile="$DATAX_HOME/conf/logback.xml" \
    "${PARAM_ARGS[@]}" \
    com.alibaba.datax.core.Engine \
    -mode standalone \
    -jobid -1 \
    -job "$JOB_FILE" | sed "s/^/[$TABLE_NAME] /"
  # Take java's status, not sed's: plain $? after a pipeline is the status of
  # the LAST stage, which would hide every DataX failure.
  EXIT_CODE=${PIPESTATUS[0]}

  # ---------------------------------------------------------
  # Post-processing: run the import for the mysql target
  # ---------------------------------------------------------

  if [ "$EXIT_CODE" -eq 0 ] && [ "$TARGET" == "mysql" ]; then
    echo "DataX 导出完成,开始执行 MySQL Load Data..."

    # DB connection info for the loader. Re-declared here rather than parsed
    # out of PARAMS (those values carry -D prefixes); must be kept in sync
    # with DEST_PARAMS of the mysql target. The loader script receives the
    # JDBC URL as-is.
    # NOTE(review): the password is passed on the command line and is thus
    # visible in the process list.
    DB_USER="ylt"
    DB_PWD="Ycharge666"
    DB_JDBC="jdbc:mysql://10.10.14.210:22066/yltcharge?useUnicode=true&characterEncoding=UTF-8&useSSL=false&allowLoadLocalInfile=true"

    python "$TOOL_DIR/LoadCsvToMysql.py" \
      "$DB_JDBC" \
      "$DB_USER" \
      "$DB_PWD" \
      "$TABLE_NAME" \
      "$CSV_DIR" \
      "$COLUMNS_JSON"

    EXIT_CODE=$?

    # Remove the temporary CSV files of this job.
    rm -rf -- "$CSV_DIR"
  fi

  END_TIME=$(date +%s)
  DURATION=$((END_TIME - START_TIME))

  if [ "$EXIT_CODE" -eq 0 ]; then
    echo "✓ 成功: $JOB 用时 ${DURATION}s"
    SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
  else
    echo "✗ 失败: $JOB 用时 ${DURATION}s (exit=$EXIT_CODE)"
    FAIL_COUNT=$((FAIL_COUNT + 1))
  fi
  echo "----------------------------------------"
done

# Final summary. The script exits 0 only when no job failed, 1 otherwise.
TOTAL_TIME=$(($(date +%s) - SCRIPT_START_TIMESTAMP))
echo "完成: 成功 $SUCCESS_COUNT, 失败 $FAIL_COUNT, 总耗时 ${TOTAL_TIME}s"
if [ "$FAIL_COUNT" -eq 0 ]; then
  exit 0
fi
exit 1