Files
aiData/DataX/run_jobs_full.sh
HuangHai 1f4d29d5a1 'commit'
2026-02-06 08:21:47 +08:00

298 lines
9.8 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Full-sync driver for DataX jobs.
# Usage: <this script> <target>, where <target> is "doris" or "mysql".

# Append the standard binary directories to PATH, then source whichever of
# the login profiles exist.
PATH="$PATH:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin"
export PATH
if [ -f /etc/profile ]; then
  . /etc/profile
fi
if [ -f ~/.bash_profile ]; then
  . ~/.bash_profile
fi
if [ -f ~/.profile ]; then
  . ~/.profile
fi

# The sync target is the first positional argument; without it, print the
# usage text and stop with status 1.
TARGET="${1:-}"
if [ "${#TARGET}" -eq 0 ]; then
  # The "mysql_jdbc" target (JDBC batch mode, slower) is disabled, so it is
  # not advertised here.
  printf '%s\n' \
    "用法: $0 <target>" \
    "请指定同步目标:" \
    " doris - 全量同步到 Doris (默认端口 9030)" \
    " mysql - 全量同步到 MySQL (CSV Load 模式,高性能)"
  exit 1
fi
# --- DataX installation -----------------------------------------------------
DATAX_HOME="/usr/local/datax"
BIN_DIR="${DATAX_HOME}/bin"
DATAX_PY="${BIN_DIR}/datax.py"

# --- Directories relative to this script ------------------------------------
# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
CONF_DIR="${SCRIPT_DIR}/json" # unified directory holding the job definitions
TOOL_DIR="${SCRIPT_DIR}/tool"

# --- Job list -----------------------------------------------------------------
# One job definition file per table, processed in exactly this order.
JOBS=()
for _job_table in \
  t_equipment_charge_order \
  t_equipment_charge_order_detail \
  t_account_recharge \
  t_account_water \
  t_car \
  t_company \
  t_company_info_value \
  t_connector \
  t_equipment \
  t_station \
  t_ext_hurry_quit \
  t_time_day \
  t_user \
  t_user_account \
  t_user_upload_fault; do
  JOBS+=("${_job_table}.json")
done
unset _job_table
# -D parameters handed to the DataX JVM.
# NOTE(review): user names, passwords and hosts are hard-coded in this file;
# consider moving them to environment variables or a protected config file.

# Source-side parameters shared by every target.
SRC_PARAMS="-Dsrc_user=ylt -Dsrc_pwd=Ycharge666 -Dsrc_jdbc=jdbc:mysql://rm-bp1ux6tuk49er80t9xo.mysql.rds.aliyuncs.com:3306/yltcharge?useSSL=false&useUnicode=true&characterEncoding=UTF-8&serverTimezone=Asia/Shanghai"

# Destination-side parameters depend on the requested target; anything other
# than "doris" or "mysql" is rejected with status 1.
case "$TARGET" in
  doris)
    echo "模式: Doris 全量同步"
    DEST_PARAMS="-Ddest_user=root -Ddest_pwd=DsideaL147258369 -Ddest_load_url=10.10.14.204:8030 -Ddest_jdbc=jdbc:mysql://10.10.14.204:9030/yltcharge?useSSL=false"
    ;;
  mysql)
    echo "模式: MySQL 全量同步 (CSV Load)"
    # In CSV Load mode DataX only writes the files and a Python tool performs
    # the load; the original note says these JDBC values are meant for that
    # Python connection.
    DEST_PARAMS="-Ddest_mysql_user=ylt -Ddest_mysql_pwd=Ycharge666 -Ddest_mysql_jdbc=jdbc:mysql://10.10.14.210:22066/yltcharge?useUnicode=true&characterEncoding=UTF-8&useSSL=false&allowLoadLocalInfile=true"
    ;;
  # Disabled target "mysql_jdbc" (JDBC batch mode): it used the same MySQL
  # destination, with the URL options rewriteBatchedStatements=true,
  # autoReconnect=true and failOverReadOnly=false in place of
  # allowLoadLocalInfile=true.
  *)
    echo "错误: 未知目标 '$TARGET'。请使用 'doris' 或 'mysql'。"
    exit 1
    ;;
esac
PARAMS="$SRC_PARAMS $DEST_PARAMS"
# Remember when the run started: once for display, once as epoch seconds for
# the elapsed-time computation.
SCRIPT_START_TIME="$(date '+%Y-%m-%d %H:%M:%S')"
SCRIPT_START_TIMESTAMP="$(date +%s)"

# Start-up banner.
printf '%s\n' \
  "=====================================" \
  "DataX 全量同步脚本 (Target: $TARGET)" \
  "=====================================" \
  "配置文件目录: $CONF_DIR" \
  "任务数量: ${#JOBS[@]}" \
  "脚本开始时间: $SCRIPT_START_TIME" \
  "====================================="

# Job files are referenced by bare name, so work from the config directory.
if ! cd "$CONF_DIR"; then
  echo "错误: 无法进入配置文件目录 $CONF_DIR"
  exit 1
fi

# Progress counters.
TOTAL=${#JOBS[@]}
CURRENT=0
SUCCESS_COUNT=0
FAIL_COUNT=0

# Remove the temporary CSV directory of a previous run (mysql target only).
case "$TARGET" in
  mysql)
    rm -rf "$SCRIPT_DIR/datax_tmp_csv"
    ;;
esac
# ---------------------------------------------------------------------------
# Main loop. For every job file in JOBS:
#   1. rewrite the job config with the embedded Python script,
#   2. run the DataX engine on the rewritten config,
#   3. for the "mysql" target, load the exported CSV with
#      tool/LoadCsvToMysql.py.
# Reads:   JOBS, TOTAL, TARGET, PARAMS, DATAX_HOME, SCRIPT_DIR, TOOL_DIR
# Updates: CURRENT, SUCCESS_COUNT, FAIL_COUNT
# ---------------------------------------------------------------------------

#######################################
# Run a command with every stdout line prefixed by "[label] ".
# Arguments: $1 - label; remaining arguments - the command to run
# Returns:   the exit status of the command itself. A plain `$?` after the
#            pipeline would be the status of sed, which hides every failure
#            of the command.
#######################################
datax_run_prefixed() {
  local label=$1
  shift
  "$@" | sed "s/^/[$label] /"
  return "${PIPESTATUS[0]}"
}

# Loop-invariant JVM settings.
CLASS_PATH="$DATAX_HOME/lib/*:$DATAX_HOME/conf:."
# JVM tuning: fixed 2 GiB heap, G1 collector with a 200 ms pause target,
# Metaspace capped at 256 MiB, heap dump on OutOfMemoryError.
JVM_OPTS=(
  -server -Xms2g -Xmx2g
  -XX:MaxMetaspaceSize=256m
  -XX:+UseG1GC -XX:MaxGCPauseMillis=200
  -XX:+HeapDumpOnOutOfMemoryError
)
# Split PARAMS into words without pathname expansion. The JDBC URLs contain
# "?", so an unquoted $PARAMS is a glob pattern and could be dropped or
# rejected if a sourced profile enables nullglob or failglob.
read -r -a DATAX_PARAM_ARGS <<< "$PARAMS"

for JOB in "${JOBS[@]}"; do
  CURRENT=$((CURRENT + 1))
  echo "任务 [$CURRENT/$TOTAL] $JOB"
  echo "----------------------------------------"
  if [ ! -f "$JOB" ]; then
    echo "✗ 错误: 文件不存在 - $JOB"
    FAIL_COUNT=$((FAIL_COUNT + 1))
    continue
  fi

  START_TIME=$(date +%s)
  JOB_FILE="$JOB"
  # Metadata reported by the transform step; the CSV load needs all three.
  TABLE_NAME=""
  COLUMNS_JSON=""
  CSV_DIR=""

  # -------------------------------------------------------------------------
  # Config transformation
  # -------------------------------------------------------------------------
  # The target type and the CSV output directory reach Python through the
  # environment.
  export DATAX_TARGET="$TARGET"
  if [ "$TARGET" == "mysql" ]; then
    CSV_DIR="$SCRIPT_DIR/datax_tmp_csv/${JOB%.json}"
    export DATAX_CSV_DIR="$CSV_DIR"
  fi

  # Unpredictable temp file for the rewritten config; removed once DataX has
  # consumed it so repeated runs do not pile up files in the temp directory.
  if ! TMP_FILE=$(mktemp "${TMPDIR:-/tmp}/datax_optimized_${TARGET}_${JOB%.json}.XXXXXX"); then
    echo "✗ 错误: 生成配置文件失败 - $JOB"
    FAIL_COUNT=$((FAIL_COUNT + 1))
    continue
  fi

  # Rewrite the job config. stdout carries two lines: the table name and the
  # JSON-encoded column list.
  # NOTE(review): open(..., encoding=...) needs Python 3, so /usr/bin/python
  # has to be a Python 3 interpreter - confirm on the target host.
  TRANSFORM_OUTPUT=$(/usr/bin/python - "$JOB" "$TMP_FILE" <<'PY'
import json, sys, re, os

src_path = sys.argv[1]
dst_path = sys.argv[2]
target = os.environ.get('DATAX_TARGET', 'doris')
csv_dir = os.environ.get('DATAX_CSV_DIR', '')

with open(src_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# 1. Job-level settings: 8 channels for the reader, zero tolerated bad records.
if 'setting' not in data['job']:
    data['job']['setting'] = {}
data['job']['setting']['speed'] = {
    "channel": 8,
}
data['job']['setting']['errorLimit'] = {
    "record": 0
}

unit = data['job']['content'][0]
reader = unit.get('reader', {})
writer = unit.get('writer', {})
rp = reader.get('parameter', {})
wp = writer.get('parameter', {})

# 2. Reader tuning: a large fetch size to cut network round trips.
rp['fetchSize'] = 10000


def first_conn_value(params, key):
    # Return params.connection[0][key][0], or None when that path is missing
    # or malformed, so every lookup below is an optional best-effort step.
    try:
        return params.get('connection', [{}])[0].get(key, [None])[0]
    except Exception:
        return None


# Table name: writer connection first, then reader connection, then the FROM
# clause of the reader querySql, finally the job file name without .json.
table_name = first_conn_value(wp, 'table')
if not table_name:
    table_name = first_conn_value(rp, 'table')
if not table_name:
    q = first_conn_value(rp, 'querySql')
    if q:
        m = re.search(r'FROM\s+`?(\w+)`?', q, re.IGNORECASE)
        if m:
            table_name = m.group(1)
if not table_name:
    table_name = os.path.basename(src_path).replace('.json', '')

# Columns: writer list first, then reader list, with "*" as the last resort.
columns = wp.get('column') or rp.get('column')
if not columns:
    columns = ["*"]

# 3. For the mysql target, replace the writer with a CSV txtfilewriter.
if target == 'mysql':
    if not os.path.exists(csv_dir):
        os.makedirs(csv_dir)
    new_writer = {
        "name": "txtfilewriter",
        "parameter": {
            "path": csv_dir,
            "fileName": table_name,
            "writeMode": "truncate",
            "fileFormat": "csv",
            "separator": ",",
            "quoteChar": "\"",
            "escapeChar": "\\",
            "nullFormat": "\\N",
            "header": [],
            "column": columns
        }
    }
    unit['writer'] = new_writer

# Metadata for the calling shell: line 1 table name, line 2 column list.
print(table_name)
print(json.dumps(columns))

with open(dst_path, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)
PY
  )
  TRANSFORM_RC=$?
  if [ "$TRANSFORM_RC" -ne 0 ]; then
    rm -f -- "$TMP_FILE"
    echo "✗ 错误: 生成配置文件失败 - $JOB"
    echo "$TRANSFORM_OUTPUT"
    FAIL_COUNT=$((FAIL_COUNT + 1))
    continue
  fi
  JOB_FILE="$TMP_FILE"

  # Parse the two metadata lines and strip any carriage returns.
  {
    IFS= read -r TABLE_NAME
    IFS= read -r COLUMNS_JSON
  } <<< "$TRANSFORM_OUTPUT"
  TABLE_NAME=${TABLE_NAME//$'\r'/}
  COLUMNS_JSON=${COLUMNS_JSON//$'\r'/}

  # -------------------------------------------------------------------------
  # Run DataX
  # -------------------------------------------------------------------------
  # The engine is started directly through java; its stdout is tagged with
  # the table name and EXIT_CODE is the status of the java process.
  datax_run_prefixed "$TABLE_NAME" \
    java "${JVM_OPTS[@]}" -classpath "$CLASS_PATH" \
    -Dfile.encoding=UTF-8 \
    -Dlogback.statusListenerClass=ch.qos.logback.core.status.NopStatusListener \
    -Djava.security.egd=file:///dev/urandom \
    -Ddatax.home="$DATAX_HOME" \
    -Dlogback.configurationFile="$DATAX_HOME/conf/logback.xml" \
    "${DATAX_PARAM_ARGS[@]}" \
    com.alibaba.datax.core.Engine \
    -mode standalone \
    -jobid -1 \
    -job "$JOB_FILE"
  EXIT_CODE=$?
  rm -f -- "$TMP_FILE"

  # -------------------------------------------------------------------------
  # Post-processing: the mysql target loads the exported CSV
  # -------------------------------------------------------------------------
  if [ "$EXIT_CODE" -eq 0 ] && [ "$TARGET" == "mysql" ]; then
    echo "DataX 导出完成,开始执行 MySQL Load Data..."
    # Connection settings for the loader. They are spelled out here instead
    # of being parsed back out of PARAMS, so they must be kept in sync with
    # the mysql DEST_PARAMS by hand. The loader receives the JDBC URL as is.
    # NOTE(review): the password is passed on the command line and is
    # therefore visible in the process list while the loader runs.
    DB_USER="ylt"
    DB_PWD="Ycharge666"
    DB_JDBC="jdbc:mysql://10.10.14.210:22066/yltcharge?useUnicode=true&characterEncoding=UTF-8&useSSL=false&allowLoadLocalInfile=true"
    python "$TOOL_DIR/LoadCsvToMysql.py" \
      "$DB_JDBC" \
      "$DB_USER" \
      "$DB_PWD" \
      "$TABLE_NAME" \
      "$CSV_DIR" \
      "$COLUMNS_JSON"
    EXIT_CODE=$?
    # Drop the temporary CSV files of this job.
    rm -rf -- "$CSV_DIR"
  fi

  END_TIME=$(date +%s)
  DURATION=$((END_TIME - START_TIME))
  if [ "$EXIT_CODE" -eq 0 ]; then
    echo "✓ 成功: $JOB 用时 ${DURATION}s"
    SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
  else
    echo "✗ 失败: $JOB 用时 ${DURATION}s (exit=$EXIT_CODE)"
    FAIL_COUNT=$((FAIL_COUNT + 1))
  fi
  echo "----------------------------------------"
done
# Final summary. The script exits 0 only when no job failed, 1 otherwise.
TOTAL_TIME=$(($(date +%s) - SCRIPT_START_TIMESTAMP))
printf '完成: 成功 %s, 失败 %s, 总耗时 %ss\n' "$SUCCESS_COUNT" "$FAIL_COUNT" "$TOTAL_TIME"
if [ "$FAIL_COUNT" -eq 0 ]; then
  exit 0
fi
exit 1