feat: add --sync to export BQ tables directly to S3 without GCS intermediary
This commit is contained in:
63
roda.sh
63
roda.sh
@@ -31,10 +31,13 @@ fi
|
||||
|
||||
# -----------------------------------------------------------------------------
# CLI flags. Scan every argument (not just $1) so flags can be combined —
# e.g. `roda.sh --sync --dry-run`: the sync section below checks $DRY_RUN
# and forwards the remaining args to the Python sync script.
# Single-flag invocations behave exactly as before.
# -----------------------------------------------------------------------------
DRY_RUN=false
GCLOUD_RUN=false
SYNC_RUN=false
for arg in "$@"; do
  case "$arg" in
    --dry-run)    DRY_RUN=true ;;
    --gcloud-run) GCLOUD_RUN=true ;;
    --sync)       SYNC_RUN=true ;;
    *) ;;  # unknown args are left alone; downstream tools receive them via "$@"
  esac
done
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
@@ -74,6 +77,18 @@ if [[ -z "${AWS_ACCESS_KEY_ID:-}" || -z "${AWS_SECRET_ACCESS_KEY:-}" ]]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Validate GCP project (needed for --sync).
# When GCP_PROJECT is absent from the environment, fall back to YOUR_PROJECT;
# abort if neither is available, since sync mode cannot run without a project.
if [[ -z "${GCP_PROJECT:-}" ]] && $SYNC_RUN; then
  if [[ -z "${YOUR_PROJECT:-}" ]]; then
    log_err "GCP_PROJECT não encontrado no .env. Adicione GCP_PROJECT ou YOUR_PROJECT."
    exit 1
  fi
  log "GCP_PROJECT not set, using YOUR_PROJECT: $YOUR_PROJECT"
  export GCP_PROJECT="$YOUR_PROJECT"
fi
|
||||
|
||||
# Configure rclone remotes via env vars — no rclone.conf or inline credentials needed.
|
||||
# GCS remote (bd:) uses Application Default Credentials from gcloud auth application-default login.
|
||||
# Hetzner S3 remote (hz:) uses the credentials from .env, kept out of the process command line.
|
||||
@@ -182,6 +197,54 @@ REMOTE_SETUP
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# =============================================================================
# SYNC — BigQuery → S3 direct (no GCS intermediary)
# =============================================================================
if $SYNC_RUN; then
  log "=============================="
  log " SYNC MODE — BigQuery → S3"
  log "=============================="

  # System dependencies required by this mode.
  for cmd in python3; do
    if ! command -v "$cmd" &>/dev/null; then
      log_err "'$cmd' not found."
      exit 1
    fi
  done

  # Python dependencies. Each entry is "import_name=pip_package": the import
  # path can differ from the pip package name (google.cloud.bigquery is
  # installed as google-cloud-bigquery), so the error message must report the
  # pip name, not the import path.
  for check in \
      "google.cloud.bigquery=google-cloud-bigquery" \
      "boto3=boto3" \
      "pandas=pandas" \
      "pyarrow=pyarrow"; do
    module="${check%%=*}"
    pip_pkg="${check#*=}"
    if ! python3 -c "import ${module}" 2>/dev/null; then
      log_err "Missing Python package: ${pip_pkg}. Run: pip install google-cloud-bigquery boto3 pandas pyarrow"
      exit 1
    fi
  done

  # Expose the project to the Python script. The validation block earlier in
  # this file guarantees that at least one of GCP_PROJECT / YOUR_PROJECT is
  # set by the time sync mode reaches this line.
  export GCP_PROJECT="${GCP_PROJECT:-${YOUR_PROJECT}}"

  log "GCP project: $GCP_PROJECT"
  log "S3 bucket: $HETZNER_S3_BUCKET"
  log "S3 endpoint: $HETZNER_S3_ENDPOINT"
  log ""

  if $DRY_RUN; then
    log "DRY RUN — listing tables only, no data will be transferred"
  fi

  # Forward all CLI args to the sync script except --sync itself, which is a
  # roda.sh flag the Python script does not understand.
  SYNC_ARGS=()
  for arg in "$@"; do
    if [[ "$arg" != "--sync" ]]; then
      SYNC_ARGS+=("$arg")
    fi
  done
  # ${arr[@]+...} guards against "unbound variable" on empty arrays under
  # `set -u` in bash < 4.4.
  python3 sync_bq_to_local.py "${SYNC_ARGS[@]+"${SYNC_ARGS[@]}"}"
  exit $?
fi
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# STEP 1 — Create GCS bucket in US region (same as basedosdados)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user