feat: add --sync to export BQ tables directly to S3 without GCS intermediary

2026-03-29 17:39:13 +02:00
parent 43e5ae6723
commit 36acd1320c
2 changed files with 606 additions and 0 deletions

roda.sh

@@ -31,10 +31,13 @@ fi
DRY_RUN=false
GCLOUD_RUN=false
SYNC_RUN=false
# Scan all args so flags can be combined (the sync block below checks
# DRY_RUN, so --sync --dry-run must be a valid combination).
for arg in "$@"; do
  case "$arg" in
    --dry-run)    DRY_RUN=true ;;
    --gcloud-run) GCLOUD_RUN=true ;;
    --sync)       SYNC_RUN=true ;;
  esac
done
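With the loop above, flags can be given in any order and combined; illustrative invocations:

./roda.sh --sync              # full BigQuery → S3 sync
./roda.sh --sync --dry-run    # sync mode, but only list tables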
# -----------------------------------------------------------------------------
@@ -74,6 +77,18 @@ if [[ -z "${AWS_ACCESS_KEY_ID:-}" || -z "${AWS_SECRET_ACCESS_KEY:-}" ]]; then
exit 1
fi
# Validate GCP project (needed for --sync)
if $SYNC_RUN && [[ -z "${GCP_PROJECT:-}" ]]; then
  if [[ -z "${YOUR_PROJECT:-}" ]]; then
    log_err "GCP_PROJECT not found in .env. Add GCP_PROJECT or YOUR_PROJECT."
    exit 1
  fi
  log "GCP_PROJECT not set, using YOUR_PROJECT: $YOUR_PROJECT"
  export GCP_PROJECT="$YOUR_PROJECT"
fi
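A minimal .env for sync mode, assuming the variable names this script reads (all values are placeholders, not real credentials or endpoints):

# .env — placeholder values only
GCP_PROJECT=my-gcp-project        # or YOUR_PROJECT as a fallback
AWS_ACCESS_KEY_ID=PLACEHOLDER
AWS_SECRET_ACCESS_KEY=PLACEHOLDER
HETZNER_S3_BUCKET=my-bucket
HETZNER_S3_ENDPOINT=https://example.endpoint.invalid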
# Configure rclone remotes via env vars — no rclone.conf or inline credentials needed.
# GCS remote (bd:) uses Application Default Credentials from gcloud auth application-default login.
# Hetzner S3 remote (hz:) uses the credentials from .env, kept out of the process command line.
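A sketch of what that env-var remote setup can look like: rclone reads RCLONE_CONFIG_<REMOTE>_<OPTION> variables, so no config file is written. The exact options below are illustrative, not taken from this commit:

# GCS remote "bd:" — Application Default Credentials via env_auth
export RCLONE_CONFIG_BD_TYPE="google cloud storage"
export RCLONE_CONFIG_BD_ENV_AUTH="true"
# Hetzner S3 remote "hz:" — credentials come from .env, not the command line
export RCLONE_CONFIG_HZ_TYPE="s3"
export RCLONE_CONFIG_HZ_PROVIDER="Other"
export RCLONE_CONFIG_HZ_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID"
export RCLONE_CONFIG_HZ_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY"
export RCLONE_CONFIG_HZ_ENDPOINT="$HETZNER_S3_ENDPOINT"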
@@ -182,6 +197,54 @@ REMOTE_SETUP
exit 0
fi
# =============================================================================
# SYNC — BigQuery → S3 direct (no GCS intermediary)
# =============================================================================
if $SYNC_RUN; then
  log "=============================="
  log " SYNC MODE — BigQuery → S3"
  log "=============================="
  # Check command-line dependencies
  for cmd in python3; do
    if ! command -v "$cmd" &>/dev/null; then
      log_err "'$cmd' not found."
      exit 1
    fi
  done
  # Check Python dependencies (import name may differ from the pip package name)
  PYTHON_CHECKS="google.cloud.bigquery=google-cloud-bigquery boto3=boto3 pandas=pandas pyarrow=pyarrow"
  for check in $PYTHON_CHECKS; do
    module="${check%%=*}"
    pip_pkg="${check#*=}"
    if ! python3 -c "import ${module}" 2>/dev/null; then
      log_err "Missing Python package: ${pip_pkg}. Run: pip install google-cloud-bigquery boto3 pandas pyarrow"
      exit 1
    fi
  done
  # Set GCP_PROJECT for the Python script (validated above, so at least one is set)
  export GCP_PROJECT="${GCP_PROJECT:-${YOUR_PROJECT}}"
  log "GCP project: $GCP_PROJECT"
  log "S3 bucket: $HETZNER_S3_BUCKET"
  log "S3 endpoint: $HETZNER_S3_ENDPOINT"
  log ""
  if $DRY_RUN; then
    log "DRY RUN — listing tables only, no data will be transferred"
  fi
  # Run the sync script, forwarding all args except --sync (a roda.sh-only flag)
  SYNC_ARGS=()
  for arg in "$@"; do
    [[ "$arg" != "--sync" ]] && SYNC_ARGS+=("$arg")
  done
  # The ${arr[@]+...} expansion keeps `set -u` happy when the array is empty.
  python3 sync_bq_to_local.py "${SYNC_ARGS[@]+"${SYNC_ARGS[@]}"}"
  exit $?
fi
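Since only --sync is filtered out, every other flag reaches the Python script unchanged. With the combined flag parsing above, --dry-run both triggers the log line in this script and is forwarded, e.g.:

./roda.sh --sync --dry-run
# effectively runs: python3 sync_bq_to_local.py --dry-run
# (assumes sync_bq_to_local.py also understands --dry-run)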
# -----------------------------------------------------------------------------
# STEP 1 — Create GCS bucket in US region (same as basedosdados)
# -----------------------------------------------------------------------------