#!/usr/bin/env bash
# =============================================================================
# export_basedosdados.sh
# Exports all basedosdados BigQuery tables → GCS (Parquet+zstd) → Hetzner Object Storage
#
# Prerequisites (run once before this script):
#   gcloud auth login
#   gcloud auth application-default login
#   gcloud config set project YOUR_PROJECT_ID
#   cp .env.example .env   # then fill in your values
#
# Usage:
#   chmod +x export_basedosdados.sh
#   ./export_basedosdados.sh              # full run (locally)
#   ./export_basedosdados.sh --dry-run    # list tables + estimated sizes, no export
#   ./export_basedosdados.sh --gcloud-run # create GCP VM → run there → delete VM
# =============================================================================

set -euo pipefail

# Add util-linux to PATH on macOS (provides flock)
[[ -d "/opt/homebrew/opt/util-linux/bin" ]] && export PATH="/opt/homebrew/opt/util-linux/bin:$PATH"

# Load .env if present; set -a exports everything it defines (BUCKET_NAME,
# YOUR_PROJECT, SOURCE_PROJECT, AWS_* credentials, etc.).
if [[ -f "$(dirname "$0")/.env" ]]; then
  set -a
  # shellcheck source=.env
  source "$(dirname "$0")/.env"
  set +a
fi

# Mode flags: --dry-run only estimates sizes; --gcloud-run delegates the whole
# job to a Compute Engine VM and cleans it up afterwards.
DRY_RUN=false
GCLOUD_RUN=false
if [[ "${1:-}" == "--dry-run" ]]; then
  DRY_RUN=true
elif [[ "${1:-}" == "--gcloud-run" ]]; then
  GCLOUD_RUN=true
fi

# -----------------------------------------------------------------------------
# LOGGING
# -----------------------------------------------------------------------------
LOG_FILE="export_$(date +%Y%m%d_%H%M%S).log"
FAILED_FILE="failed_tables.txt"
DONE_FILE="done_tables.txt"
DONE_TRANSFERS_FILE="done_transfers.txt"

# Timestamped message to stdout + log file.
log()     { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "$LOG_FILE"; }
# Timestamped ERROR message to stderr + log file.
log_err() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: $*" | tee -a "$LOG_FILE" >&2; }

# -----------------------------------------------------------------------------
# STEP 0 — Verify dependencies
# -----------------------------------------------------------------------------
log "Checking dependencies..."
if $GCLOUD_RUN; then
  # Remote mode needs only gcloud locally; everything else is installed on the VM.
  for cmd in gcloud; do
    if ! command -v "$cmd" &>/dev/null; then
      log_err "'$cmd' not found. Install google-cloud-sdk."
      exit 1
    fi
  done
else
  for cmd in bq gcloud gsutil parallel rclone flock; do
    if ! command -v "$cmd" &>/dev/null; then
      log_err "'$cmd' not found. Install google-cloud-sdk, GNU parallel, and rclone."
      exit 1
    fi
  done
fi

# Validate S3 credentials
if [[ -z "${AWS_ACCESS_KEY_ID:-}" || -z "${AWS_SECRET_ACCESS_KEY:-}" ]]; then
  log_err "Credenciais S3 não encontradas. Preencha o .env com AWS_ACCESS_KEY_ID e AWS_SECRET_ACCESS_KEY."
  exit 1
fi

# Configure rclone remotes via env vars — no rclone.conf or inline credentials needed.
# GCS remote (bd:) uses Application Default Credentials from gcloud auth application-default login.
# Hetzner S3 remote (hz:) uses the credentials from .env, kept out of the process command line.
export RCLONE_CONFIG_BD_TYPE="google cloud storage"
export RCLONE_CONFIG_BD_BUCKET_POLICY_ONLY="true"
export RCLONE_CONFIG_HZ_TYPE="s3"
export RCLONE_CONFIG_HZ_PROVIDER="Other"
export RCLONE_CONFIG_HZ_ENDPOINT="$HETZNER_S3_ENDPOINT"
export RCLONE_CONFIG_HZ_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID"
export RCLONE_CONFIG_HZ_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY"

# =============================================================================
# GCLOUD RUN — create a Compute Engine VM, run the export there, then clean up
# =============================================================================
if $GCLOUD_RUN; then
  VM_NAME="${GCP_VM_NAME:-bd-export-vm}"
  VM_ZONE="${GCP_VM_ZONE:-us-central1-a}"
  SCRIPT_PATH="$(realpath "$0")"
  ENV_PATH="$(dirname "$SCRIPT_PATH")/.env"

  log "=============================="
  log " GCLOUD RUN MODE"
  log "=============================="

  # ── Step 1/4: Create instance ───────────────────────────────────────────
  log "[1/4] Creating VM: $VM_NAME ($VM_ZONE) ..."
  if gcloud compute instances describe "$VM_NAME" \
      --zone="$VM_ZONE" --project="$YOUR_PROJECT" &>/dev/null; then
    log " VM already exists, reusing it."
  else
    gcloud compute instances create "$VM_NAME" \
      --project="$YOUR_PROJECT" \
      --zone="$VM_ZONE" \
      --machine-type=e2-standard-4 \
      --image-family=debian-12 \
      --image-project=debian-cloud \
      --boot-disk-size=20GB \
      --scopes=cloud-platform
    log " VM created."
  fi

  # ── Step 2/4: Wait for SSH + copy files ────────────────────────────────
  log "[2/4] Waiting for SSH and copying files..."
  ssh_ready=false
  for i in {1..18}; do
    if gcloud compute ssh "$VM_NAME" \
        --zone="$VM_ZONE" --project="$YOUR_PROJECT" \
        --command="echo ready" 2>/dev/null; then
      ssh_ready=true
      break
    fi
    log " SSH not ready yet ($i/18), retrying in 10s..."
    sleep 10
  done
  # BUGFIX: previously the script continued even if SSH never came up,
  # producing a confusing scp failure later. Fail fast instead.
  if ! $ssh_ready; then
    log_err "VM never became reachable over SSH after 18 attempts; aborting."
    exit 1
  fi

  # BUGFIX: steps 3/4 and 4/4 expect the script at ~/roda.sh, but the old scp
  # copied it under its local basename. Copy each file to an explicit remote
  # name so the remote commands always find it, regardless of the local name.
  gcloud compute scp "$SCRIPT_PATH" "$VM_NAME:~/roda.sh" \
    --zone="$VM_ZONE" \
    --project="$YOUR_PROJECT"
  gcloud compute scp "$ENV_PATH" "$VM_NAME:~/.env" \
    --zone="$VM_ZONE" \
    --project="$YOUR_PROJECT"
  log " Files copied."

  # ── Step 3/4: Install dependencies ─────────────────────────────────────
  log "[3/4] Installing dependencies on VM (~2 min)..."
  gcloud compute ssh "$VM_NAME" \
    --zone="$VM_ZONE" \
    --project="$YOUR_PROJECT" \
    --command="bash -s" <<'REMOTE_SETUP'
set -euo pipefail
export DEBIAN_FRONTEND=noninteractive
sudo apt-get update -qq
sudo apt-get install -y apt-transport-https ca-certificates gnupg curl parallel rclone
curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg \
  | sudo gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
  | sudo tee /etc/apt/sources.list.d/google-cloud-sdk.list >/dev/null
sudo apt-get update -qq
sudo apt-get install -y google-cloud-cli
chmod +x ~/roda.sh
echo "Dependencies installed."
REMOTE_SETUP
  log " Dependencies ready."

  # ── Step 4/4: Run the export script interactively ──────────────────────
  log "[4/4] Launching roda.sh on VM — answer prompts as they appear."
  gcloud compute ssh "$VM_NAME" \
    --zone="$VM_ZONE" \
    --project="$YOUR_PROJECT" \
    -- bash ~/roda.sh

  # ── Cleanup: Delete VM ──────────────────────────────────────────────────
  echo ""
  echo "============================================================"
  echo " CLEANUP"
  echo "============================================================"
  read -rp "Delete VM instance $VM_NAME? [y/N] " del_vm
  if [[ "$del_vm" =~ ^[Yy]$ ]]; then
    log "Deleting VM $VM_NAME ..."
    gcloud compute instances delete "$VM_NAME" \
      --zone="$VM_ZONE" \
      --project="$YOUR_PROJECT" \
      --quiet
    log "VM deleted."
  else
    log "VM kept. To delete manually:"
    log " gcloud compute instances delete $VM_NAME --zone=$VM_ZONE --project=$YOUR_PROJECT"
  fi
  exit 0
fi

# -----------------------------------------------------------------------------
# STEP 1 — Create GCS bucket in US region (same as basedosdados)
# -----------------------------------------------------------------------------
if $DRY_RUN; then
  log "[DRY RUN] Would create GCS bucket: gs://$BUCKET_NAME in region $BUCKET_REGION"
else
  log "Creating GCS bucket: gs://$BUCKET_NAME in region $BUCKET_REGION"
  if gsutil ls "gs://$BUCKET_NAME" &>/dev/null; then
    log "Bucket already exists, skipping creation."
  else
    gsutil mb \
      -p "$YOUR_PROJECT" \
      -l "$BUCKET_REGION" \
      -b on \
      "gs://$BUCKET_NAME"
    log "Bucket created: gs://$BUCKET_NAME"
  fi
fi

# Resume support: load already-done tables/transfers
touch "$DONE_FILE" "$FAILED_FILE" "$DONE_TRANSFERS_FILE"

# -----------------------------------------------------------------------------
# STEP 2 — Build the full table list from the basedosdados project
#
# We auto-discover all datasets and tables via the BQ API so we don't rely
# on a hardcoded list. This also detects any new tables added since the
# tables-summary.md was written.
#
# Atomicity: we write to a .tmp file and mv it into place only on success,
# so an interrupted run never leaves a partial list behind.
# -----------------------------------------------------------------------------
log "Discovering all datasets in project: $SOURCE_PROJECT ..."
TABLE_LIST_FILE="all_tables.txt"
TABLE_LIST_TMP="${TABLE_LIST_FILE}.tmp"

if [[ ! -f "$TABLE_LIST_FILE" ]]; then
  # One bq call lists every dataset; python3 pulls the ids out of the JSON.
  bq ls --project_id="$SOURCE_PROJECT" --max_results=10000 --format=json 2>/dev/null \
    | python3 -c "
import json, sys
datasets = json.load(sys.stdin)
for ds in datasets:
    print(ds['datasetReference']['datasetId'])
" > /tmp/datasets.txt

  log "Found $(wc -l < /tmp/datasets.txt) datasets. Listing tables in parallel..."

  TMP_TABLE_DIR=$(mktemp -d)

  # List one dataset's tables into its own file under $3 (one file per dataset
  # avoids interleaved writes from parallel workers). Only TABLE/EXTERNAL are
  # kept — views cannot be bq-extracted.
  list_dataset_tables() {
    local dataset="$1"
    local source="$2"
    local tmp_dir="$3"
    bq ls \
      --project_id="$source" \
      --dataset_id="$source:$dataset" \
      --max_results=10000 \
      --format=json 2>/dev/null \
      | python3 -c "
import json, sys
data = sys.stdin.read()
if not data.strip():
    sys.exit(0)
for t in json.loads(data):
    ref = t.get('tableReference', {})
    if t.get('type') in ('TABLE', 'EXTERNAL'):
        print(ref['datasetId'] + '.' + ref['tableId'])
" > "$tmp_dir/$dataset.txt"
  }
  export -f list_dataset_tables

  parallel --jobs 16 list_dataset_tables {} "$SOURCE_PROJECT" "$TMP_TABLE_DIR" < /tmp/datasets.txt

  cat "$TMP_TABLE_DIR"/*.txt | sort > "$TABLE_LIST_TMP"
  rm -rf "$TMP_TABLE_DIR"
  mv "$TABLE_LIST_TMP" "$TABLE_LIST_FILE"
  log "Total tables discovered: $(wc -l < "$TABLE_LIST_FILE")"
else
  log "Reusing existing table list: $TABLE_LIST_FILE ($(wc -l < "$TABLE_LIST_FILE") tables)"
fi

# -----------------------------------------------------------------------------
# DRY RUN — show table count and exit
# -----------------------------------------------------------------------------
if $DRY_RUN; then
  TOTAL=$(wc -l < "$TABLE_LIST_FILE")
  log "[DRY RUN] $TOTAL tables found. No exports will run."
  log "[DRY RUN] Estimating total size via bq show in parallel (this may take a while)..."

  # Print a table's logical size in bytes; prints 0 on any bq/JSON error so
  # the awk summation never sees an empty field.
  get_table_bytes() {
    local table="$1"
    local source="$2"
    # Parameter expansion instead of echo|cut: no subshells per table, and
    # identical behavior for BigQuery ids (which cannot contain dots).
    local dataset="${table%%.*}"
    local table_id="${table#*.}"
    bq show --format=json "${source}:${dataset}.${table_id}" 2>/dev/null \
      | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('numBytes','0'))" 2>/dev/null \
      || echo 0
  }
  export -f get_table_bytes

  TOTAL_BYTES=$(parallel --jobs 16 get_table_bytes {} "$SOURCE_PROJECT" < "$TABLE_LIST_FILE" \
    | awk '{s+=$1} END{print s+0}')

  # BUGFIX: this arithmetic previously used bc, which STEP 0 never checks for
  # and which is absent from minimal Debian images (including the VM created
  # by --gcloud-run). awk is POSIX and always present.
  TOTAL_GB=$(awk -v b="$TOTAL_BYTES" 'BEGIN{printf "%.2f", b / 1073741824}')
  # Parquet+zstd typically compresses structured data 5–10x vs BigQuery's raw numBytes
  COMPRESSED_LOW=$(awk -v g="$TOTAL_GB" 'BEGIN{printf "%.2f", g / 10}')
  COMPRESSED_HIGH=$(awk -v g="$TOTAL_GB" 'BEGIN{printf "%.2f", g / 5}')
  EGRESS_LOW=$(awk -v g="$COMPRESSED_LOW" 'BEGIN{printf "%.2f", g * 0.08}')
  EGRESS_HIGH=$(awk -v g="$COMPRESSED_HIGH" 'BEGIN{printf "%.2f", g * 0.12}')

  log "[DRY RUN] BigQuery raw size (uncompressed): ~${TOTAL_GB} GB"
  log "[DRY RUN] Estimated Parquet+zstd size: ~${COMPRESSED_LOW}–${COMPRESSED_HIGH} GB"
  log "[DRY RUN] Estimated GCS→Hetzner egress cost: USD ${EGRESS_LOW}–${EGRESS_HIGH}"
  log "[DRY RUN] Done. Remove --dry-run to start the actual export."
  exit 0
fi

# -----------------------------------------------------------------------------
# COST WARNING — confirm before starting export
# -----------------------------------------------------------------------------
echo ""
echo "============================================================"
echo " COST WARNING"
echo " Transferring data from GCS to Hetzner costs ~\$0.08-0.12/GB"
echo " in internet egress fees charged to: $YOUR_PROJECT"
echo " Run with --dry-run first to estimate the total size."
echo "============================================================"
echo ""
read -rp "Press ENTER to start the export, or Ctrl+C to abort: "

# -----------------------------------------------------------------------------
# STEP 3 — Export function (called in parallel)
# -----------------------------------------------------------------------------
# Export one BigQuery table to GCS as ZSTD-compressed Parquet.
#   $1 table ("dataset.table")  $2 GCS bucket       $3 billing project
#   $4 source project           $5 done-list file   $6 failed-list file
#   $7 log file
# Appends to $5/$6 under flock because many workers write concurrently.
# Always returns 0 except on unexpected internal errors.
export_table() {
  local table="$1"
  local bucket="$2"
  local project="$3"
  local source="$4"
  local done_file="$5"
  local failed_file="$6"
  local log_file="$7"

  # Skip if already done
  if grep -qxF "$table" "$done_file" 2>/dev/null; then
    echo "[SKIP] $table (already exported)" >> "$log_file"
    return 0
  fi

  local dataset table_id gcs_prefix
  dataset=$(echo "$table" | cut -d. -f1)
  table_id=$(echo "$table" | cut -d. -f2)
  gcs_prefix="gs://$bucket/$dataset/$table_id"

  echo "[START] Exporting $source:$table → $gcs_prefix/*.parquet" >> "$log_file"

  # Run bq extract with retry (up to 3 attempts)
  # Skip retries immediately if the error is a known incompatible type
  local attempt=0
  local success=false
  local output
  while [[ $attempt -lt 3 ]]; do
    attempt=$((attempt + 1))
    output=$(bq extract \
      --project_id="$project" \
      --destination_format=PARQUET \
      --compression=ZSTD \
      --location=US \
      "${source}:${dataset}.${table_id}" \
      "${gcs_prefix}/*.parquet" \
      2>&1)
    # $? is expanded before `local` runs, so this correctly captures the
    # bq extract exit status (splitting the declaration would capture local's).
    local exit_code=$?
    echo "$output" >> "$log_file"

    if [[ $exit_code -eq 0 ]]; then
      success=true
      break
    fi

    # Detect permanently incompatible types — no point retrying
    if echo "$output" | grep -qi "not supported\|unsupported type\|GEOGRAPHY\|JSON type"; then
      echo "[SKIP_INCOMPATIBLE] $table — unsupported column type, skipping retries" >> "$log_file"
      flock "$failed_file" bash -c "echo '[INCOMPATIBLE] $table' >> '$failed_file'"
      return 0
    fi

    # Detect access/permission errors — no point retrying
    if echo "$output" | grep -qi "access denied\|permission denied\|not authorized\|403\|does not exist\|Not found"; then
      echo "[SKIP_ACCESS] $table — access denied or not found, skipping retries" >> "$log_file"
      flock "$failed_file" bash -c "echo '[ACCESS_DENIED] $table' >> '$failed_file'"
      return 0
    fi

    # BUGFIX: only log a retry and back off when another attempt will actually
    # run; previously the final failure also logged "[RETRY 3/3]" and slept
    # 30 seconds for nothing.
    if [[ $attempt -lt 3 ]]; then
      echo "[RETRY $attempt/3] $table" >> "$log_file"
      sleep $((attempt * 10))
    fi
  done

  if $success; then
    # flock prevents race condition when multiple workers write concurrently
    flock "$done_file" bash -c "echo '$table' >> '$done_file'"
    echo "[DONE] $table" >> "$log_file"
  else
    flock "$failed_file" bash -c "echo '$table' >> '$failed_file'"
    echo "[FAIL] $table after 3 attempts" >> "$log_file"
  fi
}
export -f export_table

# -----------------------------------------------------------------------------
# STEP 4 — Run exports in parallel
# -----------------------------------------------------------------------------
log "Starting parallel exports ($PARALLEL_EXPORTS workers)..."
log "Progress is logged to: $LOG_FILE"
log "Failed tables will be written to: $FAILED_FILE"

# Filter out already-done tables, then fan the remainder out to the workers.
comm -23 \
  <(sort "$TABLE_LIST_FILE") \
  <(sort "$DONE_FILE") \
  | parallel \
      --jobs "$PARALLEL_EXPORTS" \
      --progress \
      --bar \
      export_table {} "$BUCKET_NAME" "$YOUR_PROJECT" "$SOURCE_PROJECT" \
      "$DONE_FILE" "$FAILED_FILE" "$LOG_FILE" \
  || true # failures are tracked in $FAILED_FILE; don't let parallel's exit code abort the script

TOTAL=$(wc -l < "$TABLE_LIST_FILE")
DONE=$(wc -l < "$DONE_FILE")
FAILED=$(wc -l < "$FAILED_FILE")
log "Export phase complete: $DONE/$TOTAL done, $FAILED failed"

if [[ $FAILED -gt 0 ]]; then
  log "Failed tables:"
  tee -a "$LOG_FILE" < "$FAILED_FILE"
  # BUGFIX: the old message advertised a --retry-failed flag that the script
  # never implemented (only --dry-run / --gcloud-run exist). Resume already
  # works by re-running: tables in $DONE_FILE are skipped, the rest retried.
  log "To retry failed tables, simply re-run: bash $0"
fi
TRANSFER_LOG_DIR=$(mktemp -d)

# Compute total datasets in GCS bucket once (used for progress display)
TRANSFER_TOTAL=$(gsutil ls "gs://$BUCKET_NAME/" | wc -l)
export TRANSFER_TOTAL

# Stream one dataset's objects GCS → Hetzner via rclone (RAM only, no disk).
#   $1 dataset              $2 GCS bucket            $3 S3 bucket
#   $4 rclone concurrency   $5 done-transfers file   $6 per-dataset log dir
#   $7 total dataset count (progress display)
# Writes its own log under $6; returns non-zero so parallel records a failure.
download_dataset() {
  local dataset="$1"
  local bucket="$2"
  local s3_bucket="$3"
  local s3_concurrency="$4"
  local done_transfers_file="$5"
  local log_dir="$6"
  local total="$7"
  local dataset_log="$log_dir/${dataset}.log"

  # Resume: skip datasets already transferred
  if grep -qxF "$dataset" "$done_transfers_file" 2>/dev/null; then
    echo "[SKIP_TRANSFER] $dataset (already transferred)" > "$dataset_log"
    return 0
  fi

  echo "[TRANSFER] gs://$bucket/$dataset/ → hz:$s3_bucket/$dataset/" > "$dataset_log"

  # Named remotes bd: (GCS) and hz: (Hetzner S3) are configured via RCLONE_CONFIG_* env vars
  if rclone copy \
      "bd:$bucket/$dataset/" \
      "hz:$s3_bucket/$dataset/" \
      --transfers "$s3_concurrency" \
      --s3-upload-concurrency "$s3_concurrency" \
      --progress \
      >> "$dataset_log" 2>&1; then
    flock "$done_transfers_file" bash -c "echo '$dataset' >> '$done_transfers_file'"
    echo "[TRANSFERRED] $dataset" >> "$dataset_log"
    local done_count
    done_count=$(wc -l < "$done_transfers_file")
    # BUGFIX: guard the percentage against division by zero when the initial
    # bucket listing was empty ($total == 0).
    local pct=0
    if (( total > 0 )); then
      pct=$(( done_count * 100 / total ))
    fi
    echo "[${done_count}/${total}] ${pct}% datasets transferidos"
  else
    echo "[TRANSFER FAIL] rclone failed for $dataset" >> "$dataset_log"
    return 1
  fi
}
export -f download_dataset

# Get list of exported datasets, skipping already-transferred ones
comm -23 \
  <(gsutil ls "gs://$BUCKET_NAME/" | sed 's|gs://[^/]*/||;s|/||' | sort -u) \
  <(sort "$DONE_TRANSFERS_FILE") \
  | parallel \
      --jobs "$PARALLEL_UPLOADS" \
      download_dataset {} "$BUCKET_NAME" "$HETZNER_S3_BUCKET" "$S3_CONCURRENCY" "$DONE_TRANSFERS_FILE" "$TRANSFER_LOG_DIR" "$TRANSFER_TOTAL" \
  || true # failures are tracked per-dataset; don't abort

# Merge per-dataset logs into main log in order.
# BUGFIX(idiom): iterate the glob directly instead of parsing `ls` output —
# globs already expand in sorted order and this survives odd filenames.
for f in "$TRANSFER_LOG_DIR"/*.log; do
  [[ -e "$f" ]] || continue # glob matched nothing
  cat "$f" >> "$LOG_FILE"
done
rm -rf "$TRANSFER_LOG_DIR"
log "Transfer complete."
# -----------------------------------------------------------------------------
# STEP 6 — Verify file counts on Hetzner Object Storage vs GCS
# -----------------------------------------------------------------------------
log "Verifying file counts..."
# BUGFIX: grep exits 1 when it matches nothing, which under `set -euo pipefail`
# killed the whole script whenever either side had zero parquet files.
# `grep -c` still prints 0 in that case, and `|| true` keeps the run alive.
GCS_COUNT=$(gsutil ls -r "gs://$BUCKET_NAME/**" | grep -c '\.parquet$' || true)
S3_COUNT=$(rclone ls "hz:$HETZNER_S3_BUCKET" 2>/dev/null | grep -c '\.parquet$' || true)
log "GCS parquet files: $GCS_COUNT"
log "S3 parquet files: $S3_COUNT"

if [[ "$GCS_COUNT" -eq "$S3_COUNT" ]]; then
  log "File counts match. Transfer verified."
else
  log_err "Count mismatch! GCS=$GCS_COUNT S3=$S3_COUNT"
  log_err "Re-run the script to resume failed datasets or check $LOG_FILE for errors."
fi

# -----------------------------------------------------------------------------
# STEP 7 — Clean up GCS bucket to stop storage charges
# -----------------------------------------------------------------------------
read -rp "Delete GCS bucket gs://$BUCKET_NAME to stop storage charges? [y/N] " confirm
if [[ "$confirm" =~ ^[Yy]$ ]]; then
  log "Deleting bucket gs://$BUCKET_NAME ..."
  gsutil -m rm -r "gs://$BUCKET_NAME"
  # BUGFIX: `gsutil rm -r gs://bucket` also removes the bucket itself, so the
  # follow-up `rb` can fail with "bucket does not exist" and — under set -e —
  # abort the script before the final summary. Tolerate that case.
  gsutil rb "gs://$BUCKET_NAME" 2>/dev/null || true
  log "Bucket deleted. Storage charges stopped."
else
  log "Bucket kept. Remember to delete it later: gsutil -m rm -r gs://$BUCKET_NAME && gsutil rb gs://$BUCKET_NAME"
fi

log "All done! Data is at s3://$HETZNER_S3_BUCKET/ ($HETZNER_S3_ENDPOINT)"
log "Total exported: $DONE tables | Failed: $FAILED tables"
log "See $LOG_FILE for full details."