# Source listing: basedosdados/roda.sh (Bash, executable, 528 lines / 20 KiB).
# NOTE: the hosting page flagged "ambiguous Unicode characters" — the arrows
# (→) and dashes appearing in comments and log strings below are intentional.
#!/usr/bin/env bash
# =============================================================================
# export_basedosdados.sh
# Exports all basedosdados BigQuery tables → GCS (Parquet+zstd) → Hetzner Object Storage
#
# Prerequisites (run once before this script):
#   gcloud auth login
#   gcloud auth application-default login
#   gcloud config set project YOUR_PROJECT_ID
#   cp .env.example .env   # then fill in your values
#
# Usage:
#   chmod +x export_basedosdados.sh
#   ./export_basedosdados.sh               # full run (locally)
#   ./export_basedosdados.sh --dry-run     # list tables + estimated sizes, no export
#   ./export_basedosdados.sh --gcloud-run  # create GCP VM → run there → delete VM
# =============================================================================
set -euo pipefail

# macOS: Homebrew's util-linux (which provides flock) is not on PATH by default.
if [[ -d "/opt/homebrew/opt/util-linux/bin" ]]; then
  export PATH="/opt/homebrew/opt/util-linux/bin:$PATH"
fi

# Pull configuration from .env next to this script, exporting every variable
# so child processes (parallel workers, rclone) inherit them.
script_dir="$(dirname "$0")"
if [[ -f "$script_dir/.env" ]]; then
  set -a
  # shellcheck source=.env
  source "$script_dir/.env"
  set +a
fi

# Mode flags — only the first argument is inspected; anything else is ignored.
DRY_RUN=false
GCLOUD_RUN=false
case "${1:-}" in
  --dry-run)    DRY_RUN=true ;;
  --gcloud-run) GCLOUD_RUN=true ;;
esac
# -----------------------------------------------------------------------------
# LOGGING
# -----------------------------------------------------------------------------
LOG_FILE="export_$(date +%Y%m%d_%H%M%S).log"
FAILED_FILE="failed_tables.txt"
DONE_FILE="done_tables.txt"
DONE_TRANSFERS_FILE="done_transfers.txt"

# Timestamped logger: echoes to stdout and appends to $LOG_FILE.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG_FILE"
}

# Same as log(), but tagged ERROR and emitted on stderr.
log_err() {
  printf '[%s] ERROR: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG_FILE" >&2
}
# -----------------------------------------------------------------------------
# STEP 0 — Verify dependencies and required configuration
# -----------------------------------------------------------------------------
log "Checking dependencies..."
if $GCLOUD_RUN; then
  # VM mode only needs gcloud locally; everything else is installed on the VM.
  for cmd in gcloud; do
    if ! command -v "$cmd" &>/dev/null; then
      log_err "'$cmd' not found. Install google-cloud-sdk."
      exit 1
    fi
  done
else
  for cmd in bq gcloud gsutil parallel rclone flock; do
    if ! command -v "$cmd" &>/dev/null; then
      log_err "'$cmd' not found. Install google-cloud-sdk, GNU parallel, and rclone."
      exit 1
    fi
  done
fi

# Validate S3 credentials
if [[ -z "${AWS_ACCESS_KEY_ID:-}" || -z "${AWS_SECRET_ACCESS_KEY:-}" ]]; then
  log_err "Credenciais S3 não encontradas. Preencha o .env com AWS_ACCESS_KEY_ID e AWS_SECRET_ACCESS_KEY."
  exit 1
fi
# Fail fast with a clear message instead of set -u's opaque "unbound variable"
# abort when the Hetzner endpoint is missing from .env (it is expanded below).
if [[ -z "${HETZNER_S3_ENDPOINT:-}" ]]; then
  log_err "HETZNER_S3_ENDPOINT não encontrado. Preencha o .env com HETZNER_S3_ENDPOINT."
  exit 1
fi

# Configure rclone remotes via env vars — no rclone.conf or inline credentials needed.
# GCS remote (bd:) uses Application Default Credentials from gcloud auth application-default login.
# Hetzner S3 remote (hz:) uses the credentials from .env, kept out of the process command line.
export RCLONE_CONFIG_BD_TYPE="google cloud storage"
export RCLONE_CONFIG_BD_BUCKET_POLICY_ONLY="true"
export RCLONE_CONFIG_HZ_TYPE="s3"
export RCLONE_CONFIG_HZ_PROVIDER="Other"
export RCLONE_CONFIG_HZ_ENDPOINT="$HETZNER_S3_ENDPOINT"
export RCLONE_CONFIG_HZ_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID"
export RCLONE_CONFIG_HZ_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY"
# =============================================================================
# GCLOUD RUN — create a Compute Engine VM, run the export there, then clean up
# =============================================================================
if $GCLOUD_RUN; then
  VM_NAME="${GCP_VM_NAME:-bd-export-vm}"
  VM_ZONE="${GCP_VM_ZONE:-us-central1-a}"
  SCRIPT_PATH="$(realpath "$0")"
  ENV_PATH="$(dirname "$SCRIPT_PATH")/.env"
  log "=============================="
  log " GCLOUD RUN MODE"
  log "=============================="

  # ── Step 1/4: Create instance (idempotent — reuse if it already exists) ──
  log "[1/4] Creating VM: $VM_NAME ($VM_ZONE) ..."
  if gcloud compute instances describe "$VM_NAME" \
      --zone="$VM_ZONE" --project="$YOUR_PROJECT" &>/dev/null; then
    log " VM already exists, reusing it."
  else
    gcloud compute instances create "$VM_NAME" \
      --project="$YOUR_PROJECT" \
      --zone="$VM_ZONE" \
      --machine-type=e2-standard-4 \
      --image-family=debian-12 \
      --image-project=debian-cloud \
      --boot-disk-size=20GB \
      --scopes=cloud-platform
    log " VM created."
  fi

  # ── Step 2/4: Wait for SSH + copy files ────────────────────────────────
  log "[2/4] Waiting for SSH and copying files..."
  ssh_ready=false
  for i in {1..18}; do
    if gcloud compute ssh "$VM_NAME" \
        --zone="$VM_ZONE" --project="$YOUR_PROJECT" \
        --command="echo ready" 2>/dev/null; then
      ssh_ready=true
      break
    fi
    log " SSH not ready yet ($i/18), retrying in 10s..."
    sleep 10
  done
  if ! $ssh_ready; then
    log_err "VM never became reachable over SSH after 18 attempts (~3 min). Aborting."
    exit 1
  fi
  # Copy under fixed remote names: the remote steps below hardcode ~/roda.sh,
  # so copying with the local basename (as a single scp used to) breaks
  # whenever this script is not named roda.sh.
  gcloud compute scp "$SCRIPT_PATH" "$VM_NAME:~/roda.sh" \
    --zone="$VM_ZONE" \
    --project="$YOUR_PROJECT"
  gcloud compute scp "$ENV_PATH" "$VM_NAME:~/.env" \
    --zone="$VM_ZONE" \
    --project="$YOUR_PROJECT"
  log " Files copied."

  # ── Step 3/4: Install dependencies ─────────────────────────────────────
  # bc is included because the --dry-run estimate uses it and the minimal
  # Debian image does not ship it.
  log "[3/4] Installing dependencies on VM (~2 min)..."
  gcloud compute ssh "$VM_NAME" \
    --zone="$VM_ZONE" \
    --project="$YOUR_PROJECT" \
    --command="bash -s" <<'REMOTE_SETUP'
set -euo pipefail
export DEBIAN_FRONTEND=noninteractive
sudo apt-get update -qq
sudo apt-get install -y apt-transport-https ca-certificates gnupg curl parallel rclone bc
curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg \
  | sudo gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
  | sudo tee /etc/apt/sources.list.d/google-cloud-sdk.list >/dev/null
sudo apt-get update -qq
sudo apt-get install -y google-cloud-cli
chmod +x ~/roda.sh
echo "Dependencies installed."
REMOTE_SETUP
  log " Dependencies ready."

  # ── Step 4/4: Run the export script interactively ──────────────────────
  log "[4/4] Launching roda.sh on VM — answer prompts as they appear."
  gcloud compute ssh "$VM_NAME" \
    --zone="$VM_ZONE" \
    --project="$YOUR_PROJECT" \
    -- bash ~/roda.sh

  # ── Cleanup: Delete VM ──────────────────────────────────────────────────
  echo ""
  echo "============================================================"
  echo " CLEANUP"
  echo "============================================================"
  read -rp "Delete VM instance $VM_NAME? [y/N] " del_vm
  if [[ "$del_vm" =~ ^[Yy]$ ]]; then
    log "Deleting VM $VM_NAME ..."
    gcloud compute instances delete "$VM_NAME" \
      --zone="$VM_ZONE" \
      --project="$YOUR_PROJECT" \
      --quiet
    log "VM deleted."
  else
    log "VM kept. To delete manually:"
    log " gcloud compute instances delete $VM_NAME --zone=$VM_ZONE --project=$YOUR_PROJECT"
  fi
  exit 0
fi
# -----------------------------------------------------------------------------
# STEP 1 — Create GCS bucket in US region (same as basedosdados)
# -----------------------------------------------------------------------------
if $DRY_RUN; then
  log "[DRY RUN] Would create GCS bucket: gs://$BUCKET_NAME in region $BUCKET_REGION"
else
  log "Creating GCS bucket: gs://$BUCKET_NAME in region $BUCKET_REGION"
  # Idempotent: only create the bucket when it does not already exist.
  if gsutil ls "gs://$BUCKET_NAME" &>/dev/null; then
    log "Bucket already exists, skipping creation."
  else
    # -b on = uniform bucket-level access
    gsutil mb -p "$YOUR_PROJECT" -l "$BUCKET_REGION" -b on "gs://$BUCKET_NAME"
    log "Bucket created: gs://$BUCKET_NAME"
  fi
fi

# Resume support: make sure the resume ledgers exist before anything reads them.
touch "$DONE_FILE" "$FAILED_FILE" "$DONE_TRANSFERS_FILE"
# -----------------------------------------------------------------------------
# STEP 2 — Build the full table list from the basedosdados project
#
# We auto-discover all datasets and tables via the BQ API so we don't rely
# on a hardcoded list. This also detects any new tables added since the
# tables-summary.md was written.
#
# Atomicity: we write to a .tmp file and mv it into place only on success,
# so an interrupted run never leaves a partial list behind.
# -----------------------------------------------------------------------------
log "Discovering all datasets in project: $SOURCE_PROJECT ..."
TABLE_LIST_FILE="all_tables.txt"
TABLE_LIST_TMP="${TABLE_LIST_FILE}.tmp"
if [[ ! -f "$TABLE_LIST_FILE" ]]; then
  # Private scratch file instead of the predictable /tmp/datasets.txt
  # (avoids clobbering/symlink surprises on shared machines).
  DATASETS_FILE="$(mktemp)"
  bq ls --project_id="$SOURCE_PROJECT" --max_results=10000 --format=json 2>/dev/null \
    | python3 -c "
import json, sys
datasets = json.load(sys.stdin)
for ds in datasets:
    print(ds['datasetReference']['datasetId'])
" > "$DATASETS_FILE"
  log "Found $(wc -l < "$DATASETS_FILE") datasets. Listing tables in parallel..."
  TMP_TABLE_DIR=$(mktemp -d)
  # List every TABLE/EXTERNAL table of one dataset into its own scratch file.
  list_dataset_tables() {
    local dataset="$1"
    local source="$2"
    local tmp_dir="$3"
    bq ls \
      --project_id="$source" \
      --dataset_id="$source:$dataset" \
      --max_results=10000 \
      --format=json 2>/dev/null \
      | python3 -c "
import json, sys
data = sys.stdin.read()
if not data.strip():
    sys.exit(0)
for t in json.loads(data):
    ref = t.get('tableReference', {})
    if t.get('type') in ('TABLE', 'EXTERNAL'):
        print(ref['datasetId'] + '.' + ref['tableId'])
" > "$tmp_dir/$dataset.txt"
  }
  export -f list_dataset_tables
  parallel --jobs 16 list_dataset_tables {} "$SOURCE_PROJECT" "$TMP_TABLE_DIR" < "$DATASETS_FILE"
  # find exits 0 even when no per-dataset file was produced; `cat dir/*.txt`
  # on an unmatched glob would abort the whole script under set -e/pipefail.
  find "$TMP_TABLE_DIR" -name '*.txt' -exec cat {} + | sort > "$TABLE_LIST_TMP"
  rm -rf "$TMP_TABLE_DIR"
  rm -f "$DATASETS_FILE"
  mv "$TABLE_LIST_TMP" "$TABLE_LIST_FILE"
  log "Total tables discovered: $(wc -l < "$TABLE_LIST_FILE")"
else
  log "Reusing existing table list: $TABLE_LIST_FILE ($(wc -l < "$TABLE_LIST_FILE") tables)"
fi
# -----------------------------------------------------------------------------
# DRY RUN — show table count, estimate sizes and egress cost, then exit
# -----------------------------------------------------------------------------
if $DRY_RUN; then
  TOTAL=$(wc -l < "$TABLE_LIST_FILE")
  log "[DRY RUN] $TOTAL tables found. No exports will run."
  log "[DRY RUN] Estimating total size via bq show in parallel (this may take a while)..."
  # Print one table's numBytes; any failure yields 0 so the sum never aborts.
  get_table_bytes() {
    local table="$1"
    local source="$2"
    local dataset="${table%%.*}"
    local table_id="${table#*.}"
    bq show --format=json "${source}:${dataset}.${table_id}" 2>/dev/null \
      | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('numBytes','0'))" 2>/dev/null \
      || echo 0
  }
  export -f get_table_bytes
  TOTAL_BYTES=$(parallel --jobs 16 get_table_bytes {} "$SOURCE_PROJECT" < "$TABLE_LIST_FILE" \
    | awk '{s+=$1} END{print s+0}')
  # awk instead of bc: bc is neither in the STEP 0 dependency check nor
  # installed by the --gcloud-run VM setup, so bc would crash the dry run there.
  TOTAL_GB=$(awk -v b="$TOTAL_BYTES" 'BEGIN{printf "%.2f", b / 1073741824}')
  # Parquet+zstd typically compresses structured data 5-10x vs BigQuery's raw numBytes
  COMPRESSED_LOW=$(awk -v g="$TOTAL_GB" 'BEGIN{printf "%.2f", g / 10}')
  COMPRESSED_HIGH=$(awk -v g="$TOTAL_GB" 'BEGIN{printf "%.2f", g / 5}')
  EGRESS_LOW=$(awk -v g="$COMPRESSED_LOW" 'BEGIN{printf "%.2f", g * 0.08}')
  EGRESS_HIGH=$(awk -v g="$COMPRESSED_HIGH" 'BEGIN{printf "%.2f", g * 0.12}')
  log "[DRY RUN] BigQuery raw size (uncompressed): ~${TOTAL_GB} GB"
  log "[DRY RUN] Estimated Parquet+zstd size: ~${COMPRESSED_LOW}-${COMPRESSED_HIGH} GB"
  log "[DRY RUN] Estimated GCS→Hetzner egress cost: USD ${EGRESS_LOW}-${EGRESS_HIGH}"
  log "[DRY RUN] Done. Remove --dry-run to start the actual export."
  exit 0
fi
# -----------------------------------------------------------------------------
# COST WARNING — confirm before starting export
# -----------------------------------------------------------------------------
printf '\n'
printf '============================================================\n'
printf ' COST WARNING\n'
printf ' Transferring data from GCS to Hetzner costs ~$0.08-0.12/GB\n'
printf ' in internet egress fees charged to: %s\n' "$YOUR_PROJECT"
printf ' Run with --dry-run first to estimate the total size.\n'
printf '============================================================\n'
printf '\n'
read -rp "Press ENTER to start the export, or Ctrl+C to abort: "
# -----------------------------------------------------------------------------
# STEP 3 — Export function (called in parallel)
# -----------------------------------------------------------------------------

# Append one line to a shared ledger under an exclusive flock. The values are
# passed as positional args instead of being spliced into the bash -c string,
# so table names containing quotes or metacharacters cannot break (or inject
# into) the command.
_append_line() {
  local line="$1" file="$2"
  flock "$file" bash -c 'printf "%s\n" "$1" >> "$2"' _ "$line" "$file"
}
export -f _append_line

# Export one BigQuery table to GCS as Parquet+zstd.
#   $1 table        "dataset.table"
#   $2 bucket       destination GCS bucket name
#   $3 project      billing project for the extract job
#   $4 source       project that owns the table
#   $5 done_file    resume ledger of completed tables
#   $6 failed_file  ledger of failed/skipped tables
#   $7 log_file     shared log file
# Retries up to 3 times with linear backoff; known-permanent errors
# (unsupported column types, access denied, missing table) are recorded once
# and never retried.
export_table() {
  local table="$1"
  local bucket="$2"
  local project="$3"
  local source="$4"
  local done_file="$5"
  local failed_file="$6"
  local log_file="$7"

  # Resume support: skip tables already recorded as done.
  if grep -qxF "$table" "$done_file" 2>/dev/null; then
    echo "[SKIP] $table (already exported)" >> "$log_file"
    return 0
  fi

  # Parameter expansion instead of two `cut` subshells per table.
  local dataset="${table%%.*}"
  local table_id="${table#*.}"
  local gcs_prefix="gs://$bucket/$dataset/$table_id"
  echo "[START] Exporting $source:$table → $gcs_prefix/*.parquet" >> "$log_file"

  local attempt=0
  local success=false
  local output exit_code
  while [[ $attempt -lt 3 ]]; do
    attempt=$((attempt + 1))
    output=$(bq extract \
      --project_id="$project" \
      --destination_format=PARQUET \
      --compression=ZSTD \
      --location=US \
      "${source}:${dataset}.${table_id}" \
      "${gcs_prefix}/*.parquet" \
      2>&1)
    exit_code=$?
    echo "$output" >> "$log_file"
    if [[ $exit_code -eq 0 ]]; then
      success=true
      break
    fi
    # Detect permanently incompatible types — no point retrying
    if echo "$output" | grep -qi "not supported\|unsupported type\|GEOGRAPHY\|JSON type"; then
      echo "[SKIP_INCOMPATIBLE] $table — unsupported column type, skipping retries" >> "$log_file"
      _append_line "[INCOMPATIBLE] $table" "$failed_file"
      return 0
    fi
    # Detect access/permission errors — no point retrying
    if echo "$output" | grep -qi "access denied\|permission denied\|not authorized\|403\|does not exist\|Not found"; then
      echo "[SKIP_ACCESS] $table — access denied or not found, skipping retries" >> "$log_file"
      _append_line "[ACCESS_DENIED] $table" "$failed_file"
      return 0
    fi
    echo "[RETRY $attempt/3] $table" >> "$log_file"
    sleep $((attempt * 10))   # linear backoff: 10s, 20s
  done

  if $success; then
    # flock (inside _append_line) prevents interleaved writes from workers.
    _append_line "$table" "$done_file"
    echo "[DONE] $table" >> "$log_file"
  else
    _append_line "$table" "$failed_file"
    echo "[FAIL] $table after 3 attempts" >> "$log_file"
  fi
}
export -f export_table
# -----------------------------------------------------------------------------
# STEP 4 — Run exports in parallel
# -----------------------------------------------------------------------------
log "Starting parallel exports ($PARALLEL_EXPORTS workers)..."
log "Progress is logged to: $LOG_FILE"
log "Failed tables will be written to: $FAILED_FILE"
# Feed parallel only the tables not yet recorded in $DONE_FILE.
comm -23 \
  <(sort "$TABLE_LIST_FILE") \
  <(sort "$DONE_FILE") \
  | parallel \
    --jobs "$PARALLEL_EXPORTS" \
    --progress \
    --bar \
    export_table {} "$BUCKET_NAME" "$YOUR_PROJECT" "$SOURCE_PROJECT" \
    "$DONE_FILE" "$FAILED_FILE" "$LOG_FILE" \
  || true # failures are tracked in $FAILED_FILE; don't let parallel's exit code abort the script
TOTAL=$(wc -l < "$TABLE_LIST_FILE")
DONE=$(wc -l < "$DONE_FILE")
FAILED=$(wc -l < "$FAILED_FILE")
log "Export phase complete: $DONE/$TOTAL done, $FAILED failed"
if [[ $FAILED -gt 0 ]]; then
  log "Failed tables:"
  # redirect instead of `cat | tee` (useless use of cat)
  tee -a "$LOG_FILE" < "$FAILED_FILE"
  # NOTE: there is no --retry-failed flag in this script; re-running it retries
  # everything not yet listed in $DONE_FILE, which includes the failed tables.
  log "To retry failed tables, simply re-run: bash $0 (completed tables are skipped via $DONE_FILE)"
fi
# -----------------------------------------------------------------------------
# STEP 5 — Transfer GCS → Hetzner Object Storage via rclone (no local staging)
#
# rclone streams data directly between GCS and S3 through RAM only —
# no local disk required.
# -----------------------------------------------------------------------------
log "Starting transfer to Hetzner Object Storage ($HETZNER_S3_ENDPOINT)..."
# Count of top-level dataset prefixes in the bucket, used for % progress.
TRANSFER_TOTAL="$(gsutil ls "gs://$BUCKET_NAME/" | wc -l)"
export TRANSFER_TOTAL
# Per-dataset logs land here and are merged into $LOG_FILE afterwards.
TRANSFER_LOG_DIR="$(mktemp -d)"
# Copy one dataset's folder from GCS to Hetzner via rclone (streamed through
# RAM; nothing staged on local disk).
#   $1 dataset              $2 GCS bucket          $3 S3 bucket
#   $4 transfer concurrency $5 resume ledger file  $6 per-dataset log dir
#   $7 total dataset count (for the % progress line)
download_dataset() {
  local dataset="$1"
  local bucket="$2"
  local s3_bucket="$3"
  local s3_concurrency="$4"
  local done_transfers_file="$5"
  local log_dir="$6"
  local total="$7"
  local dataset_log="$log_dir/${dataset}.log"

  # Resume: skip datasets already transferred
  if grep -qxF "$dataset" "$done_transfers_file" 2>/dev/null; then
    echo "[SKIP_TRANSFER] $dataset (already transferred)" > "$dataset_log"
    return 0
  fi

  echo "[TRANSFER] gs://$bucket/$dataset/ → hz:$s3_bucket/$dataset/" > "$dataset_log"
  # Named remotes bd: (GCS) and hz: (Hetzner S3) are configured via RCLONE_CONFIG_* env vars
  if rclone copy \
    "bd:$bucket/$dataset/" \
    "hz:$s3_bucket/$dataset/" \
    --transfers "$s3_concurrency" \
    --s3-upload-concurrency "$s3_concurrency" \
    --progress \
    >> "$dataset_log" 2>&1; then
    # Quote-safe locked append: the dataset name travels as an argument
    # instead of being interpolated into the bash -c string.
    flock "$done_transfers_file" \
      bash -c 'printf "%s\n" "$1" >> "$2"' _ "$dataset" "$done_transfers_file"
    echo "[TRANSFERRED] $dataset" >> "$dataset_log"
    local done_count pct=0
    done_count=$(wc -l < "$done_transfers_file")
    # Guard the progress math so a zero total can never divide by zero.
    if (( total > 0 )); then
      pct=$(( done_count * 100 / total ))
    fi
    echo "[${done_count}/${total}] ${pct}% datasets transferidos"
  else
    echo "[TRANSFER FAIL] rclone failed for $dataset" >> "$dataset_log"
    return 1
  fi
}
export -f download_dataset
# Get list of exported datasets, skipping already-transferred ones
comm -23 \
  <(gsutil ls "gs://$BUCKET_NAME/" | sed 's|gs://[^/]*/||;s|/||' | sort -u) \
  <(sort "$DONE_TRANSFERS_FILE") \
  | parallel \
    --jobs "$PARALLEL_UPLOADS" \
    download_dataset {} "$BUCKET_NAME" "$HETZNER_S3_BUCKET" "$S3_CONCURRENCY" "$DONE_TRANSFERS_FILE" "$TRANSFER_LOG_DIR" "$TRANSFER_TOTAL" \
  || true # failures are tracked per-dataset; don't abort

# Merge per-dataset logs into the main log. A plain glob (not `ls`) is safe
# with unusual filenames, and glob expansion is already lexically sorted.
for f in "$TRANSFER_LOG_DIR"/*.log; do
  [[ -e "$f" ]] || continue   # unmatched glob stays literal — skip it
  cat "$f" >> "$LOG_FILE"
done
rm -rf "$TRANSFER_LOG_DIR"
log "Transfer complete."
# -----------------------------------------------------------------------------
# STEP 6 — Verify file counts on Hetzner Object Storage vs GCS
# -----------------------------------------------------------------------------
log "Verifying file counts..."
# grep -c prints the count directly (no wc needed); the `|| true` keeps
# set -e/pipefail from aborting the script when a listing has no parquet
# files at all (grep exits 1 on zero matches), so the report below still runs.
GCS_COUNT=$(gsutil ls -r "gs://$BUCKET_NAME/**" | grep -c '\.parquet$' || true)
S3_COUNT=$(rclone ls "hz:$HETZNER_S3_BUCKET" 2>/dev/null | grep -c '\.parquet$' || true)
log "GCS parquet files: $GCS_COUNT"
log "S3 parquet files: $S3_COUNT"
if [[ "$GCS_COUNT" -eq "$S3_COUNT" ]]; then
  log "File counts match. Transfer verified."
else
  log_err "Count mismatch! GCS=$GCS_COUNT S3=$S3_COUNT"
  log_err "Re-run the script to resume failed datasets or check $LOG_FILE for errors."
fi
# -----------------------------------------------------------------------------
# STEP 7 — Clean up GCS bucket to stop storage charges
# -----------------------------------------------------------------------------
read -rp "Delete GCS bucket gs://$BUCKET_NAME to stop storage charges? [y/N] " confirm
case "$confirm" in
  [Yy])
    log "Deleting bucket gs://$BUCKET_NAME ..."
    gsutil -m rm -r "gs://$BUCKET_NAME"
    gsutil rb "gs://$BUCKET_NAME"
    log "Bucket deleted. Storage charges stopped."
    ;;
  *)
    log "Bucket kept. Remember to delete it later: gsutil -m rm -r gs://$BUCKET_NAME && gsutil rb gs://$BUCKET_NAME"
    ;;
esac

log "All done! Data is at s3://$HETZNER_S3_BUCKET/ ($HETZNER_S3_ENDPOINT)"
log "Total exported: $DONE tables | Failed: $FAILED tables"
log "See $LOG_FILE for full details."