feat: add --sync to export BQ tables directly to S3 without GCS intermediary

2026-03-29 17:39:13 +02:00
parent 43e5ae6723
commit 36acd1320c
2 changed files with 606 additions and 0 deletions

roda.sh (63 changed lines)

@@ -31,10 +31,13 @@ fi
DRY_RUN=false
GCLOUD_RUN=false
SYNC_RUN=false
if [[ "${1:-}" == "--dry-run" ]]; then if [[ "${1:-}" == "--dry-run" ]]; then
DRY_RUN=true DRY_RUN=true
elif [[ "${1:-}" == "--gcloud-run" ]]; then elif [[ "${1:-}" == "--gcloud-run" ]]; then
GCLOUD_RUN=true GCLOUD_RUN=true
elif [[ "${1:-}" == "--sync" ]]; then
SYNC_RUN=true
fi
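# Example invocations of the new flag (a sketch; note that a --dry-run placed
# after --sync is forwarded to sync_bq_to_local.py rather than handled here):
#   ./roda.sh --sync             # BigQuery → S3 direct sync
#   ./roda.sh --sync --dry-run   # list sync targets only, nothing transferred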
# -----------------------------------------------------------------------------
@@ -74,6 +77,18 @@ if [[ -z "${AWS_ACCESS_KEY_ID:-}" || -z "${AWS_SECRET_ACCESS_KEY:-}" ]]; then
    exit 1
fi
# Validate GCP project (needed for --sync)
if [[ -z "${GCP_PROJECT:-}" ]]; then
if $SYNC_RUN; then
if [[ -z "${YOUR_PROJECT:-}" ]]; then
log_err "GCP_PROJECT não encontrado no .env. Adicione GCP_PROJECT ou YOUR_PROJECT."
exit 1
fi
log "GCP_PROJECT not set, using YOUR_PROJECT: $YOUR_PROJECT"
export GCP_PROJECT="$YOUR_PROJECT"
fi
fi
# Configure rclone remotes via env vars — no rclone.conf or inline credentials needed.
# GCS remote (bd:) uses Application Default Credentials from gcloud auth application-default login.
# Hetzner S3 remote (hz:) uses the credentials from .env, kept out of the process command line.
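# For reference, a sketch of what that env-var remote configuration can look like,
# using rclone's RCLONE_CONFIG_<REMOTE>_<OPTION> convention for the bd:/hz: remotes
# (the exact option set here is an assumption, not copied from this script):
#   export RCLONE_CONFIG_BD_TYPE="google cloud storage"
#   export RCLONE_CONFIG_BD_ENV_AUTH="true"
#   export RCLONE_CONFIG_HZ_TYPE="s3"
#   export RCLONE_CONFIG_HZ_PROVIDER="Other"
#   export RCLONE_CONFIG_HZ_ENDPOINT="$HETZNER_S3_ENDPOINT"
#   export RCLONE_CONFIG_HZ_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID"
#   export RCLONE_CONFIG_HZ_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY"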
@@ -182,6 +197,54 @@ REMOTE_SETUP
    exit 0
fi
# =============================================================================
# SYNC — BigQuery → S3 direct (no GCS intermediary)
# =============================================================================
if $SYNC_RUN; then
log "=============================="
log " SYNC MODE — BigQuery → S3"
log "=============================="
# Check dependencies
for cmd in python3; do
if ! command -v "$cmd" &>/dev/null; then
log_err "'$cmd' not found."
exit 1
fi
done
    # Check Python dependencies by import name (pip package names differ for some)
    PYTHON_CHECKS="google.cloud.bigquery:boto3:pandas:pyarrow"
    for module in $(echo "$PYTHON_CHECKS" | tr ':' '\n'); do
        if ! python3 -c "import ${module}" 2>/dev/null; then
            log_err "Missing Python module: ${module}. Run: pip install google-cloud-bigquery boto3 pandas pyarrow"
            exit 1
        fi
    done
# Set GCP_PROJECT for the Python script
export GCP_PROJECT="${GCP_PROJECT:-${YOUR_PROJECT}}"
log "GCP project: $GCP_PROJECT"
log "S3 bucket: $HETZNER_S3_BUCKET"
log "S3 endpoint: $HETZNER_S3_ENDPOINT"
log ""
if $DRY_RUN; then
log "DRY RUN — listing tables only, no data will be transferred"
fi
# Run the sync script, filtering out --sync (roda.sh flag)
SYNC_ARGS=()
for arg in "$@"; do
[[ "$arg" != "--sync" ]] && SYNC_ARGS+=("$arg")
done
python3 sync_bq_to_local.py "${SYNC_ARGS[@]+"${SYNC_ARGS[@]}"}"
exit $?
fi
# -----------------------------------------------------------------------------
# STEP 1 — Create GCS bucket in US region (same as basedosdados)
# -----------------------------------------------------------------------------

sync_bq_to_local.py (new executable file, 543 lines)

@@ -0,0 +1,543 @@
#!/usr/bin/env python3
"""
sync_bq_to_local.py
Syncs missing tables from BigQuery (basedosdados project) to Hetzner S3,
then registers them as DuckDB views.
Usage:
python3 sync_bq_to_local.py # full sync
python3 sync_bq_to_local.py --dry-run # list missing tables only
python3 sync_bq_to_local.py --resume # resume from last run
Prerequisites:
gcloud auth application-default login
GCP project with billing enabled (free tier: 1 TB/month)
Environment (.env):
GCP_PROJECT - GCP project ID for billing
HETZNER_S3_BUCKET - S3 bucket name
HETZNER_S3_ENDPOINT - S3 endpoint URL
AWS_ACCESS_KEY_ID - S3 access key
AWS_SECRET_ACCESS_KEY - S3 secret key
"""
import os
import sys
import json
import argparse
import logging
import subprocess
from datetime import datetime
from pathlib import Path
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
import boto3
from botocore.config import Config as BotoConfig
from google.cloud import bigquery
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
LOG_FILE = f"sync_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(message)s",
handlers=[
logging.FileHandler(LOG_FILE),
logging.StreamHandler(sys.stdout),
],
)
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
SOURCE_PROJECT = "basedosdados"
MISSING_TABLES_FILE = "tasks/datasets_to_scrap.md"
DONE_FILE = "done_sync.txt"
FAILED_FILE = "failed_sync.txt"
DATA_DIR = "data"
PARQUET_DIR = "parquet"
MAX_RETRIES = 3
BATCH_SIZE = 1 # export one table at a time to manage memory
WORKERS = 4 # parallel uploads
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def load_env():
"""Load required environment variables."""
from dotenv import load_dotenv
load_dotenv()
required = [
"GCP_PROJECT",
"HETZNER_S3_BUCKET",
"HETZNER_S3_ENDPOINT",
"AWS_ACCESS_KEY_ID",
"AWS_SECRET_ACCESS_KEY",
]
missing = [v for v in required if not os.environ.get(v)]
if missing:
log.error("Missing env vars: %s", missing)
sys.exit(1)
return {v: os.environ[v] for v in required}
def get_s3_client(env):
"""Create boto3 S3 client configured for Hetzner."""
return boto3.client(
"s3",
endpoint_url=env["HETZNER_S3_ENDPOINT"],
aws_access_key_id=env["AWS_ACCESS_KEY_ID"],
aws_secret_access_key=env["AWS_SECRET_ACCESS_KEY"],
config=BotoConfig(s3={"addressing_style": "path"}),
)
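# Note: path-style addressing (endpoint/<bucket>/key) is set above because
# S3-compatible object stores such as Hetzner's are commonly addressed that way;
# whether virtual-hosted style would also work with this endpoint is untested
# here and left as an assumption.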
def get_bq_client():
"""Create BigQuery client using Application Default Credentials."""
try:
os.environ["GOOGLE_CLOUD_PROJECT"] = os.environ.get("GCP_PROJECT", "")
os.environ["GCLOUD_PROJECT"] = os.environ.get("GCP_PROJECT", "")
client = bigquery.Client(project=os.environ.get("GCP_PROJECT", ""))
# Test the connection
list(client.list_datasets(max_results=1))
return client
except Exception as e:
log.error("BigQuery auth failed: %s", e)
log.error("")
log.error("Run these commands to authenticate:")
log.error(" gcloud auth login")
log.error(" gcloud auth application-default login")
log.error(" gcloud config set project %s", os.environ.get("GCP_PROJECT", ""))
log.error("")
log.error("The free tier (1 TB/month) is sufficient — no credit card needed.")
sys.exit(1)
def list_bq_tables(bq_client):
"""List all tables in the basedosdados BigQuery project."""
log.info("Discovering tables in BigQuery project: %s", SOURCE_PROJECT)
tables = {}
try:
datasets = list(bq_client.list_datasets())
log.info("Found %d datasets", len(datasets))
except Exception as e:
log.error("Failed to list datasets: %s", e)
sys.exit(1)
for dataset in datasets:
try:
tables_list = list(
bq_client.list_tables(
f"{SOURCE_PROJECT}.{dataset.dataset_id}",
max_results=10000,
)
)
for t in tables_list:
tables[f"{dataset.dataset_id}.{t.table_id}"] = {
"dataset": dataset.dataset_id,
"table": t.table_id,
"full_id": f"{SOURCE_PROJECT}.{dataset.dataset_id}.{t.table_id}",
"schema": [f.name for f in t.schema] if t.schema else [],
"num_bytes": t.num_bytes,
"num_rows": t.num_rows,
}
except Exception as e:
log.warning("Failed to list tables in dataset %s: %s", dataset.dataset_id, e)
log.info("Total BigQuery tables discovered: %d", len(tables))
return tables
def list_s3_tables(s3_client, bucket):
"""List datasets/tables already exported to S3."""
log.info("Discovering tables already in S3 bucket: %s", bucket)
table_files = defaultdict(lambda: defaultdict(list))
try:
paginator = s3_client.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket=bucket):
for obj in page.get("Contents", []):
key = obj["Key"]
if not key.endswith(".parquet"):
continue
parts = key.split("/")
if len(parts) >= 3:
dataset, table = parts[0], parts[1]
table_files[dataset][table].append(key)
except Exception as e:
log.warning("S3 listing error (may be empty bucket): %s", e)
tables = {}
for dataset, t_dict in table_files.items():
for table, files in t_dict.items():
tables[f"{dataset}.{table}"] = files
log.info("Total S3 tables discovered: %d", len(tables))
return tables
def parse_missing_tables_from_md(filepath):
"""Parse the missing tables from tasks/datasets_to_scrap.md.
Returns a dict mapping 'dataset.table' -> description.
Falls back to None (use all non-S3 tables) if file not found.
"""
if not os.path.exists(filepath):
log.warning("Missing file %s, using all non-S3 tables", filepath)
return None
log.info("Parsing missing tables from %s", filepath)
with open(filepath) as f:
content = f.read()
missing = {}
lines = content.split("\n")
i = 0
def next_nonempty(lines, i):
while i < len(lines) and not lines[i].strip():
i += 1
return i
while i < len(lines):
line = lines[i].strip()
# Find the Basedosdados.org section
if "Basedosdados.org" in line and "Not in basedosdados.duckdb" in line:
log.info("Found Basedosdados.org section at line %d", i + 1)
i += 1
break
i += 1
# Now parse table entries
while i < len(lines):
line = lines[i].strip()
# End of section only on top-level ## headers, not ### subsections
if line.startswith("## "):
break
# Skip separators and empty lines
if not line or line.startswith("---") or "|---" in line:
i += 1
continue
# Find rows with backtick-wrapped dataset names (e.g. | `br_abrinq_oca` | ...)
if "`" in line and "|" in line:
# Split by pipe, strip whitespace and backticks
parts = [p.strip().strip("`").strip() for p in line.split("|")]
# Filter empty parts
parts = [p for p in parts if p]
if len(parts) >= 2:
dataset_raw = parts[0]
# Check if it looks like a dataset name (br_*, eu_*, mundo_*, etc.)
is_dataset = any(
dataset_raw.startswith(prefix)
for prefix in ("br_", "eu_", "mundo_", "nl_", "world_")
)
if is_dataset:
# parts[1] contains the missing table names (comma-separated)
tables_raw = parts[1]
for tbl in tables_raw.split(","):
tbl = tbl.strip()
# Clean up: remove parenthetical notes, trailing text
if "(" in tbl:
tbl = tbl.split("(")[0].strip()
if tbl and not tbl.startswith("-"):
missing[f"{dataset_raw}.{tbl}"] = f"from {filepath}"
i += 1
log.info("Parsed %d missing table references from MD", len(missing))
return missing if missing else None
def compute_missing_tables(bq_tables, s3_tables, md_missing):
"""Compute which tables need to be synced."""
if md_missing is None:
log.info("No MD file, computing diff: BQ - S3")
return [
(table_id, info)
for table_id, info in bq_tables.items()
if table_id not in s3_tables
]
log.info("Computing sync targets: MD missing tables not in S3")
targets = []
for key, info in bq_tables.items():
if key in s3_tables:
continue
if key in md_missing:
targets.append((key, info))
else:
# Table not in S3 but not in MD missing list
# Check if its dataset is partially covered
dataset = info["dataset"]
table = info["table"]
            # Include the table anyway if its dataset is not covered by the MD
            # missing list at all; the MD filter only applies to datasets it names
            dataset_in_md = any(
                k.startswith(f"{dataset}.") and k in md_missing
                for k in bq_tables
            )
if not dataset_in_md:
targets.append((key, info))
return targets
def estimate_size_mb(num_bytes):
"""Estimate size in MB."""
if num_bytes is None:
return "?"
return f"{num_bytes / 1_048_576:.1f}"
# ---------------------------------------------------------------------------
# Export logic
# ---------------------------------------------------------------------------
def sync_table(args, table_id, info, dry_run=False):
"""Sync a single table: BQ → parquet → S3 → DuckDB view."""
bq_client, s3_client, bucket = args
dataset = info["dataset"]
table = info["table"]
full_id = info["full_id"]
s3_key_prefix = f"{dataset}/{table}"
if dry_run:
size_mb = estimate_size_mb(info.get("num_bytes"))
return True, f"[DRY] {dataset}.{table} (~{size_mb} MB)"
# Step 1: Query from BigQuery
log.info("Querying %s from BigQuery", full_id)
query = f"SELECT * FROM `{full_id}`"
try:
query_job = bq_client.query(query, location="US")
df = query_job.to_dataframe()
except Exception as e:
return False, f"BQ query failed for {table_id}: {e}"
if df.empty:
return True, f"[SKIP] {table_id} — empty table"
if df.shape[0] > 10_000_000:
log.warning("Table %s has %d rows — may be slow/memory-intensive", table_id, df.shape[0])
# Step 2: Write to parquet in memory, then upload
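    # The full table is materialized twice below (the DataFrame above plus the
    # in-memory parquet buffer), which is what the 10M-row warning is flagging.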
import io
import pyarrow as pa
import pyarrow.parquet as pq
buffer = io.BytesIO()
table_pa = pa.Table.from_pandas(df)
# Write with zstd compression
writer = pq.ParquetWriter(
buffer,
table_pa.schema,
compression="zstd",
use_dictionary=True,
)
writer.write_table(table_pa)
writer.close()
buffer.seek(0)
s3_key = f"{s3_key_prefix}/{table}.parquet"
log.info("Uploading %s → s3://%s/%s (%s, %d rows)",
table_id, bucket, s3_key,
f"{buffer.getbuffer().nbytes / 1_048_576:.1f} MB",
df.shape[0])
try:
s3_client.upload_fileobj(
buffer,
bucket,
s3_key,
ExtraArgs={"ContentType": "application/octet-stream"},
)
except Exception as e:
return False, f"S3 upload failed for {table_id}: {e}"
log.info("[DONE] %s uploaded to s3://%s/%s", table_id, bucket, s3_key)
return True, f"[DONE] {table_id}"
def update_duckdb_view(env, table_id, info):
"""Register a new table as a DuckDB view over S3 parquet."""
import duckdb
dataset = info["dataset"]
table = info["table"]
bucket = env["HETZNER_S3_BUCKET"]
endpoint = env["HETZNER_S3_ENDPOINT"].removeprefix("https://").removeprefix("http://")
access_key = env["AWS_ACCESS_KEY_ID"]
secret_key = env["AWS_SECRET_ACCESS_KEY"]
# S3 path
s3_path = f"s3://{bucket}/{dataset}/{table}/{table}.parquet"
try:
con = duckdb.connect("basedosdados.duckdb", read_only=False)
con.execute("INSTALL httpfs; LOAD httpfs;")
con.execute(f"SET s3_endpoint='{endpoint}';")
con.execute(f"SET s3_access_key_id='{access_key}';")
con.execute(f"SET s3_secret_access_key='{secret_key}';")
con.execute(f"SET s3_url_style='path';")
con.execute(f"CREATE SCHEMA IF NOT EXISTS {dataset}")
con.execute(f"""
CREATE OR REPLACE VIEW {dataset}.{table} AS
SELECT * FROM read_parquet('{s3_path}', hive_partitioning=true, union_by_name=true)
""")
con.close()
log.info("[DUCKDB] View created: %s.%s", dataset, table)
return True, None
except Exception as e:
log.error("[DUCKDB] Failed to create view %s.%s: %s", dataset, table, e)
return False, str(e)
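# Note: the SET s3_* options above are per-connection and are not persisted in
# basedosdados.duckdb, so any later session that queries these views must LOAD
# httpfs and apply the same endpoint/credential settings before selecting from them.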
def run_sync(targets, args, env, dry_run=False, resume=False):
"""Run the sync for all target tables."""
s3_client = get_s3_client(env)
bq_client = get_bq_client()
# Load done/failed tracking
done_set = set()
if resume:
if os.path.exists(DONE_FILE):
with open(DONE_FILE) as f:
done_set = {l.strip() for l in f if l.strip()}
log.info("Resuming: %d tables already done", len(done_set))
failed_count = 0
done_count = 0
# Filter out already-done tables
targets = [(tid, info) for tid, info in targets if tid not in done_set]
if not targets:
log.info("No tables to sync.")
return 0, 0
log.info("Syncing %d tables...", len(targets))
for i, (table_id, info) in enumerate(targets, 1):
log.info("--- [%d/%d] Syncing %s ---", i, len(targets), table_id)
# Sync BQ → S3
ok, msg = sync_table(
(bq_client, s3_client, env["HETZNER_S3_BUCKET"]),
table_id,
info,
dry_run=dry_run,
)
log.info(msg)
if dry_run:
continue
if not ok:
with open(FAILED_FILE, "a") as f:
f.write(f"{table_id}\t{msg}\n")
failed_count += 1
continue
if "empty" in msg.lower():
continue
# Update DuckDB view
ok, err = update_duckdb_view(env, table_id, info)
if not ok:
with open(FAILED_FILE, "a") as f:
f.write(f"{table_id}\tDUCKDB: {err}\n")
# Mark done
with open(DONE_FILE, "a") as f:
f.write(f"{table_id}\n")
done_count += 1
return done_count, failed_count
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
parser = argparse.ArgumentParser(description="Sync missing BQ tables to S3")
parser.add_argument("--dry-run", action="store_true", help="List tables without syncing")
parser.add_argument("--resume", action="store_true", help="Resume from last run")
args = parser.parse_args()
env = load_env()
dry_run = args.dry_run
if dry_run:
log.info("=== DRY RUN MODE ===")
# Step 1: List BigQuery tables
bq_client = get_bq_client()
bq_tables = list_bq_tables(bq_client)
# Step 2: List S3 tables
s3_client = get_s3_client(env)
s3_tables = list_s3_tables(s3_client, env["HETZNER_S3_BUCKET"])
# Step 3: Parse missing tables from MD
md_missing = parse_missing_tables_from_md(MISSING_TABLES_FILE)
# Step 4: Compute targets
targets = compute_missing_tables(bq_tables, s3_tables, md_missing)
if not targets:
log.info("No tables to sync.")
return
log.info("")
log.info("============================================")
log.info(" Tables to sync: %d", len(targets))
log.info("============================================")
for i, (table_id, info) in enumerate(targets, 1):
size_mb = estimate_size_mb(info.get("num_bytes"))
        md_note = md_missing.get(table_id, "") if md_missing else ""
log.info(" [%d] %-50s %6s MB %s", i, table_id, size_mb, md_note)
log.info("")
if dry_run:
total_bytes = sum(info.get("num_bytes", 0) or 0 for _, info in targets)
total_gb = total_bytes / 1_073_741_824
log.info("Total estimated size: %.2f GB (BigQuery compressed bytes)", total_gb)
log.info("Run without --dry-run to start syncing.")
return
# Step 5: Run sync
log.info("Starting sync...")
done_count, failed_count = run_sync(targets, None, env, dry_run=False, resume=args.resume)
log.info("")
log.info("============================================")
log.info(" Sync complete!")
log.info(" Done: %d tables", done_count)
log.info(" Failed: %d tables", failed_count)
log.info(" Log: %s", LOG_FILE)
log.info("============================================")
if failed_count > 0:
log.info("Failed tables: see %s", FAILED_FILE)
sys.exit(1)
if __name__ == "__main__":
main()