From 4572fcb28e89f901dbb6a04cb0f7dd9cde563151 Mon Sep 17 00:00:00 2001 From: rafapolo Date: Wed, 25 Mar 2026 10:13:37 +0100 Subject: [PATCH] add DuckDB explorer: creates views over S3 parquets for local querying --- prepara_gui.py | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 prepara_gui.py diff --git a/prepara_gui.py b/prepara_gui.py new file mode 100644 index 0000000..ad94638 --- /dev/null +++ b/prepara_gui.py @@ -0,0 +1,62 @@ +import os +import duckdb +import boto3 +from dotenv import load_dotenv + +load_dotenv() + +S3_ENDPOINT = os.environ["HETZNER_S3_ENDPOINT"] # https://hel1.your-objectstorage.com +S3_BUCKET = os.environ["HETZNER_S3_BUCKET"] # baseldosdados +ACCESS_KEY = os.environ["AWS_ACCESS_KEY_ID"] +SECRET_KEY = os.environ["AWS_SECRET_ACCESS_KEY"] + +# Strip protocol for DuckDB httpfs (expects bare hostname) +s3_host = S3_ENDPOINT.removeprefix("https://").removeprefix("http://") + +con = duckdb.connect('basedosdados.duckdb') + +con.execute("INSTALL httpfs; LOAD httpfs;") +con.execute(f""" + CREATE OR REPLACE PERSISTENT SECRET hetzner ( + TYPE S3, + KEY_ID '{ACCESS_KEY}', + SECRET '{SECRET_KEY}', + ENDPOINT '{s3_host}', + URL_STYLE 'path' + ); +""") + +# List all dataset/table prefixes in the bucket +s3 = boto3.client( + 's3', + endpoint_url=S3_ENDPOINT, + aws_access_key_id=ACCESS_KEY, + aws_secret_access_key=SECRET_KEY, +) +paginator = s3.get_paginator('list_objects_v2') + +datasets = {} +for page in paginator.paginate(Bucket=S3_BUCKET, Delimiter='/'): + for prefix in page.get('CommonPrefixes', []): + dataset = prefix['Prefix'].rstrip('/') + datasets[dataset] = [] + for page2 in paginator.paginate(Bucket=S3_BUCKET, + Prefix=dataset + '/', + Delimiter='/'): + for p in page2.get('CommonPrefixes', []): + table = p['Prefix'].rstrip('/').split('/')[-1] + datasets[dataset].append(table) + +# Create schemas and views +for dataset, tables in datasets.items(): + con.execute(f"CREATE SCHEMA IF NOT EXISTS {dataset}") + for table in tables: + path = f"s3://{S3_BUCKET}/{dataset}/{table}/*.parquet" + con.execute(f""" + CREATE OR REPLACE VIEW {dataset}.{table} AS + SELECT * FROM '{path}' + """) + print(f"✓ {dataset}.{table}") + +con.close() +print("Done! Open with: duckdb --ui basedosdados.duckdb")