add DuckDB explorer: creates views over S3 parquets for local querying
This commit is contained in:
62
prepara_gui.py
Normal file
62
prepara_gui.py
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
import os
|
||||||
|
import duckdb
|
||||||
|
import boto3
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
# Load settings from a .env file (python-dotenv) before reading os.environ.
load_dotenv()

# Required settings: indexing os.environ raises KeyError if one is missing.
S3_ENDPOINT = os.environ["HETZNER_S3_ENDPOINT"]  # https://hel1.your-objectstorage.com
S3_BUCKET = os.environ["HETZNER_S3_BUCKET"]  # baseldosdados
ACCESS_KEY = os.environ["AWS_ACCESS_KEY_ID"]
SECRET_KEY = os.environ["AWS_SECRET_ACCESS_KEY"]

# DuckDB httpfs expects a bare hostname, so drop a leading URL scheme.
# boto3 (used elsewhere in this script) still receives the full S3_ENDPOINT.
s3_host = S3_ENDPOINT
for scheme in ("https://", "http://"):
    s3_host = s3_host.removeprefix(scheme)
|
||||||
|
|
||||||
|
def _sql_literal(value):
    """Return *value* as a single-quoted SQL string literal.

    Embedded single quotes are doubled so that credentials or hostnames
    containing ``'`` cannot terminate the literal early.
    """
    return "'" + str(value).replace("'", "''") + "'"


# Local database file that will hold the schemas and views.
con = duckdb.connect('basedosdados.duckdb')

# httpfs provides the s3:// filesystem used by the views.
con.execute("INSTALL httpfs; LOAD httpfs;")

# The scheme is stripped from the endpoint handed to DuckDB, so state
# explicitly whether TLS should be used instead of relying on the default.
_use_ssl = "false" if S3_ENDPOINT.startswith("http://") else "true"

# PERSISTENT makes the secret available to later sessions, not only to
# this connection.
# NOTE(review): persistent secrets are written to disk by DuckDB; confirm
# that storing these credentials locally is acceptable.
con.execute(f"""
    CREATE OR REPLACE PERSISTENT SECRET hetzner (
        TYPE S3,
        KEY_ID {_sql_literal(ACCESS_KEY)},
        SECRET {_sql_literal(SECRET_KEY)},
        ENDPOINT {_sql_literal(s3_host)},
        URL_STYLE 'path',
        USE_SSL {_use_ssl}
    );
""")
|
||||||
|
|
||||||
|
# Discover the bucket layout: first-level prefixes are treated as datasets,
# second-level prefixes as tables.
s3 = boto3.client(
    's3',
    endpoint_url=S3_ENDPOINT,
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
)
paginator = s3.get_paginator('list_objects_v2')


def _child_prefixes(parent=None):
    """Yield each common prefix directly below *parent* in the bucket.

    With ``parent=None`` the bucket root is listed (no ``Prefix`` argument
    is sent). Prefixes are yielded exactly as returned by S3, including the
    trailing ``/`` delimiter.
    """
    list_args = {'Bucket': S3_BUCKET, 'Delimiter': '/'}
    if parent is not None:
        list_args['Prefix'] = parent
    for listing in paginator.paginate(**list_args):
        for entry in listing.get('CommonPrefixes', []):
            yield entry['Prefix']


# Maps dataset name -> list of table names found beneath it.
datasets = {}
for dataset_prefix in _child_prefixes():
    dataset = dataset_prefix.rstrip('/')
    datasets[dataset] = [
        # Keep only the last path component as the table name.
        table_prefix.rstrip('/').split('/')[-1]
        for table_prefix in _child_prefixes(dataset + '/')
    ]
|
||||||
|
|
||||||
|
def _quote_ident(name):
    """Return *name* as a double-quoted SQL identifier.

    Embedded double quotes are doubled. Quoting lets names taken from S3
    prefixes (which may contain hyphens, start with a digit, or collide
    with reserved words) be used as schema and view names.
    """
    return '"' + str(name).replace('"', '""') + '"'


# Create one schema per dataset and one view per table. Each view selects
# from the parquet files directly under the table's prefix in the bucket.
try:
    for dataset, tables in datasets.items():
        schema = _quote_ident(dataset)
        con.execute(f"CREATE SCHEMA IF NOT EXISTS {schema}")
        for table in tables:
            path = f"s3://{S3_BUCKET}/{dataset}/{table}/*.parquet"
            # The path is embedded as a SQL string literal; escape quotes.
            path_literal = path.replace("'", "''")
            con.execute(f"""
                CREATE OR REPLACE VIEW {schema}.{_quote_ident(table)} AS
                SELECT * FROM '{path_literal}'
            """)
            print(f"✓ {dataset}.{table}")
finally:
    # Close the database file even if a statement above fails.
    con.close()

print("Done! Open with: duckdb --ui basedosdados.duckdb")
|
||||||
Reference in New Issue
Block a user