- Dockerfile: Debian slim; installs the DuckDB CLI, Python dependencies, and Caddy
- start.sh: runs prepara_db.py, then starts Caddy (basic auth) and the DuckDB UI
- Caddyfile: updated for the container (no TLS, port 8080; Haloy handles HTTPS)
- haloy.yml: deploys to db.xn--2dk.xyz on port 8080
- requirements.txt: duckdb, boto3, python-dotenv
- prepara_db.py, open_gui.sh, duckdb-ui.service: add previously untracked files
- prepara_gui.py: removed (replaced by prepara_db.py)
64 lines · 2.2 KiB · Python
import os
|
|
import duckdb
|
|
import boto3
|
|
from dotenv import load_dotenv
|
|
|
|
# Pull configuration from a local .env file into the environment (if present).
load_dotenv()

# Hetzner object-storage settings — all four variables are required and a
# missing one fails fast with a KeyError.
BUCKET = os.environ['HETZNER_S3_BUCKET']
ENDPOINT_URL = os.environ['HETZNER_S3_ENDPOINT']
ACCESS_KEY = os.environ['AWS_ACCESS_KEY_ID']
SECRET_KEY = os.environ['AWS_SECRET_ACCESS_KEY']

# DuckDB's httpfs extension wants the endpoint host without a URL scheme.
s3_endpoint = ENDPOINT_URL
for _scheme in ('https://', 'http://'):
    s3_endpoint = s3_endpoint.removeprefix(_scheme)
|
|
|
|
# Enumerate every top-level prefix (dataset) and its second-level prefixes
# (tables) in the bucket, using '/' as the listing delimiter.
s3 = boto3.client(
    's3',
    endpoint_url=ENDPOINT_URL,
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
)
paginator = s3.get_paginator('list_objects_v2')

# Maps dataset name -> list of table names found under it.
datasets = {}
for page in paginator.paginate(Bucket=BUCKET, Delimiter='/'):
    for common in page.get('CommonPrefixes', []):
        dataset = common['Prefix'].rstrip('/')
        tables = datasets[dataset] = []
        for subpage in paginator.paginate(Bucket=BUCKET,
                                          Prefix=dataset + '/',
                                          Delimiter='/'):
            # The last path component of each sub-prefix is the table name.
            tables.extend(
                entry['Prefix'].rstrip('/').split('/')[-1]
                for entry in subpage.get('CommonPrefixes', [])
            )
|
|
|
|
# Create/open the local DuckDB database and configure S3 access for httpfs.
con = duckdb.connect('basedosdados3.duckdb')
con.execute("INSTALL httpfs; LOAD httpfs;")


def _sql_str(value: str) -> str:
    """Escape a value for embedding inside a single-quoted SQL literal.

    Without this, a credential or endpoint containing a single quote would
    break the SET statements (and could inject arbitrary SQL).
    """
    return value.replace("'", "''")


con.execute(f"""
    SET s3_endpoint='{_sql_str(s3_endpoint)}';
    SET s3_access_key_id='{_sql_str(ACCESS_KEY)}';
    SET s3_secret_access_key='{_sql_str(SECRET_KEY)}';
    SET s3_url_style='path';
""")
|
|
|
|
def _ident(name: str) -> str:
    """Quote an SQL identifier (double quotes, with embedded quotes doubled).

    Dataset/table names come straight from S3 prefixes; without quoting,
    names containing hyphens, dots, or SQL keywords break the DDL below.
    """
    return '"' + name.replace('"', '""') + '"'


# Create one schema per dataset and one view per table, reading the
# hive-partitioned Parquet files directly from S3.
for dataset, tables in datasets.items():
    con.execute(f"CREATE SCHEMA IF NOT EXISTS {_ident(dataset)}")
    for table in tables:
        # Glob over every Parquet file under the table's prefix.
        path = f"s3://{BUCKET}/{dataset}/{table}/*.parquet"
        try:
            con.execute(f"""
                CREATE OR REPLACE VIEW {_ident(dataset)}.{_ident(table)} AS
                SELECT * FROM read_parquet('{path.replace("'", "''")}', hive_partitioning=true)
            """)
            print(f"✓ {dataset}.{table}")
        except Exception as e:
            # GeoParquet/geometry tables need the spatial extension; skip
            # them instead of aborting the whole load. Anything else is a
            # real failure and is re-raised.
            if 'Geoparquet' in str(e) or 'geometria' in str(e) or 'geometry' in str(e).lower():
                print(f" skip (geoparquet) {dataset}.{table}")
            else:
                raise

con.close()
print("Done! Open with: duckdb --ui basedosdados3.duckdb")