- swap DuckDB UI for ttyd web terminal (--writable, -readonly db) - add POST /query endpoint with X-Password auth for curl-based SQL execution - fix UTF-8 rendering: set LANG/LC_ALL=C.UTF-8 in container - pass BUCKET_REGION env var for correct S3 signing region - simplify start.sh: drop Xvfb, views.duckdb generation, blocking duckdb -ui - add less, ncurses-bin to Dockerfile for proper pager/terminal support - update Caddyfile: single route to ttyd with flush_interval -1 for websocket - update README to reflect current architecture and document /query usage - remove duckdb-ui.service, schemas.json, file_tree.md (generated artifacts)
70 lines · 2.4 KiB · Python
import os
|
|
import duckdb
|
|
import boto3
|
|
from collections import defaultdict
|
|
from dotenv import load_dotenv
|
|
|
|
# Load .env into the process environment (real env vars take precedence).
load_dotenv()

# Required connection settings; a missing variable raises KeyError right
# away, which is the desired fail-fast for a build script.
BUCKET = os.environ['HETZNER_S3_BUCKET']
ENDPOINT_URL = os.environ['HETZNER_S3_ENDPOINT']
ACCESS_KEY = os.environ['AWS_ACCESS_KEY_ID']
SECRET_KEY = os.environ['AWS_SECRET_ACCESS_KEY']

# DuckDB's httpfs wants a bare host (no scheme); boto3 keeps the full URL.
s3_endpoint = ENDPOINT_URL
for _scheme in ('https://', 'http://'):
    s3_endpoint = s3_endpoint.removeprefix(_scheme)
# One-shot client used only for the initial bucket listing; later S3 reads
# are done by DuckDB itself through the httpfs extension.
s3 = boto3.client(
    's3',
    endpoint_url=ENDPOINT_URL,
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
)

# Paginator transparently follows continuation tokens past the 1000-key page limit.
paginator = s3.get_paginator('list_objects_v2')
# Walk the whole bucket once and group parquet objects by their first two
# path segments: {dataset: {table: [s3://bucket/key, ...]}}.  Non-parquet
# objects and keys shallower than dataset/table/file are skipped.
table_files = defaultdict(lambda: defaultdict(list))

all_keys = (
    obj['Key']
    for page in paginator.paginate(Bucket=BUCKET)
    for obj in page.get('Contents', [])
)
for key in all_keys:
    parts = key.split('/')
    if key.endswith('.parquet') and len(parts) >= 3:
        dataset, table = parts[0], parts[1]
        table_files[dataset][table].append(f"s3://{BUCKET}/{key}")
def _sql_str(value: str) -> str:
    """Render *value* as a single-quoted SQL string literal.

    DuckDB's SET statement does not accept bound parameters, so the values
    must be interpolated; doubling embedded single quotes keeps a secret
    such as "ab'cd" from breaking (or injecting into) the statement.
    """
    return "'" + value.replace("'", "''") + "'"


# Create the local DuckDB database and point its httpfs extension at the
# Hetzner S3 endpoint.
con = duckdb.connect('basedosdados.duckdb')
con.execute("INSTALL httpfs; LOAD httpfs;")
con.execute(f"""
    SET s3_endpoint={_sql_str(s3_endpoint)};
    SET s3_access_key_id={_sql_str(ACCESS_KEY)};
    SET s3_secret_access_key={_sql_str(SECRET_KEY)};
    SET s3_url_style='path';
    SET enable_object_cache=true;
    SET threads=4;
    SET memory_limit='6GB';
    SET preserve_insertion_order=false;
    SET http_keep_alive=true;
    SET http_retries=3;
""")
def _ident(name: str) -> str:
    """Quote *name* as a DuckDB identifier, doubling embedded double quotes.

    Dataset/table names come straight from S3 key segments; unquoted they
    would produce invalid SQL for names containing '-' or '.', and are an
    injection vector for arbitrary bucket content.
    """
    return '"' + name.replace('"', '""') + '"'


# Create one schema per dataset and one view per table.  Each view reads an
# explicit sorted file list; hive_partitioning/union_by_name let files with
# partition columns or differing schemas be unified.
for dataset, tables in table_files.items():
    con.execute(f"CREATE SCHEMA IF NOT EXISTS {_ident(dataset)}")
    for table, files in tables.items():
        # Each path becomes a quoted SQL string literal (' doubled).
        file_list = ", ".join(
            "'" + f.replace("'", "''") + "'" for f in sorted(files)
        )
        try:
            con.execute(f"""
                CREATE OR REPLACE VIEW {_ident(dataset)}.{_ident(table)} AS
                SELECT * FROM read_parquet([{file_list}], hive_partitioning=true, union_by_name=true)
            """)
            print(f"✓ {dataset}.{table} ({len(files)} files)")
        except Exception as e:
            # Geoparquet tables need the spatial extension; skip them rather
            # than aborting the whole build.  Any other error is fatal.
            if 'Geoparquet' in str(e) or 'geometria' in str(e) or 'geometry' in str(e).lower():
                print(f" skip (geoparquet) {dataset}.{table}")
            else:
                raise
# Flush and release the database file; the schemas/views built above are
# persisted inside basedosdados.duckdb.
con.close()
print("Done! Open with: duckdb --ui basedosdados.duckdb")