From 9eb2dee013ffdc934bc3c3b92139dd9e7a32d533 Mon Sep 17 00:00:00 2001 From: rafapolo Date: Wed, 25 Mar 2026 13:23:59 +0100 Subject: [PATCH] containerize with Haloy: Dockerfile, Caddy basicauth, haloy.yml for db.xn--2dk.xyz MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Dockerfile: debian slim, installs DuckDB CLI, Python deps, Caddy - start.sh: runs prepara_db.py → starts Caddy (basicauth) → starts DuckDB UI - Caddyfile: updated for container (no TLS, port 8080, Haloy handles HTTPS) - haloy.yml: deploys to db.xn--2dk.xyz on port 8080 - requirements.txt: duckdb, boto3, python-dotenv - prepara_db.py, open_gui.sh, duckdb-ui.service: add previously untracked files - remove prepara_gui.py (replaced by prepara_db.py) --- Caddyfile | 8 ++++++ Dockerfile | 31 +++++++++++++++++++++++ README.md | 45 +++++++++++++++++++++++++++++++++ duckdb-ui.service | 16 ++++++++++++ haloy.yml | 10 ++++++++ open_gui.sh | 6 +++++ prepara_db.py | 63 +++++++++++++++++++++++++++++++++++++++++++++++ prepara_gui.py | 62 ---------------------------------------------- requirements.txt | 3 +++ start.sh | 11 +++++++++ 10 files changed, 193 insertions(+), 62 deletions(-) create mode 100644 Caddyfile create mode 100644 Dockerfile create mode 100644 duckdb-ui.service create mode 100644 haloy.yml create mode 100755 open_gui.sh create mode 100644 prepara_db.py delete mode 100644 prepara_gui.py create mode 100644 requirements.txt create mode 100644 start.sh diff --git a/Caddyfile b/Caddyfile new file mode 100644 index 0000000..417319a --- /dev/null +++ b/Caddyfile @@ -0,0 +1,8 @@ +:8080 { + basicauth /* { + # user: admin | pwd: 2/e+h "$INIT" +duckdb --ui ui.duckdb -init "$INIT" +rm -f "$INIT" diff --git a/prepara_db.py b/prepara_db.py new file mode 100644 index 0000000..050473a --- /dev/null +++ b/prepara_db.py @@ -0,0 +1,63 @@ +import os +import duckdb +import boto3 +from dotenv import load_dotenv + +load_dotenv() + +BUCKET = os.environ['HETZNER_S3_BUCKET'] +ENDPOINT_URL = os.environ['HETZNER_S3_ENDPOINT'] +ACCESS_KEY = os.environ['AWS_ACCESS_KEY_ID'] +SECRET_KEY = os.environ['AWS_SECRET_ACCESS_KEY'] + +# DuckDB expects the endpoint without scheme +s3_endpoint = ENDPOINT_URL.removeprefix('https://').removeprefix('http://') + +# Lista todos os prefixos no bucket (dataset/tabela) +s3 = boto3.client('s3', + endpoint_url=ENDPOINT_URL, + aws_access_key_id=ACCESS_KEY, + aws_secret_access_key=SECRET_KEY) +paginator = s3.get_paginator('list_objects_v2') + +datasets = {} +for page in paginator.paginate(Bucket=BUCKET, Delimiter='/'): + for prefix in page.get('CommonPrefixes', []): + dataset = prefix['Prefix'].rstrip('/') + datasets[dataset] = [] + for page2 in paginator.paginate(Bucket=BUCKET, + Prefix=dataset+'/', + Delimiter='/'): + for p in page2.get('CommonPrefixes', []): + table = p['Prefix'].rstrip('/').split('/')[-1] + datasets[dataset].append(table) + +# Cria conexão DuckDB e configura S3 +con = duckdb.connect('basedosdados3.duckdb') +con.execute("INSTALL httpfs; LOAD httpfs;") +con.execute(f""" + SET s3_endpoint='{s3_endpoint}'; + SET s3_access_key_id='{ACCESS_KEY}'; + SET s3_secret_access_key='{SECRET_KEY}'; + SET s3_url_style='path'; +""") + +# Cria schemas e views +for dataset, tables in datasets.items(): + con.execute(f"CREATE SCHEMA IF NOT EXISTS {dataset}") + for table in tables: + path = f"s3://{BUCKET}/{dataset}/{table}/*.parquet" + try: + con.execute(f""" + CREATE OR REPLACE VIEW {dataset}.{table} AS + SELECT * FROM read_parquet('{path}', hive_partitioning=true) + """) + print(f"✓ {dataset}.{table}") + except Exception as e: + if 'Geoparquet' in str(e) or 'geometria' in str(e) or 'geometry' in str(e).lower(): + print(f" skip (geoparquet) {dataset}.{table}") + else: + raise + +con.close() +print("Done! Open with: duckdb --ui basedosdados3.duckdb") diff --git a/prepara_gui.py b/prepara_gui.py deleted file mode 100644 index ad94638..0000000 --- a/prepara_gui.py +++ /dev/null @@ -1,62 +0,0 @@ -import os -import duckdb -import boto3 -from dotenv import load_dotenv - -load_dotenv() - -S3_ENDPOINT = os.environ["HETZNER_S3_ENDPOINT"] # https://hel1.your-objectstorage.com -S3_BUCKET = os.environ["HETZNER_S3_BUCKET"] # baseldosdados -ACCESS_KEY = os.environ["AWS_ACCESS_KEY_ID"] -SECRET_KEY = os.environ["AWS_SECRET_ACCESS_KEY"] - -# Strip protocol for DuckDB httpfs (expects bare hostname) -s3_host = S3_ENDPOINT.removeprefix("https://").removeprefix("http://") - -con = duckdb.connect('basedosdados.duckdb') - -con.execute("INSTALL httpfs; LOAD httpfs;") -con.execute(f""" - CREATE OR REPLACE PERSISTENT SECRET hetzner ( - TYPE S3, - KEY_ID '{ACCESS_KEY}', - SECRET '{SECRET_KEY}', - ENDPOINT '{s3_host}', - URL_STYLE 'path' - ); -""") - -# List all dataset/table prefixes in the bucket -s3 = boto3.client( - 's3', - endpoint_url=S3_ENDPOINT, - aws_access_key_id=ACCESS_KEY, - aws_secret_access_key=SECRET_KEY, -) -paginator = s3.get_paginator('list_objects_v2') - -datasets = {} -for page in paginator.paginate(Bucket=S3_BUCKET, Delimiter='/'): - for prefix in page.get('CommonPrefixes', []): - dataset = prefix['Prefix'].rstrip('/') - datasets[dataset] = [] - for page2 in paginator.paginate(Bucket=S3_BUCKET, - Prefix=dataset + '/', - Delimiter='/'): - for p in page2.get('CommonPrefixes', []): - table = p['Prefix'].rstrip('/').split('/')[-1] - datasets[dataset].append(table) - -# Create schemas and views -for dataset, tables in datasets.items(): - con.execute(f"CREATE SCHEMA IF NOT EXISTS {dataset}") - for table in tables: - path = f"s3://{S3_BUCKET}/{dataset}/{table}/*.parquet" - con.execute(f""" - CREATE OR REPLACE VIEW {dataset}.{table} AS - SELECT * FROM '{path}' - """) - print(f"✓ {dataset}.{table}") - -con.close() -print("Done! Open with: duckdb --ui basedosdados.duckdb") diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..acb4c9c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +duckdb +boto3 +python-dotenv diff --git a/start.sh b/start.sh new file mode 100644 index 0000000..ee8a6ce --- /dev/null +++ b/start.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -euo pipefail + +echo "[start] Building DuckDB views from S3..." +python3 prepara_db.py + +echo "[start] Starting Caddy..." +caddy start --config /app/Caddyfile --adapter caddyfile + +echo "[start] Starting DuckDB UI on :4213..." +exec duckdb --ui basedosdados3.duckdb