From 9eb2dee013ffdc934bc3c3b92139dd9e7a32d533 Mon Sep 17 00:00:00 2001
From: rafapolo <rafael@extrapolo.com>
Date: Wed, 25 Mar 2026 13:23:59 +0100
Subject: [PATCH] containerize with Haloy: Dockerfile, Caddy basicauth,
 haloy.yml for db.xn--2dk.xyz
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Dockerfile: debian slim, installs DuckDB CLI, Python deps, Caddy
- start.sh: runs prepara_db.py → starts Caddy (basicauth) → starts DuckDB UI
- Caddyfile: updated for container (no TLS, port 8080, Haloy handles HTTPS)
- haloy.yml: deploys to db.xn--2dk.xyz on port 8080
- requirements.txt: duckdb, boto3, python-dotenv
- prepara_db.py, open_gui.sh, duckdb-ui.service: add previously untracked files
- remove prepara_gui.py (replaced by prepara_db.py)
---
 Caddyfile         |  8 ++++++
 Dockerfile        | 31 +++++++++++++++++++++++
 README.md         | 45 +++++++++++++++++++++++++++++++++
 duckdb-ui.service | 16 ++++++++++++
 haloy.yml         | 10 ++++++++
 open_gui.sh       |  6 +++++
 prepara_db.py     | 63 +++++++++++++++++++++++++++++++++++++++++++++++
 prepara_gui.py    | 62 ----------------------------------------------
 requirements.txt  |  3 +++
 start.sh          | 11 +++++++++
 10 files changed, 193 insertions(+), 62 deletions(-)
 create mode 100644 Caddyfile
 create mode 100644 Dockerfile
 create mode 100644 duckdb-ui.service
 create mode 100644 haloy.yml
 create mode 100755 open_gui.sh
 create mode 100644 prepara_db.py
 delete mode 100644 prepara_gui.py
 create mode 100644 requirements.txt
 create mode 100644 start.sh

diff --git a/Caddyfile b/Caddyfile
new file mode 100644
index 0000000..417319a
--- /dev/null
+++ b/Caddyfile
@@ -0,0 +1,8 @@
+:8080 {
+    basicauth /* {
+        # user: admin | password: kept out of the repo (rotate this hash if it was ever exposed)
+        # regenerate: htpasswd -nbB -C 10 admin NEWPWD | cut -d: -f2 | base64
+        admin JDJ5JDEwJHlaV2tLUzBQL2ZsSndBL2g4WDZBNk9NdEZtTnVqcThOOHZ2aXNGRVVMWHhJUDB0WHhNanZD
+    }
+    reverse_proxy localhost:4213  # DuckDB UI started by start.sh
+}
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..f14c95e
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,31 @@
+FROM debian:12-slim
+
+# Build-time only: ARG keeps the apt setting out of the runtime environment.
+ARG DEBIAN_FRONTEND=noninteractive
+# Defaults to the newest DuckDB CLI; pin with --build-arg DUCKDB_RELEASE=download/vX.Y.Z
+ARG DUCKDB_RELEASE=latest/download
+
+# System deps, Caddy (distro package, not an unverified `curl | bash`), DuckDB CLI, runtime user
+RUN apt-get update -qq && \
+    apt-get install -y --no-install-recommends \
+        ca-certificates caddy curl python3 python3-pip python3-venv unzip && \
+    curl -fsSL \
+        "https://github.com/duckdb/duckdb/releases/${DUCKDB_RELEASE}/duckdb_cli-linux-amd64.zip" \
+        -o /tmp/duckdb.zip && \
+    unzip /tmp/duckdb.zip -d /usr/local/bin && \
+    chmod +x /usr/local/bin/duckdb && \
+    rm /tmp/duckdb.zip && \
+    apt-get clean && rm -rf /var/lib/apt/lists/* && \
+    useradd --create-home --uid 10001 app
+
+WORKDIR /app
+COPY requirements.txt .
+RUN pip3 install --no-cache-dir --break-system-packages -r requirements.txt
+
+COPY prepara_db.py Caddyfile start.sh ./
+# prepara_db.py writes basedosdados3.duckdb into /app at startup, so `app` must own it.
+RUN chmod +x start.sh && chown -R app:app /app
+
+USER app
+EXPOSE 8080
+ENTRYPOINT ["./start.sh"]
diff --git a/README.md b/README.md
index 7020286..e851a7e 100644
--- a/README.md
+++ b/README.md
@@ -101,6 +101,51 @@ duckdb --ui basedosdados.duckdb
 python gera_schemas.py  # gera schemas.json e file_tree.md (~21 MB de egress)
 ```
 
+## Servidor com UI protegida por senha
+
+Para expor o DuckDB UI num servidor com HTTPS e autenticação básica, use o [Caddy](https://caddyserver.com/) como reverse proxy.
+
+**Pré-requisitos no servidor:** `caddy`, `htpasswd` (pacote `apache2-utils`), `duckdb`
+
+**1. Instalar o serviço DuckDB UI**
+
+Edite `duckdb-ui.service` com o usuário e caminho corretos e copie para o systemd:
+
+```bash
+# edite User= e WorkingDirectory= e EnvironmentFile= no arquivo
+cp duckdb-ui.service /etc/systemd/system/
+systemctl daemon-reload
+systemctl enable --now duckdb-ui
+```
+
+**2. Configurar o Caddy**
+
+Edite `Caddyfile` substituindo `:8080` pelo domínio real (ex.: `your.domain.com`), depois:
+
+```bash
+cp Caddyfile /etc/caddy/Caddyfile
+systemctl reload caddy
+```
+
+O Caddy obtém o certificado TLS via Let's Encrypt automaticamente (portas 80 e 443 abertas no firewall).
+
+**Trocar a senha:**
+
+```bash
+htpasswd -nbB -C 10 admin NOVA_SENHA | cut -d: -f2 | base64
+# cole o resultado no Caddyfile no lugar do hash atual, depois:
+systemctl reload caddy
+```
+
+**Arquivos relevantes:**
+
+| Arquivo | Função |
+|---|---|
+| `Caddyfile` | Config do Caddy: HTTPS + basicauth → proxy para localhost:4213 |
+| `duckdb-ui.service` | Serviço systemd que sobe o DuckDB UI em background |
+
+---
+
 ### `--gcloud-run`
 
 Cria uma VM `e2-standard-4` Debian 12 em `us-central1-a`, copia o script e o `.env`, instala as dependências e executa via SSH. Variáveis opcionais:
diff --git a/duckdb-ui.service b/duckdb-ui.service
new file mode 100644
index 0000000..4d81cbb
--- /dev/null
+++ b/duckdb-ui.service
@@ -0,0 +1,16 @@
+[Unit]
+Description=DuckDB UI - basedosdados explorer
+After=network.target
+
+[Service]
+Type=simple
+User=YOUR_USER
+WorkingDirectory=/path/to/baseldosdados
+ExecStartPre=/usr/bin/python3 prepara_db.py
+ExecStart=/usr/bin/duckdb --ui basedosdados3.duckdb
+Restart=on-failure
+RestartSec=5s
+EnvironmentFile=/path/to/baseldosdados/.env
+# ExecStartPre runs prepara_db.py, which writes the basedosdados3.duckdb opened by ExecStart.
+[Install]
+WantedBy=multi-user.target
diff --git a/haloy.yml b/haloy.yml
new file mode 100644
index 0000000..8708fcb
--- /dev/null
+++ b/haloy.yml
@@ -0,0 +1,10 @@
+name: basedosdados
+server: YOUR_SERVER_IP_OR_HOSTNAME  # placeholder: set to the deployment host before use
+domains:
+  - domain: db.xn--2dk.xyz
+port: 8080  # same port as the `:8080` listener in Caddyfile and EXPOSE in the Dockerfile
+env:  # names read by prepara_db.py; NOTE(review): confirm Haloy accepts bare names without values
+  - HETZNER_S3_BUCKET
+  - HETZNER_S3_ENDPOINT
+  - AWS_ACCESS_KEY_ID
+  - AWS_SECRET_ACCESS_KEY
diff --git a/open_gui.sh b/open_gui.sh
new file mode 100755
index 0000000..72d31c9
--- /dev/null
+++ b/open_gui.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+cd "$(dirname "$0")" || exit 1  # relative database paths below resolve next to this script
+INIT=$(mktemp /tmp/duckdb_init_XXXX) || exit 1  # throwaway file passed to duckdb via -init
+trap 'rm -f "$INIT"' EXIT  # clean up even when duckdb fails; the script now exits with duckdb's status
+printf "LOAD httpfs;\nATTACH 'basedosdados.duckdb' AS bd (READ_ONLY);\n" > "$INIT"
+duckdb --ui ui.duckdb -init "$INIT"
diff --git a/prepara_db.py b/prepara_db.py
new file mode 100644
index 0000000..050473a
--- /dev/null
+++ b/prepara_db.py
@@ -0,0 +1,63 @@
+import os
+import duckdb
+import boto3
+from dotenv import load_dotenv
+
+load_dotenv()  # load variables from a .env file when one is present
+
+BUCKET       = os.environ['HETZNER_S3_BUCKET']
+ENDPOINT_URL = os.environ['HETZNER_S3_ENDPOINT']
+ACCESS_KEY   = os.environ['AWS_ACCESS_KEY_ID']
+SECRET_KEY   = os.environ['AWS_SECRET_ACCESS_KEY']
+
+# DuckDB expects the endpoint without scheme
+s3_endpoint = ENDPOINT_URL.removeprefix('https://').removeprefix('http://')
+
+# List every dataset/table prefix pair in the bucket (two levels of common prefixes)
+s3 = boto3.client('s3', endpoint_url=ENDPOINT_URL,
+                  aws_access_key_id=ACCESS_KEY,
+                  aws_secret_access_key=SECRET_KEY)
+paginator = s3.get_paginator('list_objects_v2')
+
+datasets = {}
+for page in paginator.paginate(Bucket=BUCKET, Delimiter='/'):
+    for prefix in page.get('CommonPrefixes', []):
+        dataset = prefix['Prefix'].rstrip('/')
+        datasets[dataset] = []
+        for page2 in paginator.paginate(Bucket=BUCKET, Prefix=dataset+'/',
+                                         Delimiter='/'):
+            for p in page2.get('CommonPrefixes', []):
+                table = p['Prefix'].rstrip('/').split('/')[-1]
+                datasets[dataset].append(table)
+
+# Open the database and store the S3 credentials as a PERSISTENT secret. Plain SET options
+# end with this connection, which would leave the views unreadable for the later
+# `duckdb --ui` process; the secret is kept (unencrypted) in the user's DuckDB directory.
+con = duckdb.connect('basedosdados3.duckdb')
+con.execute("INSTALL httpfs; LOAD httpfs;")
+con.execute(f"""
+    CREATE OR REPLACE PERSISTENT SECRET hetzner (
+        TYPE S3, KEY_ID '{ACCESS_KEY}', SECRET '{SECRET_KEY}',
+        ENDPOINT '{s3_endpoint}', URL_STYLE 'path'
+    );
+""")
+
+# One schema per dataset, one view per table; identifiers are quoted so unusual names still parse
+for dataset, tables in datasets.items():
+    con.execute(f'CREATE SCHEMA IF NOT EXISTS "{dataset}"')
+    for table in tables:
+        path = f"s3://{BUCKET}/{dataset}/{table}/*.parquet"
+        try:
+            con.execute(f"""
+                CREATE OR REPLACE VIEW "{dataset}"."{table}" AS
+                SELECT * FROM read_parquet('{path}', hive_partitioning=true)
+            """)
+            print(f"✓ {dataset}.{table}")
+        except Exception as e:
+            if 'Geoparquet' in str(e) or 'geometria' in str(e) or 'geometry' in str(e).lower():
+                print(f"  skip (geoparquet) {dataset}.{table}")  # best effort: geometry tables are skipped
+            else:
+                raise  # any other failure aborts the run
+
+con.close()
+print("Done! Open with: duckdb --ui basedosdados3.duckdb")
diff --git a/prepara_gui.py b/prepara_gui.py
deleted file mode 100644
index ad94638..0000000
--- a/prepara_gui.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import os
-import duckdb
-import boto3
-from dotenv import load_dotenv
-
-load_dotenv()
-
-S3_ENDPOINT = os.environ["HETZNER_S3_ENDPOINT"]          # https://hel1.your-objectstorage.com
-S3_BUCKET   = os.environ["HETZNER_S3_BUCKET"]            # baseldosdados
-ACCESS_KEY  = os.environ["AWS_ACCESS_KEY_ID"]
-SECRET_KEY  = os.environ["AWS_SECRET_ACCESS_KEY"]
-
-# Strip protocol for DuckDB httpfs (expects bare hostname)
-s3_host = S3_ENDPOINT.removeprefix("https://").removeprefix("http://")
-
-con = duckdb.connect('basedosdados.duckdb')
-
-con.execute("INSTALL httpfs; LOAD httpfs;")
-con.execute(f"""
-    CREATE OR REPLACE PERSISTENT SECRET hetzner (
-        TYPE S3,
-        KEY_ID '{ACCESS_KEY}',
-        SECRET '{SECRET_KEY}',
-        ENDPOINT '{s3_host}',
-        URL_STYLE 'path'
-    );
-""")
-
-# List all dataset/table prefixes in the bucket
-s3 = boto3.client(
-    's3',
-    endpoint_url=S3_ENDPOINT,
-    aws_access_key_id=ACCESS_KEY,
-    aws_secret_access_key=SECRET_KEY,
-)
-paginator = s3.get_paginator('list_objects_v2')
-
-datasets = {}
-for page in paginator.paginate(Bucket=S3_BUCKET, Delimiter='/'):
-    for prefix in page.get('CommonPrefixes', []):
-        dataset = prefix['Prefix'].rstrip('/')
-        datasets[dataset] = []
-        for page2 in paginator.paginate(Bucket=S3_BUCKET,
-                                        Prefix=dataset + '/',
-                                        Delimiter='/'):
-            for p in page2.get('CommonPrefixes', []):
-                table = p['Prefix'].rstrip('/').split('/')[-1]
-                datasets[dataset].append(table)
-
-# Create schemas and views
-for dataset, tables in datasets.items():
-    con.execute(f"CREATE SCHEMA IF NOT EXISTS {dataset}")
-    for table in tables:
-        path = f"s3://{S3_BUCKET}/{dataset}/{table}/*.parquet"
-        con.execute(f"""
-            CREATE OR REPLACE VIEW {dataset}.{table} AS
-            SELECT * FROM '{path}'
-        """)
-        print(f"✓ {dataset}.{table}")
-
-con.close()
-print("Done! Open with: duckdb --ui basedosdados.duckdb")
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..acb4c9c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+duckdb  # NOTE(review): unpinned; prepara_db.py writes the file the DuckDB CLI later opens - consider pinning both
+boto3
+python-dotenv
diff --git a/start.sh b/start.sh
new file mode 100644
index 0000000..ee8a6ce
--- /dev/null
+++ b/start.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+set -euo pipefail
+
+echo "[start] Building DuckDB views from S3..."
+python3 prepara_db.py  # writes basedosdados3.duckdb into the current directory
+
+echo "[start] Starting Caddy..."
+caddy start --config /app/Caddyfile --adapter caddyfile  # `caddy start` backgrounds Caddy and returns
+
+echo "[start] Starting DuckDB UI on :4213..."
+exec duckdb --ui basedosdados3.duckdb  # NOTE(review): the CLI presumably exits at stdin EOF - confirm stdin stays open in the container