Persistent DuckDB connection in auth.py for faster queries

Replace per-request subprocess spawning with a single long-lived DuckDB
Python connection (in-memory, with the database file ATTACHed read-only).
The cost of LOAD httpfs and S3 authentication is paid once at startup, and
the object cache accumulates across requests.

Benchmarked improvement on remote: Q1 10x, Q2 3x, Q3 9x, Q4 22x faster.
Add duckdb==1.5.1 Python package to Dockerfile.
This commit is contained in:
2026-05-17 11:19:01 +02:00
parent 86a1669902
commit d539736afc
2 changed files with 39 additions and 22 deletions

View File

@@ -38,7 +38,7 @@ FROM --platform=linux/amd64 debian:12-slim
RUN apt-get update -qq && \ RUN apt-get update -qq && \
apt-get install -y --no-install-recommends \ apt-get install -y --no-install-recommends \
curl ca-certificates unzip bsdmainutils python3 \ curl ca-certificates unzip bsdmainutils python3 python3-pip \
less ncurses-bin && \ less ncurses-bin && \
curl -fsSL \ curl -fsSL \
"https://github.com/caddyserver/caddy/releases/download/v2.9.1/caddy_2.9.1_linux_amd64.tar.gz" \ "https://github.com/caddyserver/caddy/releases/download/v2.9.1/caddy_2.9.1_linux_amd64.tar.gz" \
@@ -59,7 +59,8 @@ RUN apt-get update -qq && \
cp /usr/local/libduckdb.so /usr/local/lib/ && \ cp /usr/local/libduckdb.so /usr/local/lib/ && \
ldconfig && \ ldconfig && \
rm /tmp/libduckdb.zip && \ rm /tmp/libduckdb.zip && \
apt-get clean && rm -rf /var/lib/apt/lists/* apt-get clean && rm -rf /var/lib/apt/lists/* && \
pip3 install --no-cache-dir --break-system-packages duckdb==1.5.1
WORKDIR /app WORKDIR /app

56
auth.py
View File

@@ -1,33 +1,48 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
"""Minimal cookie-session auth gate for DuckDB shell.""" """Minimal cookie-session auth gate for DuckDB shell."""
import hmac, hashlib, json, os, secrets, subprocess, time import decimal, datetime, duckdb, hmac, hashlib, json, os, secrets, threading, time
from http.server import HTTPServer, BaseHTTPRequestHandler from http.server import HTTPServer, BaseHTTPRequestHandler
from socketserver import ThreadingMixIn from socketserver import ThreadingMixIn
from urllib.parse import parse_qs from urllib.parse import parse_qs
PASSWORD = os.environ.get('BASIC_AUTH_PASSWORD', '').encode() PASSWORD = os.environ.get('BASIC_AUTH_PASSWORD', '').encode()
_INIT_SQL = None _con = None
_lock = threading.Lock()
def _init_db():
    """Open the shared in-memory DuckDB connection and configure it.

    Installs and loads the ``httpfs`` extension, applies the S3 settings read
    from the ``HETZNER_S3_ENDPOINT``, ``AWS_ACCESS_KEY_ID``,
    ``AWS_SECRET_ACCESS_KEY`` and ``BUCKET_REGION`` environment variables
    (each defaulting to an empty string), and attaches
    ``/app/data/basedosdados.duckdb`` read-only under the name
    ``basedosdados``. The connection is stored in the module-level ``_con``.
    """
    global _con

    def _lit(value):
        # Render *value* as a single-quoted SQL string literal. Doubling any
        # embedded quote keeps a credential that contains "'" from breaking
        # (or altering) the SET statements below.
        return "'" + value.replace("'", "''") + "'"

    env = os.environ.get
    # Strip any URL scheme from the endpoint before handing it to DuckDB.
    endpoint = env('HETZNER_S3_ENDPOINT', '').removeprefix('https://').removeprefix('http://')

    _con = duckdb.connect(':memory:')
    _con.execute("INSTALL httpfs; LOAD httpfs;")
    _con.execute(f"""
        SET s3_endpoint={_lit(endpoint)};
        SET s3_access_key_id={_lit(env("AWS_ACCESS_KEY_ID", ""))};
        SET s3_secret_access_key={_lit(env("AWS_SECRET_ACCESS_KEY", ""))};
        SET s3_region={_lit(env("BUCKET_REGION", ""))};
        SET s3_url_style='path';
        SET enable_object_cache=true;
        SET threads=4;
        SET memory_limit='4GB';
    """)
    _con.execute("ATTACH '/app/data/basedosdados.duckdb' AS basedosdados (READ_ONLY)")
def _json_default(obj):
    """Fallback serializer for values ``json.dumps`` cannot encode natively.

    Decimals become floats, dates and datetimes become their ``isoformat()``
    strings, and every other value is rendered with ``str()``.
    """
    if isinstance(obj, decimal.Decimal):
        return float(obj)
    is_temporal = isinstance(obj, (datetime.date, datetime.datetime))
    return obj.isoformat() if is_temporal else str(obj)
def _run_query(sql, json_mode=True): def _run_query(sql, json_mode=True):
global _INIT_SQL with _lock:
if _INIT_SQL is None: try:
with open('/app/ssh_init.sql') as f: rel = _con.execute(sql)
_INIT_SQL = f.read() cols = [d[0] for d in rel.description]
if json_mode: rows = [{cols[i]: row[i] for i in range(len(cols))} for row in rel.fetchall()]
sql = '.mode json\n' + sql return json.dumps(rows, default=_json_default).encode()
try: except Exception as e:
r = subprocess.run( return json.dumps({'error': str(e)}).encode()
['duckdb', '-readonly', '/app/data/basedosdados.duckdb'],
input=_INIT_SQL + '\n' + sql, capture_output=True, text=True, timeout=120 _SECRET = secrets.token_bytes(32)
)
if r.stdout.strip().startswith('['):
return r.stdout.encode()
err = (r.stderr or r.stdout or 'unknown DuckDB error').strip()
return json.dumps({'error': err}).encode()
except subprocess.TimeoutExpired:
return json.dumps({'error': 'query timed out after 120s'}).encode()
_SECRET = secrets.token_bytes(32)
def _make_token(): def _make_token():
day = str(int(time.time()) // 86400) day = str(int(time.time()) // 86400)
@@ -126,4 +141,5 @@ class H(BaseHTTPRequestHandler):
class ThreadedHTTPServer(ThreadingMixIn, HTTPServer): class ThreadedHTTPServer(ThreadingMixIn, HTTPServer):
daemon_threads = True daemon_threads = True
_init_db()
ThreadedHTTPServer(('127.0.0.1', 8081), H).serve_forever() ThreadedHTTPServer(('127.0.0.1', 8081), H).serve_forever()