Persistent DuckDB connection in auth.py for faster queries
Replace per-request subprocess spawning with a single long-lived duckdb Python connection (in-memory + ATTACH read-only). LOAD httpfs and S3 auth are paid once at startup; object cache accumulates across requests. Benchmarked improvement on remote: Q1 10x, Q2 3x, Q3 9x, Q4 22x faster. Add duckdb==1.5.1 Python package to Dockerfile.
This commit is contained in:
@@ -38,7 +38,7 @@ FROM --platform=linux/amd64 debian:12-slim
|
|||||||
|
|
||||||
RUN apt-get update -qq && \
|
RUN apt-get update -qq && \
|
||||||
apt-get install -y --no-install-recommends \
|
apt-get install -y --no-install-recommends \
|
||||||
curl ca-certificates unzip bsdmainutils python3 \
|
curl ca-certificates unzip bsdmainutils python3 python3-pip \
|
||||||
less ncurses-bin && \
|
less ncurses-bin && \
|
||||||
curl -fsSL \
|
curl -fsSL \
|
||||||
"https://github.com/caddyserver/caddy/releases/download/v2.9.1/caddy_2.9.1_linux_amd64.tar.gz" \
|
"https://github.com/caddyserver/caddy/releases/download/v2.9.1/caddy_2.9.1_linux_amd64.tar.gz" \
|
||||||
@@ -59,7 +59,8 @@ RUN apt-get update -qq && \
|
|||||||
cp /usr/local/libduckdb.so /usr/local/lib/ && \
|
cp /usr/local/libduckdb.so /usr/local/lib/ && \
|
||||||
ldconfig && \
|
ldconfig && \
|
||||||
rm /tmp/libduckdb.zip && \
|
rm /tmp/libduckdb.zip && \
|
||||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
apt-get clean && rm -rf /var/lib/apt/lists/* && \
|
||||||
|
pip3 install --no-cache-dir --break-system-packages duckdb==1.5.1
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
|
|||||||
52
auth.py
52
auth.py
@@ -1,32 +1,47 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""Minimal cookie-session auth gate for DuckDB shell."""
|
"""Minimal cookie-session auth gate for DuckDB shell."""
|
||||||
import hmac, hashlib, json, os, secrets, subprocess, time
|
import decimal, datetime, duckdb, hmac, hashlib, json, os, secrets, threading, time
|
||||||
from http.server import HTTPServer, BaseHTTPRequestHandler
|
from http.server import HTTPServer, BaseHTTPRequestHandler
|
||||||
from socketserver import ThreadingMixIn
|
from socketserver import ThreadingMixIn
|
||||||
from urllib.parse import parse_qs
|
from urllib.parse import parse_qs
|
||||||
|
|
||||||
PASSWORD = os.environ.get('BASIC_AUTH_PASSWORD', '').encode()
|
PASSWORD = os.environ.get('BASIC_AUTH_PASSWORD', '').encode()
|
||||||
|
|
||||||
_INIT_SQL = None
|
_con = None
|
||||||
|
_lock = threading.Lock()
|
||||||
|
|
||||||
|
def _sql_literal(value):
    """Return *value* as a single-quoted SQL string literal, doubling embedded quotes."""
    return "'" + str(value).replace("'", "''") + "'"


def _init_db():
    """Open the process-wide DuckDB connection and store it in ``_con``.

    Connects to an in-memory database, installs and loads ``httpfs``, applies
    the S3 and resource settings, and attaches
    ``/app/data/basedosdados.duckdb`` read-only as ``basedosdados``.

    S3 settings come from the environment variables ``HETZNER_S3_ENDPOINT``
    (any ``https://`` / ``http://`` scheme prefix is stripped),
    ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY`` and ``BUCKET_REGION``;
    each defaults to the empty string when unset.

    Raises:
        Whatever ``duckdb`` raises if connecting, loading the extension,
        applying a setting, or attaching the database file fails.
    """
    global _con
    endpoint = os.environ.get('HETZNER_S3_ENDPOINT', '').removeprefix('https://').removeprefix('http://')
    env_settings = (
        ('s3_endpoint', endpoint),
        ('s3_access_key_id', os.environ.get('AWS_ACCESS_KEY_ID', '')),
        ('s3_secret_access_key', os.environ.get('AWS_SECRET_ACCESS_KEY', '')),
        ('s3_region', os.environ.get('BUCKET_REGION', '')),
    )
    # Environment values are quoted through _sql_literal so that a "'" inside
    # a credential or endpoint cannot terminate the literal early.
    statements = [f"SET {name}={_sql_literal(value)};" for name, value in env_settings]
    statements += [
        "SET s3_url_style='path';",
        "SET enable_object_cache=true;",
        "SET threads=4;",
        "SET memory_limit='4GB';",
    ]

    con = duckdb.connect(':memory:')
    con.execute("INSTALL httpfs; LOAD httpfs;")
    con.execute("\n".join(statements))
    con.execute("ATTACH '/app/data/basedosdados.duckdb' AS basedosdados (READ_ONLY)")
    # Publish the connection only once it is fully configured.
    _con = con
|
||||||
|
|
||||||
|
def _json_default(obj):
    """Fallback serializer for values ``json.dumps`` cannot encode natively.

    Meant to be passed as the ``default=`` hook of ``json.dumps``.

    Args:
        obj: The value the JSON encoder could not serialize on its own.

    Returns:
        A ``float`` for ``decimal.Decimal``, an ISO-8601 string for
        ``date`` / ``datetime`` / ``time`` values, decoded text for
        ``bytes`` / ``bytearray``, and ``str(obj)`` for anything else.
    """
    if isinstance(obj, decimal.Decimal):
        # Keeps the value a JSON number; note that float can round decimals
        # carrying more significant digits than a double can hold.
        return float(obj)
    # datetime.datetime subclasses datetime.date, so ``date`` covers both.
    if isinstance(obj, (datetime.date, datetime.time)):
        return obj.isoformat()
    if isinstance(obj, (bytes, bytearray)):
        # str(b"abc") would leak the Python repr ("b'abc'") into the payload;
        # undecodable bytes are kept visible as \xNN escapes instead.
        return bytes(obj).decode("utf-8", errors="backslashreplace")
    return str(obj)
|
||||||
|
|
||||||
_QUERY_TIMEOUT_S = 120  # same limit the former subprocess-based runner enforced


def _run_query(sql, json_mode=True):
    """Run *sql* on the shared DuckDB connection and return JSON-encoded bytes.

    Queries are serialized through ``_lock`` because they all share the one
    connection held in ``_con``. A watchdog timer interrupts a query that
    runs longer than ``_QUERY_TIMEOUT_S`` seconds so a single slow statement
    cannot hold the lock indefinitely.

    Args:
        sql: SQL text to execute.
        json_mode: Accepted for call compatibility; it is not consulted and
            the result is always JSON.

    Returns:
        UTF-8 bytes of a JSON array holding one object per row (column name
        mapped to value; an empty array when the statement yields no result
        set), or of ``{"error": "<message>"}`` when execution or
        serialization fails or the timeout is hit.
    """
    timed_out = threading.Event()

    def _abort():
        # Runs on the timer thread: remember why the query died, then stop it.
        timed_out.set()
        _con.interrupt()

    with _lock:
        watchdog = threading.Timer(_QUERY_TIMEOUT_S, _abort)
        watchdog.daemon = True
        watchdog.start()
        try:
            cur = _con.execute(sql)
            # Per DB-API, description is None for statements that produce no
            # result set; treat that as an empty result rather than failing.
            if cur.description is None:
                cols, rows = [], []
            else:
                cols = [d[0] for d in cur.description]
                rows = cur.fetchall()
        except Exception as e:
            if timed_out.is_set():
                return json.dumps({'error': f'query timed out after {_QUERY_TIMEOUT_S}s'}).encode()
            return json.dumps({'error': str(e)}).encode()
        finally:
            # Stop the watchdog while the lock is still held, and wait for a
            # callback that has already started, so a late interrupt can
            # never land on the next request's query.
            watchdog.cancel()
            watchdog.join()

    # Serialization needs no database access, so it happens outside the lock.
    try:
        payload = [dict(zip(cols, row)) for row in rows]
        return json.dumps(payload, default=_json_default).encode()
    except Exception as e:
        return json.dumps({'error': str(e)}).encode()
|
|
||||||
_SECRET = secrets.token_bytes(32)
|
_SECRET = secrets.token_bytes(32)
|
||||||
|
|
||||||
def _make_token():
|
def _make_token():
|
||||||
@@ -126,4 +141,5 @@ class H(BaseHTTPRequestHandler):
|
|||||||
class ThreadedHTTPServer(ThreadingMixIn, HTTPServer):
    """HTTPServer that serves each request on its own daemon thread."""

    daemon_threads = True


# Open the shared DuckDB connection before the server starts accepting requests.
_init_db()

httpd = ThreadedHTTPServer(('127.0.0.1', 8081), H)
httpd.serve_forever()
|
||||||
|
|||||||
Reference in New Issue
Block a user