replace duckdb-ui with ttyd shell: add /query HTTP endpoint, fix utf-8/locale, region config

- swap DuckDB UI for ttyd web terminal (ttyd --writable; DuckDB opened with -readonly)
- add POST /query endpoint with X-Password auth for curl-based SQL execution
- fix UTF-8 rendering: set LANG/LC_ALL=C.UTF-8 in container
- pass BUCKET_REGION env var for correct S3 signing region
- simplify start.sh: drop Xvfb, views.duckdb generation, blocking duckdb -ui
- add less, ncurses-bin to Dockerfile for proper pager/terminal support
- update Caddyfile: single route to ttyd with flush_interval -1 for websocket
- update README to reflect current architecture and document /query usage
- remove duckdb-ui.service, schemas.json, file_tree.md (generated artifacts)
This commit is contained in:
2026-03-26 11:54:46 +01:00
parent cd94603fac
commit 41e7f7a972
12 changed files with 251 additions and 148583 deletions

View File

@@ -1,6 +1,7 @@
import os
import duckdb
import boto3
from collections import defaultdict
from dotenv import load_dotenv
load_dotenv()
@@ -13,46 +14,51 @@ SECRET_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
# DuckDB expects the endpoint without scheme
s3_endpoint = ENDPOINT_URL.removeprefix('https://').removeprefix('http://')
# Lista todos os prefixos no bucket (dataset/tabela)
# Lista todos os objetos do bucket de uma vez, agrupando por dataset/tabela
s3 = boto3.client('s3',
endpoint_url=ENDPOINT_URL,
aws_access_key_id=ACCESS_KEY,
aws_secret_access_key=SECRET_KEY)
paginator = s3.get_paginator('list_objects_v2')
datasets = {}
for page in paginator.paginate(Bucket=BUCKET, Delimiter='/'):
for prefix in page.get('CommonPrefixes', []):
dataset = prefix['Prefix'].rstrip('/')
datasets[dataset] = []
for page2 in paginator.paginate(Bucket=BUCKET,
Prefix=dataset+'/',
Delimiter='/'):
for p in page2.get('CommonPrefixes', []):
table = p['Prefix'].rstrip('/').split('/')[-1]
datasets[dataset].append(table)
table_files = defaultdict(lambda: defaultdict(list))
for page in paginator.paginate(Bucket=BUCKET):
for obj in page.get('Contents', []):
key = obj['Key']
if not key.endswith('.parquet'):
continue
parts = key.split('/')
if len(parts) >= 3:
dataset, table = parts[0], parts[1]
table_files[dataset][table].append(f"s3://{BUCKET}/{key}")
# Cria conexão DuckDB e configura S3
con = duckdb.connect('basedosdados3.duckdb')
con = duckdb.connect('basedosdados.duckdb')
con.execute("INSTALL httpfs; LOAD httpfs;")
con.execute(f"""
SET s3_endpoint='{s3_endpoint}';
SET s3_access_key_id='{ACCESS_KEY}';
SET s3_secret_access_key='{SECRET_KEY}';
SET s3_url_style='path';
SET enable_object_cache=true;
SET threads=4;
SET memory_limit='6GB';
SET preserve_insertion_order=false;
SET http_keep_alive=true;
SET http_retries=3;
""")
# Cria schemas e views
for dataset, tables in datasets.items():
# Cria schemas e views com lista explícita de arquivos
for dataset, tables in table_files.items():
con.execute(f"CREATE SCHEMA IF NOT EXISTS {dataset}")
for table in tables:
path = f"s3://{BUCKET}/{dataset}/{table}/*.parquet"
for table, files in tables.items():
file_list = ", ".join(f"'{f}'" for f in sorted(files))
try:
con.execute(f"""
CREATE OR REPLACE VIEW {dataset}.{table} AS
SELECT * FROM read_parquet('{path}', hive_partitioning=true)
SELECT * FROM read_parquet([{file_list}], hive_partitioning=true, union_by_name=true)
""")
print(f"{dataset}.{table}")
print(f"{dataset}.{table} ({len(files)} files)")
except Exception as e:
if 'Geoparquet' in str(e) or 'geometria' in str(e) or 'geometry' in str(e).lower():
print(f" skip (geoparquet) {dataset}.{table}")
@@ -60,4 +66,4 @@ for dataset, tables in datasets.items():
raise
con.close()
print("Done! Open with: duckdb --ui basedosdados3.duckdb")
print("Done! Open with: duckdb --ui basedosdados.duckdb")