refactor: reorganize project structure and fix broken references
- Move scripts to scripts/ directory (roda.sh, prepara_db.py, etc.)
- Move shell config to shell/ directory (Caddyfile, auth.py, haloy.yml)
- Move basedosdados.duckdb to data/ directory
- Update Dockerfile and start.sh with new file paths
- Update README.md with correct script paths
- Remove Python ask.py (replaced by Rust binary in ask/ask)
- Add Rust source files (schema_filter.rs, sql_generator.rs, table_selector.rs)
- Remove sentence-transformer dependencies from ask
- Move docs and context artifacts to their directories
This commit is contained in:
69
scripts/prepara_db.py
Normal file
69
scripts/prepara_db.py
Normal file
@@ -0,0 +1,69 @@
|
||||
"""Build a local DuckDB database of views over parquet files in a Hetzner S3 bucket.

Lists every ``*.parquet`` object in the bucket once, groups keys by their
``dataset/table/...`` prefix, then creates one DuckDB schema per dataset and
one view per table reading the explicit file list over httpfs.

Environment (read in ``main``): HETZNER_S3_BUCKET, HETZNER_S3_ENDPOINT,
AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY (optionally via a ``.env`` file).
"""
import os
from collections import defaultdict


def quote_ident(name):
    """Return *name* as a double-quoted SQL identifier, doubling embedded quotes.

    S3 key segments become schema/view names; quoting keeps a key containing
    spaces, dots or quotes from breaking (or injecting into) the DDL.
    """
    return '"' + name.replace('"', '""') + '"'


def quote_literal(value):
    """Return *value* as a single-quoted SQL string literal, doubling embedded quotes."""
    return "'" + value.replace("'", "''") + "'"


def group_parquet_keys(keys, bucket):
    """Group parquet object keys into ``{dataset: {table: [s3 urls]}}``.

    Only keys ending in ``.parquet`` with at least three ``/``-separated
    segments (``dataset/table/.../file.parquet``) are kept; everything else
    is silently ignored.
    """
    table_files = defaultdict(lambda: defaultdict(list))
    for key in keys:
        if not key.endswith('.parquet'):
            continue
        parts = key.split('/')
        if len(parts) >= 3:
            dataset, table = parts[0], parts[1]
            table_files[dataset][table].append(f"s3://{bucket}/{key}")
    return table_files


def main():
    # Third-party imports live here so the module can be imported (e.g. for
    # tests) without boto3/duckdb/dotenv installed.
    import boto3
    import duckdb
    from dotenv import load_dotenv

    load_dotenv()

    bucket = os.environ['HETZNER_S3_BUCKET']
    endpoint_url = os.environ['HETZNER_S3_ENDPOINT']
    access_key = os.environ['AWS_ACCESS_KEY_ID']
    secret_key = os.environ['AWS_SECRET_ACCESS_KEY']

    # DuckDB expects the endpoint without scheme
    s3_endpoint = endpoint_url.removeprefix('https://').removeprefix('http://')

    # Lista todos os objetos do bucket de uma vez, agrupando por dataset/tabela
    s3 = boto3.client('s3',
                      endpoint_url=endpoint_url,
                      aws_access_key_id=access_key,
                      aws_secret_access_key=secret_key)
    paginator = s3.get_paginator('list_objects_v2')
    all_keys = (obj['Key']
                for page in paginator.paginate(Bucket=bucket)
                for obj in page.get('Contents', []))
    table_files = group_parquet_keys(all_keys, bucket)

    # Cria conexão DuckDB e configura S3
    con = duckdb.connect('basedosdados.duckdb')
    con.execute("INSTALL httpfs; LOAD httpfs;")
    # DuckDB SET has no parameter binding, so credentials are escaped as SQL
    # literals instead of being interpolated raw (a quote in a secret would
    # otherwise break the statement).
    con.execute(f"""
        SET s3_endpoint={quote_literal(s3_endpoint)};
        SET s3_access_key_id={quote_literal(access_key)};
        SET s3_secret_access_key={quote_literal(secret_key)};
        SET s3_url_style='path';
        SET enable_object_cache=true;
        SET threads=4;
        SET memory_limit='6GB';
        SET preserve_insertion_order=false;
        SET http_keep_alive=true;
        SET http_retries=3;
    """)

    # Cria schemas e views com lista explícita de arquivos
    for dataset, tables in table_files.items():
        con.execute(f"CREATE SCHEMA IF NOT EXISTS {quote_ident(dataset)}")
        for table, files in tables.items():
            file_list = ", ".join(quote_literal(f) for f in sorted(files))
            try:
                con.execute(f"""
                    CREATE OR REPLACE VIEW {quote_ident(dataset)}.{quote_ident(table)} AS
                    SELECT * FROM read_parquet([{file_list}], hive_partitioning=true, union_by_name=true)
                """)
                print(f"✓ {dataset}.{table} ({len(files)} files)")
            except Exception as e:
                # Geoparquet tables need the spatial extension; skip them,
                # re-raise anything else.
                if 'Geoparquet' in str(e) or 'geometria' in str(e) or 'geometry' in str(e).lower():
                    print(f" skip (geoparquet) {dataset}.{table}")
                else:
                    raise

    con.close()
    print("Done! Open with: duckdb --ui basedosdados.duckdb")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user