refactor: reorganize project structure and fix broken references
- Move scripts to scripts/ directory (roda.sh, prepara_db.py, etc.) - Move shell config to shell/ directory (Caddyfile, auth.py, haloy.yml) - Move basedosdados.duckdb to data/ directory - Update Dockerfile and start.sh with new file paths - Update README.md with correct script paths - Remove Python ask.py (replaced by Rust binary in ask/ask) - Add Rust source files (schema_filter.rs, sql_generator.rs, table_selector.rs) - Remove sentence-transformer dependencies from ask - Move docs and context artifacts to their directories
This commit is contained in:
62
scripts/sample_datasets.py
Normal file
62
scripts/sample_datasets.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import duckdb
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
BUCKET = os.environ['HETZNER_S3_BUCKET']
|
||||
ENDPOINT_URL = os.environ['HETZNER_S3_ENDPOINT']
|
||||
ACCESS_KEY = os.environ['AWS_ACCESS_KEY_ID']
|
||||
SECRET_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
|
||||
|
||||
s3_endpoint = ENDPOINT_URL.removeprefix('https://').removeprefix('http://')
|
||||
|
||||
con = duckdb.connect('basedosdados.duckdb')
|
||||
con.execute("LOAD httpfs;")
|
||||
con.execute(f"""
|
||||
SET s3_endpoint='{s3_endpoint}';
|
||||
SET s3_access_key_id='{ACCESS_KEY}';
|
||||
SET s3_secret_access_key='{SECRET_KEY}';
|
||||
SET s3_url_style='path';
|
||||
SET enable_object_cache=true;
|
||||
SET threads=4;
|
||||
SET memory_limit='6GB';
|
||||
""")
|
||||
|
||||
schemas = [row[0] for row in con.execute(
|
||||
"SELECT schema_name FROM information_schema.schemata WHERE schema_name NOT IN ('main', 'information_schema', 'pg_catalog')"
|
||||
).fetchall()]
|
||||
|
||||
try:
|
||||
with open("dataset_sample.txt", "a") as f:
|
||||
f.write("# Dataset samples\n\n")
|
||||
|
||||
for schema in sorted(schemas):
|
||||
tables = [row[0] for row in con.execute(
|
||||
f"SELECT table_name FROM information_schema.tables WHERE table_schema = '{schema}'"
|
||||
).fetchall()]
|
||||
|
||||
for table in sorted(tables):
|
||||
full = f"{schema}.{table}"
|
||||
try:
|
||||
rows = con.execute(
|
||||
f"SELECT * FROM {full} USING SAMPLE 2 ROWS"
|
||||
).fetchall()
|
||||
cols = [f"{d[0]}:{d[1]}" for d in con.description]
|
||||
|
||||
f.write(f"## {schema}/{table}/\n")
|
||||
f.write(",".join(cols) + "\n")
|
||||
for row in rows:
|
||||
f.write(",".join("" if v is None else str(v) for v in row) + "\n")
|
||||
f.write("\n")
|
||||
f.flush()
|
||||
print(f"done: {full}")
|
||||
except Exception as e:
|
||||
f.write(f"## {schema}/{table}/\n[error: {e}]\n\n")
|
||||
f.flush()
|
||||
print(f"error: {full}: {e}")
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\nCancelled.")
|
||||
|
||||
con.close()
|
||||
Reference in New Issue
Block a user