Files
baseldosdados/sample_datasets.py
rafapolo b5d84e3556 feat: add LLM SQL query assistant and dataset sampler
- ask.py: Python script to query Base dos Dados via natural language using Gemini,
  generates and executes DuckDB SQL from Portuguese questions
- ask/ (Rust): CLI companion for the SQL query assistant with system prompt
- sample_datasets.py: samples parquet files from S3 into a local DuckDB for exploration
- sample_datasets/ (Rust): CLI for dataset sampling
- context/: LLM context bundle (schemas, join keys, file tree) for query generation
2026-03-28 11:23:51 +01:00

63 lines
2.0 KiB
Python

import duckdb
import os
from dotenv import load_dotenv
load_dotenv()
BUCKET = os.environ['HETZNER_S3_BUCKET']
ENDPOINT_URL = os.environ['HETZNER_S3_ENDPOINT']
ACCESS_KEY = os.environ['AWS_ACCESS_KEY_ID']
SECRET_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
s3_endpoint = ENDPOINT_URL.removeprefix('https://').removeprefix('http://')
con = duckdb.connect('basedosdados.duckdb')
con.execute("LOAD httpfs;")
con.execute(f"""
SET s3_endpoint='{s3_endpoint}';
SET s3_access_key_id='{ACCESS_KEY}';
SET s3_secret_access_key='{SECRET_KEY}';
SET s3_url_style='path';
SET enable_object_cache=true;
SET threads=4;
SET memory_limit='6GB';
""")
schemas = [row[0] for row in con.execute(
"SELECT schema_name FROM information_schema.schemata WHERE schema_name NOT IN ('main', 'information_schema', 'pg_catalog')"
).fetchall()]
try:
with open("dataset_sample.txt", "a") as f:
f.write("# Dataset samples\n\n")
for schema in sorted(schemas):
tables = [row[0] for row in con.execute(
f"SELECT table_name FROM information_schema.tables WHERE table_schema = '{schema}'"
).fetchall()]
for table in sorted(tables):
full = f"{schema}.{table}"
try:
rows = con.execute(
f"SELECT * FROM {full} USING SAMPLE 2 ROWS"
).fetchall()
cols = [f"{d[0]}:{d[1]}" for d in con.description]
f.write(f"## {schema}/{table}/\n")
f.write(",".join(cols) + "\n")
for row in rows:
f.write(",".join("" if v is None else str(v) for v in row) + "\n")
f.write("\n")
f.flush()
print(f"done: {full}")
except Exception as e:
f.write(f"## {schema}/{table}/\n[error: {e}]\n\n")
f.flush()
print(f"error: {full}: {e}")
except KeyboardInterrupt:
print("\nCancelled.")
con.close()