feat: add LLM SQL query assistant and dataset sampler

- ask.py: Python script to query Base dos Dados via natural language using Gemini,
  generates and executes DuckDB SQL from Portuguese questions
- ask/ (Rust): CLI companion for the SQL query assistant with system prompt
- sample_datasets.py: samples parquet files from S3 into a local DuckDB for exploration
- sample_datasets/ (Rust): CLI for dataset sampling
- context/: LLM context bundle (schemas, join keys, file tree) for query generation
This commit is contained in:
2026-03-28 11:23:51 +01:00
parent 6801db427e
commit b5d84e3556
13 changed files with 156445 additions and 0 deletions

129
ask.py Normal file
View File

@@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""
ask.py — Send a Portuguese question to Gemini and get back SQL.
Usage:
python ask.py "Quantos pedidos foram feitos por cliente no último mês?"
python ask.py "Qual a taxa de mortalidade infantil por município em 2020?"
Env vars:
GEMINI_API_KEY — required
SCHEMA_FILE — path to DDL file (default: context/schema_compact_inline.txt)
GEMINI_MODEL — model slug (default: gemini-2.0-flash-latest)
"""
import os
import sys
import json
import requests
import duckdb
from dotenv import load_dotenv
load_dotenv()
SCHEMA_FILE = os.getenv("SCHEMA_FILE", "context/schema_compact_inline.txt")
MODEL = os.getenv("GEMINI_MODEL", "gemini-flash-latest")
DB_FILE = os.getenv("DB_FILE", "basedosdados.duckdb")
def load_schema(path: str) -> str:
with open(path, "r", encoding="utf-8") as f:
return f.read()
def ask(question: str) -> str:
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
sys.exit("Error: GEMINI_API_KEY not set")
schema_ddl = load_schema(SCHEMA_FILE)
system_prompt = (
"You are a SQL expert for Base dos Dados (basedosdados.org), "
"a Brazilian open data warehouse with tables accessed via DuckDB.\n\n"
"Rules:\n"
"- Use DuckDB syntax. Tables are referenced as dataset.table.\n"
"- Only use columns from the provided DDL — never invent column names.\n"
"- Add WHERE filters on ano, sigla_uf, or id_municipio whenever possible.\n"
"- Return ONLY the SQL query, no explanation, no markdown fences.\n\n"
f"Schema DDL:\n\n{schema_ddl}"
)
url = (
f"https://generativelanguage.googleapis.com/v1beta/models"
f"/{MODEL}:generateContent"
)
payload = {
"system_instruction": {
"parts": [{"text": system_prompt}]
},
"contents": [
{
"parts": [{"text": question}]
}
]
}
response = requests.post(
url,
headers={
"Content-Type": "application/json",
"X-goog-api-key": api_key,
},
data=json.dumps(payload),
timeout=300,
)
response.raise_for_status()
result = response.json()
return result["candidates"][0]["content"]["parts"][0]["text"].strip()
def main():
if len(sys.argv) < 2:
print(f"Usage: python {sys.argv[0]} \"<pergunta em português>\"", file=sys.stderr)
sys.exit(1)
question = " ".join(sys.argv[1:])
print(f"Question: {question}\n", file=sys.stderr)
print(f"Model: {MODEL}\n", file=sys.stderr)
sql = ask(question)
print(f"\n── SQL ──────────────────────────────────────────\n{sql}\n", file=sys.stderr)
con = duckdb.connect(DB_FILE, read_only=True)
rel = con.sql(sql)
# box mode: build borders from column names + data
cols = rel.columns
rows = rel.fetchall()
if not rows:
print("(no rows returned)")
return
col_widths = [len(c) for c in cols]
for row in rows:
for i, val in enumerate(row):
col_widths[i] = max(col_widths[i], len(str(val) if val is not None else "NULL"))
def bar(left, mid, right, fill=""):
return left + mid.join(fill * (w + 2) for w in col_widths) + right
header = "" + "".join(f" {c:{w}} " for c, w in zip(cols, col_widths)) + ""
print(bar("", "", ""))
print(header)
print(bar("", "", ""))
for row in rows:
vals = [str(v) if v is not None else "NULL" for v in row]
print("" + "".join(f" {v:{w}} " for v, w in zip(vals, col_widths)) + "")
print(bar("", "", ""))
print(f"\n{len(rows)} row(s)")
if __name__ == "__main__":
main()