- ask.py: Python script to query Base dos Dados via natural language using Gemini, generates and executes DuckDB SQL from Portuguese questions - ask/ (Rust): CLI companion for the SQL query assistant with system prompt - sample_datasets.py: samples parquet files from S3 into a local DuckDB for exploration - sample_datasets/ (Rust): CLI for dataset sampling - context/: LLM context bundle (schemas, join keys, file tree) for query generation
130 lines
3.8 KiB
Python
130 lines
3.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
ask.py — Send a Portuguese question to Gemini, get back DuckDB SQL, run it against a local DuckDB file and print the result as a table.
|
|
|
|
Usage:
|
|
python ask.py "Quantos pedidos foram feitos por cliente no último mês?"
|
|
python ask.py "Qual a taxa de mortalidade infantil por município em 2020?"
|
|
|
|
Env vars:
|
|
GEMINI_API_KEY — required
|
|
SCHEMA_FILE — path to DDL file (default: context/schema_compact_inline.txt)
|
|
GEMINI_MODEL — model slug (default: gemini-flash-latest)
DB_FILE — path to the DuckDB database file to query (default: basedosdados.duckdb)
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import requests
|
|
import duckdb
|
|
from dotenv import load_dotenv
|
|
|
|
# Load variables from a .env file (python-dotenv) before any of the
# configuration below is read from the environment.
load_dotenv()

# Runtime configuration; each value can be overridden by an environment variable.
SCHEMA_FILE = os.environ.get("SCHEMA_FILE", "context/schema_compact_inline.txt")  # DDL text sent to the model
MODEL = os.environ.get("GEMINI_MODEL", "gemini-flash-latest")  # Gemini model slug used in the request URL
DB_FILE = os.environ.get("DB_FILE", "basedosdados.duckdb")  # DuckDB file the generated SQL runs against
|
|
|
|
|
|
def load_schema(path: str) -> str:
    """Return the full text of the schema (DDL) file at *path*.

    The file is decoded as UTF-8. Any error raised by ``open`` (for example a
    missing file) propagates to the caller unchanged.
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents
|
|
|
|
|
|
def _strip_code_fences(text: str) -> str:
    """Return *text* without a wrapping Markdown code fence, if it has one."""
    cleaned = text.strip()
    if not cleaned.startswith("```") or "\n" not in cleaned:
        return cleaned
    lines = cleaned.splitlines()
    # Only treat the first line as a fence when it is "```" plus an optional
    # language tag (e.g. "```sql"); otherwise leave the text untouched.
    tag = lines[0][3:].strip()
    if tag and not tag.isalnum():
        return cleaned
    body = lines[1:]
    if body and body[-1].strip() == "```":
        body.pop()
    return "\n".join(body).strip()


def _extract_sql(result: dict) -> str:
    """Pull the generated SQL text out of a decoded generateContent response."""
    candidates = result.get("candidates") or []
    if not candidates:
        # No candidate at all: report whatever prompt feedback the API sent.
        sys.exit(
            "Error: Gemini returned no candidates "
            f"(promptFeedback: {result.get('promptFeedback')})"
        )
    candidate = candidates[0]
    parts = (candidate.get("content") or {}).get("parts") or []
    # The answer may be split across several text parts; join them all.
    text = "".join(part.get("text", "") for part in parts)
    sql = _strip_code_fences(text)
    if not sql:
        sys.exit(
            "Error: Gemini returned no text "
            f"(finishReason: {candidate.get('finishReason')})"
        )
    return sql


def ask(question: str) -> str:
    """Ask Gemini to translate *question* into a DuckDB SQL query.

    The schema DDL read from ``SCHEMA_FILE`` is embedded in the system prompt
    and the question is sent as the user content to the ``generateContent``
    endpoint of the model named by ``MODEL``.

    Args:
        question: The natural-language question to translate.

    Returns:
        The SQL text from the first candidate, stripped of surrounding
        whitespace and of a wrapping Markdown code fence if the model added
        one despite the instructions.

    Raises:
        SystemExit: If ``GEMINI_API_KEY`` is not set, or if the response
            carries no usable text (no candidates, or an empty answer).
        requests.HTTPError: If the API answers with an error status.
        OSError: If the schema file cannot be read.
    """
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        sys.exit("Error: GEMINI_API_KEY not set")

    schema_ddl = load_schema(SCHEMA_FILE)

    system_prompt = (
        "You are a SQL expert for Base dos Dados (basedosdados.org), "
        "a Brazilian open data warehouse with tables accessed via DuckDB.\n\n"
        "Rules:\n"
        "- Use DuckDB syntax. Tables are referenced as dataset.table.\n"
        "- Only use columns from the provided DDL — never invent column names.\n"
        "- Add WHERE filters on ano, sigla_uf, or id_municipio whenever possible.\n"
        "- Return ONLY the SQL query, no explanation, no markdown fences.\n\n"
        f"Schema DDL:\n\n{schema_ddl}"
    )

    url = (
        "https://generativelanguage.googleapis.com/v1beta/models"
        f"/{MODEL}:generateContent"
    )

    payload = {
        "system_instruction": {"parts": [{"text": system_prompt}]},
        "contents": [{"parts": [{"text": question}]}],
    }

    # json= serialises the payload and sets the JSON Content-Type header.
    response = requests.post(
        url,
        headers={"X-goog-api-key": api_key},
        json=payload,
        timeout=300,
    )
    response.raise_for_status()

    return _extract_sql(response.json())
|
|
|
|
|
|
def _render_box(cols, rows) -> str:
    """Format column names and row tuples as a box-drawn table string."""
    # None is displayed as NULL; everything else uses its str() form.
    cells = [["NULL" if v is None else str(v) for v in row] for row in rows]
    widths = [
        max([len(name), *(len(line[i]) for line in cells)])
        for i, name in enumerate(cols)
    ]

    def bar(left, mid, right):
        return left + mid.join("─" * (w + 2) for w in widths) + right

    def line_of(values):
        padded = (f" {v.ljust(w)} " for v, w in zip(values, widths))
        return "│" + "│".join(padded) + "│"

    out = [bar("┌", "┬", "┐"), line_of(cols), bar("├", "┼", "┤")]
    out.extend(line_of(values) for values in cells)
    out.append(bar("└", "┴", "┘"))
    return "\n".join(out)


def main():
    """Command-line entry point: question in, SQL and result table out.

    Joins the command-line arguments into one question, asks the model for
    SQL via ``ask``, echoes the question, model and SQL to stderr, runs the
    SQL against ``DB_FILE`` opened read-only, and prints the rows to stdout
    as a box-drawn table followed by a row count.

    Raises:
        SystemExit: With status 1 when no question is given, or when DuckDB
            cannot open the database or run the query.
    """
    if len(sys.argv) < 2:
        print(f"Usage: python {sys.argv[0]} \"<pergunta em português>\"", file=sys.stderr)
        sys.exit(1)

    question = " ".join(sys.argv[1:])
    print(f"Question: {question}\n", file=sys.stderr)
    print(f"Model: {MODEL}\n", file=sys.stderr)

    sql = ask(question)

    print(f"\n── SQL ──────────────────────────────────────────\n{sql}\n", file=sys.stderr)

    # NOTE(review): the SQL is model-generated and executed as-is. read_only
    # protects the database file, but confirm whether DuckDB's file-access
    # functions should also be restricted for this use.
    try:
        # The with-block closes the connection even when the query fails.
        with duckdb.connect(DB_FILE, read_only=True) as con:
            rel = con.sql(sql)
            if rel is None:
                # Statements that produce no result set yield no relation.
                cols, rows = [], []
            else:
                cols, rows = rel.columns, rel.fetchall()
    except duckdb.Error as exc:
        sys.exit(f"Error: DuckDB could not run the query: {exc}")

    if not rows:
        print("(no rows returned)")
        return

    print(_render_box(cols, rows))
    print(f"\n{len(rows)} row(s)")
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|