feat: add LLM SQL query assistant and dataset sampler
- ask.py: Python script to query Base dos Dados via natural language using Gemini; generates and executes DuckDB SQL from Portuguese questions
- ask/ (Rust): CLI companion for the SQL query assistant, with system prompt
- sample_datasets.py: samples parquet files from S3 into a local DuckDB for exploration
- sample_datasets/ (Rust): CLI for dataset sampling
- context/: LLM context bundle (schemas, join keys, file tree) for query generation
sample_datasets/Cargo.lock (generated, new file, 2699 lines): diff suppressed because it is too large
sample_datasets/Cargo.toml (new file, 15 lines)
@@ -0,0 +1,15 @@
[package]
name = "sample_datasets"
version = "0.1.0"
edition = "2021"

[[bin]]
name = "sample_datasets"
path = "src/main.rs"

[dependencies]
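# arrow is pinned exactly; assumption: it must match the arrow version that
# the duckdb crate's Arrow interface (query_arrow) is built against.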
duckdb = { version = "1", features = ["bundled"] }
arrow = { version = "=58.0.0", features = ["prettyprint"] }
dotenvy = "0.15"
ctrlc = "3"
anyhow = "1"
sample_datasets/src/main.rs (new file, 144 lines)
@@ -0,0 +1,144 @@
use duckdb::Connection;
use std::fs::OpenOptions;
use std::io::{BufWriter, Write};
use std::sync::{Arc, Mutex};

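/// Strip a leading `http://` or `https://`: DuckDB's `s3_endpoint` setting
/// expects a bare `host[:port]`, not a full URL.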
fn strip_scheme(url: &str) -> &str {
    url.strip_prefix("https://")
        .or_else(|| url.strip_prefix("http://"))
        .unwrap_or(url)
}

fn main() -> anyhow::Result<()> {
    dotenvy::dotenv().ok();

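    // S3 endpoint and credentials come from the environment (optionally
    // loaded from a local .env above).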
    let endpoint_url = std::env::var("HETZNER_S3_ENDPOINT")?;
    let access_key = std::env::var("AWS_ACCESS_KEY_ID")?;
    let secret_key = std::env::var("AWS_SECRET_ACCESS_KEY")?;
    let s3_endpoint = strip_scheme(&endpoint_url).to_owned();

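    // Open (or create) the local database file and configure httpfs so
    // remote parquet reads go through the S3-compatible endpoint.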
    let con = Connection::open("basedosdados.duckdb")?;
    con.execute_batch("INSTALL httpfs; LOAD httpfs;")?;
    con.execute_batch(&format!(
        "SET s3_endpoint='{s3_endpoint}';
         SET s3_access_key_id='{access_key}';
         SET s3_secret_access_key='{secret_key}';
         SET s3_url_style='path';
         SET enable_object_cache=true;
         SET threads=4;
         SET memory_limit='6GB';"
    ))?;

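    // Append samples to a plain-text file; the writer is shared behind a
    // mutex so the Ctrl-C handler below can flush it as well.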
    let file = OpenOptions::new()
        .create(true)
        .append(true)
        .open("dataset_sample.txt")?;
    let out = Arc::new(Mutex::new(BufWriter::new(file)));

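    // On Ctrl-C, flush whatever has been buffered so a partial run still
    // leaves usable output behind, then exit.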
    let out_ctrlc = out.clone();
    ctrlc::set_handler(move || {
        eprintln!("\nCancelled.");
        if let Ok(mut w) = out_ctrlc.lock() {
            let _ = w.flush();
        }
        std::process::exit(0);
    })?;

    writeln!(out.lock().unwrap(), "# Dataset samples with Headers as column_name:column_type\n")?;

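    // Every user schema in the database is a dataset; skip DuckDB's default
    // and system catalogs.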
    let mut schemas: Vec<String> = {
        let mut stmt = con.prepare(
            "SELECT schema_name FROM information_schema.schemata \
             WHERE schema_name NOT IN ('main', 'information_schema', 'pg_catalog')"
        )?;
        stmt.query_map([], |row| row.get(0))?
            .filter_map(|r| r.ok())
            .collect()
    };
    schemas.sort();

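    // Visit schemas, then tables within each schema, in sorted order so the
    // output is deterministic across runs.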
    for schema in &schemas {
        let mut tables: Vec<String> = {
            let mut stmt = con.prepare(
                "SELECT table_name FROM information_schema.tables WHERE table_schema = ?"
            )?;
            stmt.query_map([schema], |row| row.get(0))?
                .filter_map(|r| r.ok())
                .collect()
        };
        tables.sort();

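        // Sample each table inside an immediately-invoked closure so one bad
        // table is reported and skipped instead of aborting the whole run.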
        for table in &tables {
            let full = format!("{schema}.{table}");
            let result = (|| -> anyhow::Result<()> {
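                // DESCRIBE the table and keep quoted names of the columns we
                // can render; GEOMETRY columns are excluded, presumably
                // because they don't survive the Arrow text rendering below.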
                let safe_cols: Vec<String> = {
                    let mut desc = con.prepare(&format!("DESCRIBE {full}"))?;
                    desc.query_map([], |row| {
                        let name: String = row.get(0)?;
                        let col_type: String = row.get(1)?;
                        Ok((name, col_type))
                    })?
                    .filter_map(|r| r.ok())
                    .filter(|(_, t)| !t.to_uppercase().contains("GEOMETRY"))
                    .map(|(n, _)| format!("\"{n}\""))
                    .collect()
                };

                if safe_cols.is_empty() {
                    return Ok(());
                }

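                // Pull a tiny random sample rather than scanning the table.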
                let col_list = safe_cols.join(", ");
                let mut stmt = con.prepare(&format!(
                    "SELECT {col_list} FROM {full} USING SAMPLE 2 ROWS"
                ))?;

                let batches: Vec<_> = stmt.query_arrow([])?.collect();

                if batches.is_empty() {
                    return Ok(());
                }

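                // Header line in column_name:column_type form, taken from the
                // Arrow schema of the first batch.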
                let arrow_schema = batches[0].schema();
                let header: Vec<String> = arrow_schema.fields().iter()
                    .map(|f| format!("{}:{}", f.name(), f.data_type()))
                    .collect();

                let mut w = out.lock().unwrap();
                writeln!(w, "## {schema}/{table}/")?;
                writeln!(w, "{}", header.join(","))?;

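                // Emit each sampled row as comma-separated values; NULLs
                // become empty fields.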
                for batch in &batches {
                    for row_idx in 0..batch.num_rows() {
                        let vals: Vec<String> = batch.columns().iter().map(|col| {
                            use arrow::array::Array;
                            if col.is_null(row_idx) {
                                return String::new();
                            }
                            arrow::util::display::array_value_to_string(col, row_idx)
                                .unwrap_or_default()
                        }).collect();
                        writeln!(w, "{}", vals.join(","))?;
                    }
                }

                writeln!(w)?;
                w.flush()?;
                Ok(())
            })();

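            // Record failures in the output file too, so the sample dump
            // shows which tables could not be sampled.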
            match result {
                Ok(_) => println!("done: {full}"),
                Err(e) => {
                    let mut w = out.lock().unwrap();
                    writeln!(w, "## {schema}/{table}/")?;
                    writeln!(w, "[error: {e}]\n")?;
                    let _ = w.flush();
                    eprintln!("error: {full}: {e}");
                }
            }
        }
    }

    Ok(())
}