"""Append a two-row sample of every non-system table to dataset_sample.txt.

The script opens the local ``basedosdados.duckdb`` database, loads the
``httpfs`` extension and applies S3 settings taken from the environment,
then walks every schema except ``main``, ``information_schema`` and
``pg_catalog``. For each table it writes a ``## schema/table/`` heading, a
``name:type`` column line and up to two sampled rows. A table that fails to
sample gets an ``[error: ...]`` entry instead, and the run continues.
"""
import os

import duckdb
from dotenv import load_dotenv

load_dotenv()

BUCKET = os.environ['HETZNER_S3_BUCKET']
ENDPOINT_URL = os.environ['HETZNER_S3_ENDPOINT']
ACCESS_KEY = os.environ['AWS_ACCESS_KEY_ID']
SECRET_KEY = os.environ['AWS_SECRET_ACCESS_KEY']

# The URL scheme is stripped before the value is handed to `SET s3_endpoint`.
s3_endpoint = ENDPOINT_URL.removeprefix('https://').removeprefix('http://')


def _sql_literal(value: str) -> str:
    """Return *value* as a single-quoted SQL string literal.

    Embedded single quotes are doubled so a credential containing ``'``
    cannot terminate the literal early.
    """
    return "'" + str(value).replace("'", "''") + "'"


def _quote_ident(name: str) -> str:
    """Return *name* as a double-quoted SQL identifier.

    Embedded double quotes are doubled, so names with dashes, spaces,
    leading digits or reserved words can be referenced safely.
    """
    return '"' + str(name).replace('"', '""') + '"'


def _list_tables(con, schema: str) -> list:
    """Return the names of all tables registered under *schema*."""
    # The schema name is bound as a parameter rather than spliced into the SQL.
    result = con.execute(
        "SELECT table_name FROM information_schema.tables WHERE table_schema = ?",
        [schema],
    )
    return [row[0] for row in result.fetchall()]


def _write_table_sample(con, out, schema: str, table: str) -> None:
    """Write a two-row sample of ``schema.table`` to *out*.

    Any failure while sampling or writing is recorded in *out* as an
    ``[error: ...]`` entry and reported on stdout; it is not re-raised, so
    one broken table does not stop the whole run.
    """
    full = f"{schema}.{table}"
    qualified = f"{_quote_ident(schema)}.{_quote_ident(table)}"
    try:
        rows = con.execute(
            f"SELECT * FROM {qualified} USING SAMPLE 2 ROWS"
        ).fetchall()
        # Each description entry starts with (column name, column type).
        cols = [f"{d[0]}:{d[1]}" for d in con.description]
        out.write(f"## {schema}/{table}/\n")
        out.write(",".join(cols) + "\n")
        for row in rows:
            out.write(",".join("" if v is None else str(v) for v in row) + "\n")
        out.write("\n")
        # Flush per table so an interrupted run keeps everything done so far.
        out.flush()
        print(f"done: {full}")
    except Exception as e:
        out.write(f"## {schema}/{table}/\n[error: {e}]\n\n")
        out.flush()
        print(f"error: {full}: {e}")


con = duckdb.connect('basedosdados.duckdb')
try:
    con.execute("LOAD httpfs;")
    con.execute(f"""
        SET s3_endpoint={_sql_literal(s3_endpoint)};
        SET s3_access_key_id={_sql_literal(ACCESS_KEY)};
        SET s3_secret_access_key={_sql_literal(SECRET_KEY)};
        SET s3_url_style='path';
        SET enable_object_cache=true;
        SET threads=4;
        SET memory_limit='6GB';
    """)

    schemas = [row[0] for row in con.execute(
        "SELECT schema_name FROM information_schema.schemata "
        "WHERE schema_name NOT IN ('main', 'information_schema', 'pg_catalog')"
    ).fetchall()]

    try:
        # Append mode: earlier runs are kept and a new header is added each run.
        # An explicit encoding avoids platform-default failures on non-ASCII data.
        with open("dataset_sample.txt", "a", encoding="utf-8") as f:
            f.write("# Dataset samples\n\n")
            for schema in sorted(schemas):
                for table in sorted(_list_tables(con, schema)):
                    _write_table_sample(con, f, schema, table)
    except KeyboardInterrupt:
        print("\nCancelled.")
finally:
    # Close the connection on every exit path, not only on success or Ctrl-C.
    con.close()