Compare commits

..

4 Commits

Author SHA1 Message Date
ed81e52254 docs: reorder README for data users, remove unused files (xdg-open, gera_schemas.py, open_gui.sh, docs/) 2026-03-26 12:01:46 +01:00
5239a03ea8 docs: expand /query curl usage, remove outdated UI references 2026-03-26 11:58:14 +01:00
41e7f7a972 replace duckdb-ui with ttyd shell: add /query HTTP endpoint, fix utf-8/locale, region config
- swap DuckDB UI for ttyd web terminal (--writable, -readonly db)
- add POST /query endpoint with X-Password auth for curl-based SQL execution
- fix UTF-8 rendering: set LANG/LC_ALL=C.UTF-8 in container
- pass BUCKET_REGION env var for correct S3 signing region
- simplify start.sh: drop Xvfb, views.duckdb generation, blocking duckdb -ui
- add less, ncurses-bin to Dockerfile for proper pager/terminal support
- update Caddyfile: single route to ttyd with flush_interval -1 for websocket
- update README to reflect current architecture and document /query usage
- remove duckdb-ui.service, schemas.json, file_tree.md (generated artifacts)
2026-03-26 11:54:46 +01:00
cd94603fac update haloy config: use hostname for server, fix env var format 2026-03-25 13:39:15 +01:00
14 changed files with 317 additions and 148873 deletions

View File

@@ -12,3 +12,6 @@ S3_CONCURRENCY=""
PARALLEL_UPLOADS=""
AWS_ACCESS_KEY_ID=""
AWS_SECRET_ACCESS_KEY=""
# Caddy serving duckdb
BASIC_AUTH_PASSWORD=""

2
.gitignore vendored
View File

@@ -3,3 +3,5 @@
logs/
done_tables.txt
done_transfers.txt
# CocoIndex Code (ccc)
/.cocoindex_code/

Caddyfile

View File

@@ -1,7 +1,24 @@
:8080 {
	basic_auth /* {
		# Set BASIC_AUTH_HASH on the server: caddy hash-password --plaintext 'YOUR_PWD'
		admin {env.BASIC_AUTH_HASH}
	handle /health {
		respond 200
	}
	reverse_proxy localhost:4213
	handle /login {
		reverse_proxy 127.0.0.1:8081
	}
	handle /query {
		reverse_proxy 127.0.0.1:8081
	}
	handle {
		forward_auth 127.0.0.1:8081 {
			uri /auth
			copy_headers Cookie
		}
		reverse_proxy localhost:7681 {
			flush_interval -1
		}
	}
}
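
For context: `forward_auth` lets a request through to ttyd only when the auth service answers `GET /auth` with a 2xx; any other response (here, the 302 to `/login`) is copied back to the client. A quick check against a local instance (sketch; assumes the stack is running on localhost):

```bash
# no ddb_auth cookie yet: the gate's 302 redirect to /login is passed back
curl -i http://localhost:8080/

# /health is answered by Caddy itself, no auth round-trip involved
curl -i http://localhost:8080/health
```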

Dockerfile

View File

@@ -2,19 +2,30 @@ FROM debian:12-slim
RUN apt-get update -qq && \
    apt-get install -y --no-install-recommends \
        curl ca-certificates unzip && \
    curl -fsSL https://caddyserver.com/install.sh | bash && \
        curl ca-certificates unzip bsdmainutils python3 \
        less ncurses-bin && \
    curl -fsSL \
        "https://github.com/duckdb/duckdb/releases/latest/download/duckdb_cli-linux-amd64.zip" \
        "https://github.com/caddyserver/caddy/releases/download/v2.9.1/caddy_2.9.1_linux_amd64.tar.gz" \
        | tar -xz -C /usr/local/bin caddy && \
    chmod +x /usr/local/bin/caddy && \
    curl -fsSL \
        "https://github.com/duckdb/duckdb/releases/download/v1.5.1/duckdb_cli-linux-amd64.zip" \
        -o /tmp/duckdb.zip && \
    unzip /tmp/duckdb.zip -d /usr/local/bin && \
    chmod +x /usr/local/bin/duckdb && \
    rm /tmp/duckdb.zip && \
    apt-get clean && rm -rf /var/lib/apt/lists/*
    apt-get clean && rm -rf /var/lib/apt/lists/* && \
    duckdb :memory: "INSTALL httpfs;" && \
    curl -fsSL "https://github.com/tsl0922/ttyd/releases/latest/download/ttyd.x86_64" \
        -o /usr/local/bin/ttyd && \
    chmod +x /usr/local/bin/ttyd
ENV LANG=C.UTF-8 \
    LC_ALL=C.UTF-8
WORKDIR /app
COPY basedosdados.duckdb Caddyfile start.sh ./
COPY basedosdados.duckdb Caddyfile start.sh auth.py ./
RUN chmod +x start.sh
EXPOSE 8080
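
A minimal local build-and-run sketch (the image tag and the explicit `./start.sh` command are assumptions; in production haloy builds and runs this image):

```bash
docker build -t basedosdados-db .
docker run --rm -p 8080:8080 \
  -e HETZNER_S3_ENDPOINT -e BUCKET_REGION \
  -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY \
  -e BASIC_AUTH_PASSWORD \
  basedosdados-db ./start.sh
```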

226
README.md
View File

@@ -1,80 +1,126 @@
# baseldosdados
Complete mirror of the public tables from the [Base dos Dados](https://basedosdados.org/) project: 533 tables, ~675 GB in Parquet+zstd, hosted on Hetzner Object Storage and accessible via DuckDB.
Complete mirror of the public tables from the [Base dos Dados](https://basedosdados.org/) project: 533 tables, ~675 GB in Parquet+zstd.
## What's here
The data was exported from BigQuery to Hetzner Object Storage (Helsinki) in Parquet format with zstd compression, organized by dataset and table. Access goes directly against the files via DuckDB, with nothing to import locally: queries read the parquet files from S3 on demand.
| Script | Purpose |
|---|---|
| `roda.sh` | Exports BigQuery → GCS → Hetzner S3 (main pipeline) |
| `prepara_gui.py` | Creates DuckDB views over the S3 parquet files for local exploration |
| `gera_schemas.py` | Generates `schemas.json` and `file_tree.md` with metadata for all parquet files |
---
**Generated files:**
## Querying the data
| File | Description |
|---|---|
| `schemas.json` | Complete schema for all 533 tables (columns, types, sizes) |
| `file_tree.md` | Tree of the S3 bucket with sizes and file counts |
| `basedosdados.duckdb` | DuckDB database with views for all tables (generated by `prepara_gui.py`) |
| `all_tables.txt` | Full list of discovered tables |
| `done_tables.txt` | Tables successfully exported to GCS |
| `done_transfers.txt` | Datasets successfully transferred to S3 |
| `failed_tables.txt` | Tables that failed after 3 attempts |
Access via browser or curl, password-protected. Ask the administrator for the password.
## Export flow
### Shell in the browser
Go to **https://db.xn--2dk.xyz** → authenticate → an interactive DuckDB shell right in the browser.
### SQL via curl
`POST /query` endpoint: SQL in the body, result as plain text:
```bash
# Inline query
curl -s -X POST https://db.xn--2dk.xyz/query \
  -H "X-Password: <password>" \
  --data-binary "SELECT count(*) FROM br_anatel_banda_larga_fixa.densidade_brasil"
# From a .sql file
curl -s -X POST https://db.xn--2dk.xyz/query \
  -H "X-Password: <password>" \
  --data-binary @minha_query.sql
# Heredoc (handy in scripts)
curl -s -X POST https://db.xn--2dk.xyz/query \
  -H "X-Password: <password>" \
  --data-binary @- << 'SQL'
SELECT sigla_uf, sum(densidade) AS total
FROM br_anatel_banda_larga_fixa.densidade_uf
WHERE ano = 2023
GROUP BY 1
ORDER BY 2 DESC
SQL
# Save the result to a file
curl -s -X POST https://db.xn--2dk.xyz/query \
  -H "X-Password: <password>" \
  --data-binary @query.sql > resultado.csv
```
### Discovering tables
```sql
-- list all datasets (schemas)
SHOW SCHEMAS;
-- list the tables in a dataset
SHOW TABLES IN br_anatel_banda_larga_fixa;
-- inspect a table's columns
DESCRIBE br_anatel_banda_larga_fixa.densidade_brasil;
```
In the browser shell, `.tables` lists everything at once.
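
The same discovery statements also work through `/query`, e.g.:

```bash
curl -s -X POST https://db.xn--2dk.xyz/query \
  -H "X-Password: <password>" \
  --data-binary "SHOW SCHEMAS;"
```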
### Exporting as CSV or JSON
DuckDB can format the output directly in the query:
```sql
-- CSV with header (pipe to a file via curl)
COPY (SELECT * FROM br_ibge_censo_2022.municipio LIMIT 1000)
TO '/dev/stdout' (FORMAT csv, HEADER true);
-- JSON
COPY (SELECT * FROM br_ibge_censo_2022.municipio LIMIT 10)
TO '/dev/stdout' (FORMAT json);
```
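
Combining the two: send the `COPY` through `/query` and redirect the output to a local file:

```bash
curl -s -X POST https://db.xn--2dk.xyz/query \
  -H "X-Password: <password>" \
  --data-binary "COPY (SELECT * FROM br_ibge_censo_2022.municipio LIMIT 1000) TO '/dev/stdout' (FORMAT csv, HEADER true)" \
  > municipio.csv
```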
---
## Local exploration
To run the queries on your own machine with DuckDB installed:
```bash
python prepara_db.py   # generates basedosdados.duckdb with views pointing at S3
duckdb basedosdados.duckdb
```
Requires the S3 credentials in `.env` (see the configuration section below).
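
A sketch of a local session; the `SET` statements mirror what `start.sh` writes into its init file, with values taken from your `.env`:

```bash
duckdb basedosdados.duckdb <<'SQL'
INSTALL httpfs; LOAD httpfs;
SET s3_endpoint='<HETZNER_S3_ENDPOINT without the scheme>';
SET s3_access_key_id='<AWS_ACCESS_KEY_ID>';
SET s3_secret_access_key='<AWS_SECRET_ACCESS_KEY>';
SET s3_url_style='path';
SELECT count(*) FROM br_ibge_populacao.municipio;
SQL
```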
---
## Export pipeline
> Section for maintainers; not needed for querying the data.
### Flow
```
BigQuery (basedosdados) → GCS (Parquet + zstd) → Hetzner Object Storage (rclone)
```
1. Automatically discovers every dataset and table via the BigQuery API
2. Exports all tables in parallel in Parquet format with zstd compression
2. Exports in parallel in Parquet format with zstd compression
3. Transfers GCS → Hetzner Object Storage via rclone (direct streaming, no local disk)
4. Verifies the file count between GCS and S3
5. Offers the option to delete the GCS bucket at the end
4. Verifies file counts between GCS and S3
The script supports **automatic resume**: if interrupted, just run it again; tables and transfers already completed are skipped.
Automatic resume: if interrupted, just run it again.
## Data layout in S3
### Scripts
```
s3://<HETZNER_S3_BUCKET>/
└── <dataset>/
    └── <table>/
        └── *.parquet
```
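
Since the layout is plain `dataset/table/*.parquet`, a table can also be read straight from its prefix without the views, assuming the same S3 session settings as above are saved in a file (the `s3_init.sql` name here is hypothetical):

```bash
duckdb -init s3_init.sql -c \
  "SELECT * FROM read_parquet('s3://<HETZNER_S3_BUCKET>/br_ibge_pib/municipio/*.parquet') LIMIT 5"
```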
| Script | Purpose |
|---|---|
| `roda.sh` | Main export pipeline |
| `prepara_db.py` | Generates `basedosdados.duckdb` with views for all tables |
## Prerequisites
**Export (`roda.sh`), run locally:**
- `google-cloud-sdk` (`bq`, `gcloud`, `gsutil`)
- `parallel` (GNU parallel)
- `rclone`
- `flock`
**Running on a VM (`--gcloud-run`):** only `gcloud` locally; dependencies are installed automatically on the VM.
**Python scripts** (`prepara_gui.py`, `gera_schemas.py`):
- `duckdb`, `pyarrow`, `boto3`, `s3fs`, `python-dotenv`
GCP authentication (once, before exporting):
```bash
gcloud auth login
gcloud auth application-default login
gcloud config set project YOUR_PROJECT_ID
```
## Configuration
Create a `.env` file at the repo root:
### Configuration (`.env`)
| Variable | Description |
|---|---|
| `YOUR_PROJECT` | Your GCP project ID (for billing) |
| `YOUR_PROJECT` | GCP project ID (for billing) |
| `BUCKET_NAME` | Name of the intermediate GCS bucket |
| `BUCKET_REGION` | Bucket region; must be `US` |
| `BUCKET_REGION` | S3 bucket region (e.g. `eu-central`) |
| `SOURCE_PROJECT` | Source project (`basedosdados`) |
| `PARALLEL_EXPORTS` | Parallel BigQuery export jobs (default: 8) |
| `HETZNER_S3_BUCKET` | Bucket name on Hetzner Object Storage |
@@ -83,74 +129,36 @@ Crie um arquivo `.env` na raiz:
| `PARALLEL_UPLOADS` | Datasets uploaded in parallel (default: 4) |
| `AWS_ACCESS_KEY_ID` | Hetzner Object Storage access key |
| `AWS_SECRET_ACCESS_KEY` | Hetzner Object Storage secret key |
| `BASIC_AUTH_PASSWORD` | Password for the web shell and the `/query` endpoint |
## Usage
### Running
```bash
# Export
chmod +x roda.sh
./roda.sh --dry-run      # estimates size and cost before running
./roda.sh --dry-run      # estimates size and cost
./roda.sh                # local run
./roda.sh --gcloud-run   # creates a VM on GCP, runs there, deletes the VM at the end
# Exploring via DuckDB
python prepara_gui.py    # creates basedosdados.duckdb with views for all tables
duckdb --ui basedosdados.duckdb
# Schema dump
python gera_schemas.py   # generates schemas.json and file_tree.md (~21 MB of egress)
./roda.sh --gcloud-run   # creates a VM on GCP, runs there, deletes it at the end
```
## Server with a password-protected UI
To expose the DuckDB UI on a server with HTTPS and basic auth, use [Caddy](https://caddyserver.com/) as a reverse proxy.
**Server prerequisites:** `caddy`, `htpasswd` (`apache2-utils` package), `duckdb`
**1. Install the DuckDB UI service**
Edit `duckdb-ui.service` with the correct user and path, then copy it into systemd:
GCP authentication is required before the first export:
```bash
# edit User=, WorkingDirectory= and EnvironmentFile= in the file
cp duckdb-ui.service /etc/systemd/system/
systemctl daemon-reload
systemctl enable --now duckdb-ui
gcloud auth login
gcloud auth application-default login
gcloud config set project YOUR_PROJECT_ID
```
**2. Configure Caddy**
#### `--gcloud-run`
Edit `Caddyfile`, replacing `your.domain.com` with the real domain, then:
```bash
cp Caddyfile /etc/caddy/Caddyfile
systemctl reload caddy
```
Caddy obtains the TLS certificate from Let's Encrypt automatically (ports 80 and 443 open in the firewall).
**Changing the password:**
```bash
htpasswd -nbB -C 10 admin NEW_PASSWORD | cut -d: -f2 | base64
# paste the result into the Caddyfile in place of the current hash, then:
systemctl reload caddy
```
**Relevant files:**
| File | Purpose |
|---|---|
| `Caddyfile` | Caddy config: HTTPS + basicauth → proxy to localhost:4213 |
| `duckdb-ui.service` | systemd unit that runs the DuckDB UI in the background |
---
### `--gcloud-run`
Creates an `e2-standard-4` Debian 12 VM in `us-central1-a`, copies the script and the `.env`, installs the dependencies and runs it over SSH. Optional variables:
Creates an `e2-standard-4` Debian 12 VM in `us-central1-a`, copies the script and the `.env`, installs dependencies and runs it over SSH.
| Variable | Default | Description |
|---|---|---|
| `GCP_VM_NAME` | `bd-export-vm` | Instance name |
| `GCP_VM_ZONE` | `us-central1-a` | Compute Engine zone |
### Server deploy
```bash
haloy deploy
```

96
auth.py Normal file
View File

@@ -0,0 +1,96 @@
#!/usr/bin/env python3
"""Minimal cookie-session auth gate for DuckDB shell."""
import hmac, hashlib, os, secrets, subprocess, time
from http.server import HTTPServer, BaseHTTPRequestHandler
from urllib.parse import parse_qs

PASSWORD = os.environ.get('BASIC_AUTH_PASSWORD', '').encode()
_SECRET = secrets.token_bytes(32)

def _make_token():
    day = str(int(time.time()) // 86400)
    return hmac.new(_SECRET, day.encode(), hashlib.sha256).hexdigest()

def _valid(token):
    if not token:
        return False
    for delta in (0, 1):
        day = str(int(time.time()) // 86400 - delta)
        expected = hmac.new(_SECRET, day.encode(), hashlib.sha256).hexdigest()
        if hmac.compare_digest(token, expected):
            return True
    return False

LOGIN_HTML = """<!DOCTYPE html>
<html><head><title>DB Shell</title><style>
body{display:flex;justify-content:center;align-items:center;min-height:100vh;margin:0;background:#0f1117;font-family:sans-serif}
form{background:#1a1d27;padding:2rem;border-radius:8px;display:flex;flex-direction:column;gap:1rem;min-width:280px}
h2{color:#fff;margin:0}
input{padding:.6rem;border-radius:4px;border:1px solid #333;background:#0f1117;color:#fff;font-size:1rem}
button{padding:.6rem;border-radius:4px;border:none;background:#f4c543;color:#000;font-size:1rem;cursor:pointer;font-weight:600}
</style></head>
<body><form method="POST" action="/login">
<h2>DB Shell</h2>
<input type="password" name="password" placeholder="Password" autofocus>
<button type="submit">Enter</button>
</form></body></html>""".encode()

class H(BaseHTTPRequestHandler):
    def _cookie(self):
        for part in self.headers.get('Cookie', '').split(';'):
            part = part.strip()
            if part.startswith('ddb_auth='):
                return part[9:]
        return ''

    def do_GET(self):
        if self.path == '/auth':
            if _valid(self._cookie()):
                self._resp(200)
            else:
                self.send_response(302)
                self.send_header('Location', '/login')
                self.end_headers()
        else:
            self._resp(200, LOGIN_HTML, 'text/html; charset=utf-8')

    def do_POST(self):
        if self.path == '/query':
            pwd = self.headers.get('X-Password', '').encode()
            if not hmac.compare_digest(pwd, PASSWORD):
                self._resp(401, b'Unauthorized\n')
                return
            sql = self.rfile.read(int(self.headers.get('Content-Length', 0))).decode(errors='replace')
            try:
                r = subprocess.run(
                    ['duckdb', '-readonly', '--init', '/app/ssh_init.sql', '/app/basedosdados.duckdb'],
                    input=sql, capture_output=True, text=True, timeout=120
                )
                out = (r.stdout + r.stderr).encode()
            except subprocess.TimeoutExpired:
                out = b'timeout\n'
            self._resp(200, out, 'text/plain; charset=utf-8')
            return
        body = self.rfile.read(int(self.headers.get('Content-Length', 0))).decode(errors='replace')
        pwd = parse_qs(body).get('password', [''])[0].encode()
        if hmac.compare_digest(pwd, PASSWORD):
            self.send_response(302)
            self.send_header('Set-Cookie', f'ddb_auth={_make_token()}; Path=/; HttpOnly; SameSite=Strict')
            self.send_header('Location', '/')
            self.end_headers()
        else:
            self._resp(200, LOGIN_HTML, 'text/html; charset=utf-8')

    def _resp(self, code, body=b'', ct='text/plain'):
        self.send_response(code)
        if body:
            self.send_header('Content-Type', ct)
            self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        if body:
            self.wfile.write(body)

    def log_message(self, *_):
        pass

HTTPServer(('127.0.0.1', 8081), H).serve_forever()
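
The gate can be exercised directly on its loopback port (paths, port, and cookie name as in the code above):

```bash
# no valid session cookie: /auth answers with a 302 to /login
curl -i http://127.0.0.1:8081/auth

# successful form login: 302 to / plus a Set-Cookie: ddb_auth=... header
curl -i --data 'password=<password>' http://127.0.0.1:8081/login

# /query authenticates with the X-Password header, no cookie required
curl -s -X POST http://127.0.0.1:8081/query \
  -H "X-Password: <password>" \
  --data-binary "SELECT 42;"
```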

duckdb-ui.service

View File

@@ -1,16 +0,0 @@
[Unit]
Description=DuckDB UI - basedosdados explorer
After=network.target
[Service]
Type=simple
User=YOUR_USER
WorkingDirectory=/path/to/baseldosdados
ExecStartPre=/usr/bin/python3 prepara_gui.py
ExecStart=/usr/bin/duckdb --ui basedosdados.duckdb
Restart=on-failure
RestartSec=5s
EnvironmentFile=/path/to/baseldosdados/.env
[Install]
WantedBy=multi-user.target

file_tree.md

View File

@@ -1,870 +0,0 @@
# S3 File Tree: baseldosdados
## br_anatel_banda_larga_fixa/ (4 tables, 206.1 MB, 119 files)
- **densidade_brasil/** (1 files, 2.9 KB, 3 cols)
- **densidade_municipio/** (2 files, 10.3 MB, 5 cols)
- **densidade_uf/** (1 files, 50.8 KB, 4 cols)
- **microdados/** (115 files, 195.8 MB, 12 cols)
## br_anatel_indice_brasileiro_conectividade/ (1 tables, 443.4 KB, 1 files)
- **municipio/** (1 files, 443.4 KB, 11 cols)
## br_anp_precos_combustiveis/ (1 tables, 79.0 MB, 69 files)
- **microdados/** (69 files, 79.0 MB, 14 cols)
## br_ans_beneficiario/ (1 tables, 8.3 GB, 3573 files)
- **informacao_consolidada/** (3573 files, 8.3 GB, 22 cols)
## br_bcb_estban/ (3 tables, 2.3 GB, 2207 files)
- **agencia/** (1524 files, 1.4 GB, 9 cols)
- **dicionario/** (1 files, 2.9 KB, 5 cols)
- **municipio/** (682 files, 894.3 MB, 10 cols)
## br_bcb_sicor/ (11 tables, 19.6 GB, 2056 files)
- **dicionario/** (1 files, 9.3 KB, 5 cols)
- **empreendimento/** (1 files, 51.6 KB, 15 cols)
- **liberacao/** (65 files, 155.7 MB, 8 cols)
- **operacao/** (127 files, 521.6 MB, 53 cols)
- **operacoes_desclassificadas/** (1 files, 160.8 KB, 8 cols)
- **recurso_publico_complemento_operacao/** (44 files, 244.7 MB, 8 cols)
- **recurso_publico_cooperado/** (1 files, 1.2 MB, 9 cols)
- **recurso_publico_gleba/** (61 files, 3.8 GB, 7 cols)
- **recurso_publico_mutuario/** (29 files, 297.1 MB, 10 cols)
- **recurso_publico_propriedade/** (15 files, 491.1 MB, 9 cols)
- **saldo/** (1711 files, 14.2 GB, 10 cols)
## br_bd_diretorios_brasil/ (23 tables, 141.2 MB, 357 files)
- **area_conhecimento/** (1 files, 21.1 KB, 8 cols)
- **cbo_1994/** (1 files, 41.3 KB, 2 cols)
- **cbo_2002/** (1 files, 74.3 KB, 11 cols)
- **cep/** (1 files, 119.0 MB, 8 cols)
- **cid_10/** (1 files, 260.7 KB, 9 cols)
- **cid_9/** (1 files, 16.8 KB, 2 cols)
- **cnae_1/** (1 files, 23.4 KB, 8 cols)
- **cnae_2/** (1 files, 57.8 KB, 14 cols)
- **curso_superior/** (1 files, 6.2 KB, 5 cols)
- **distrito_1991/** (1 files, 126.6 KB, 4 cols)
- **distrito_2000/** (1 files, 145.4 KB, 4 cols)
- **distrito_2010/** (1 files, 150.1 KB, 4 cols)
- **empresa/** (335 files, 1.5 MB, 32 cols)
- **escola/** (1 files, 12.8 MB, 19 cols)
- **etnia_indigena/** (1 files, 5.4 KB, 2 cols)
- **instituicao_ensino_superior/** (1 files, 118.3 KB, 7 cols)
- **municipio/** (1 files, 328.0 KB, 27 cols)
- **natureza_juridica/** (1 files, 16.5 KB, 3 cols)
- **regiao/** (1 files, 637.0 B, 2 cols)
- **setor_censitario_2010/** (1 files, 1.6 MB, 14 cols)
- **setor_censitario_2022/** (1 files, 4.9 MB, 22 cols)
- **subatividade_ibge/** (1 files, 7.1 KB, 2 cols)
- **uf/** (1 files, 1.6 KB, 4 cols)
## br_bd_diretorios_mundo/ (4 tables, 1.4 MB, 4 files)
- **continente/** (1 files, 1.0 KB, 3 cols)
- **nomenclatura_comum_mercosul/** (1 files, 790.3 KB, 14 cols)
- **pais/** (1 files, 19.1 KB, 13 cols)
- **sistema_harmonizado/** (1 files, 655.9 KB, 16 cols)
## br_bd_metadados/ (2 tables, 14.7 MB, 2 files)
- **bigquery_tables/** (1 files, 68.6 KB, 10 cols)
- **prefect_flow_runs/** (1 files, 14.6 MB, 16 cols)
## br_camara_dados_abertos/ (28 tables, 267.7 MB, 222 files)
- **deputado/** (1 files, 277.8 KB, 12 cols)
- **deputado_ocupacao/** (1 files, 587.2 KB, 6 cols)
- **deputado_profissao/** (1 files, 58.0 KB, 5 cols)
- **despesa/** (22 files, 125.4 MB, 25 cols)
- **evento/** (1 files, 7.2 MB, 11 cols)
- **evento_orgao/** (1 files, 322.0 KB, 3 cols)
- **evento_presenca_deputado/** (5 files, 6.0 MB, 4 cols)
- **evento_requerimento/** (1 files, 287.1 KB, 3 cols)
- **frente/** (1 files, 183.9 KB, 10 cols)
- **frente_deputado/** (1 files, 767.1 KB, 5 cols)
- **funcionario/** (1 files, 316.7 KB, 10 cols)
- **legislatura/** (1 files, 2.8 KB, 5 cols)
- **legislatura_mesa/** (1 files, 7.9 KB, 13 cols)
- **licitacao/** (1 files, 327.5 KB, 18 cols)
- **licitacao_contrato/** (1 files, 198.6 KB, 19 cols)
- **licitacao_item/** (1 files, 6.9 MB, 21 cols)
- **licitacao_pedido/** (1 files, 597.0 KB, 11 cols)
- **licitacao_proposta/** (1 files, 738.5 KB, 13 cols)
- **orgao/** (1 files, 176.6 KB, 11 cols)
- **orgao_deputado/** (1 files, 511.2 KB, 9 cols)
- **proposicao_autor/** (5 files, 8.9 MB, 8 cols)
- **proposicao_microdados/** (81 files, 72.5 MB, 25 cols)
- **proposicao_tema/** (80 files, 2.1 MB, 6 cols)
- **votacao/** (1 files, 10.4 MB, 17 cols)
- **votacao_objeto/** (3 files, 8.7 MB, 10 cols)
- **votacao_orientacao_bancada/** (1 files, 335.7 KB, 5 cols)
- **votacao_parlamentar/** (5 files, 10.8 MB, 9 cols)
- **votacao_proposicao/** (1 files, 3.1 MB, 10 cols)
## br_ce_fortaleza_sefin_iptu/ (1 tables, 1.5 MB, 1 files)
- **face_quadra/** (1 files, 1.5 MB, 13 cols)
## br_cgu_beneficios_cidadao/ (6 tables, 61.4 GB, 6751 files)
- **auxilio_brasil/** (643 files, 3.0 GB, 10 cols)
- **auxilio_emergencial/** (1426 files, 5.9 GB, 14 cols)
- **bolsa_familia_pagamento/** (1479 files, 25.8 GB, 10 cols)
- **bpc/** (1667 files, 15.9 GB, 15 cols)
- **garantia_safra/** (92 files, 443.9 MB, 7 cols)
- **novo_bolsa_familia/** (1444 files, 10.4 GB, 10 cols)
## br_cgu_cartao_pagamento/ (4 tables, 40.2 MB, 30 files)
- **dicionario/** (1 files, 2.9 KB, 5 cols)
- **microdados_compras_centralizadas/** (12 files, 7.5 MB, 14 cols)
- **microdados_defesa_civil/** (1 files, 1.3 MB, 20 cols)
- **microdados_governo_federal/** (16 files, 31.5 MB, 15 cols)
## br_cgu_dados_abertos/ (3 tables, 6.9 MB, 3 files)
- **conjunto/** (1 files, 1.4 MB, 13 cols)
- **organizacao/** (1 files, 119.7 KB, 9 cols)
- **recurso/** (1 files, 5.3 MB, 11 cols)
## br_cgu_emendas_parlamentares/ (1 tables, 2.2 MB, 1 files)
- **microdados/** (1 files, 2.2 MB, 25 cols)
## br_cgu_licitacao_contrato/ (8 tables, 2.6 GB, 553 files)
- **contrato_apostilamento/** (1 files, 532.2 KB, 14 cols)
- **contrato_compra/** (1 files, 35.5 MB, 26 cols)
- **contrato_item/** (15 files, 65.9 MB, 12 cols)
- **contrato_termo_aditivo/** (1 files, 17.2 MB, 12 cols)
- **licitacao/** (22 files, 104.1 MB, 19 cols)
- **licitacao_empenho/** (39 files, 418.5 MB, 12 cols)
- **licitacao_item/** (66 files, 392.4 MB, 16 cols)
- **licitacao_participante/** (408 files, 1.6 GB, 15 cols)
## br_cgu_orcamento_publico/ (1 tables, 8.7 MB, 1 files)
- **orcamento/** (1 files, 8.7 MB, 26 cols)
## br_cgu_receitas_publicas/ (1 tables, 15.0 MB, 16 files)
- **receitas/** (16 files, 15.0 MB, 16 cols)
## br_cgu_servidores_executivo_federal/ (7 tables, 14.8 GB, 2993 files)
- **afastamentos/** (27 files, 72.7 MB, 8 cols)
- **cadastro_aposentados/** (133 files, 781.5 MB, 30 cols)
- **cadastro_pensionistas/** (167 files, 1.1 GB, 34 cols)
- **cadastro_reserva_reforma_militares/** (51 files, 223.1 MB, 29 cols)
- **cadastro_servidores/** (1413 files, 6.1 GB, 46 cols)
- **observacoes/** (73 files, 421.9 MB, 7 cols)
- **remuneracao/** (1129 files, 6.0 GB, 40 cols)
## br_cnj_improbidade_administrativa/ (1 tables, 2.6 MB, 1 files)
- **condenacao/** (1 files, 2.6 MB, 63 cols)
## br_cnpq_bolsas/ (2 tables, 86.6 MB, 21 files)
- **dicionario/** (1 files, 3.6 KB, 5 cols)
- **microdados/** (20 files, 86.6 MB, 31 cols)
## br_cvm_administradores_carteira/ (3 tables, 416.5 KB, 7 files)
- **pessoa_fisica/** (3 files, 120.3 KB, 7 cols)
- **pessoa_juridica/** (3 files, 238.6 KB, 24 cols)
- **responsavel/** (1 files, 57.6 KB, 3 cols)
## br_cvm_oferta_publica_distribuicao/ (1 tables, 1.2 MB, 1 files)
- **dia/** (1 files, 1.2 MB, 44 cols)
## br_datahackers_state_data/ (1 tables, 445.7 KB, 1 files)
- **microdados/** (1 files, 445.7 KB, 353 cols)
## br_fbsp_absp/ (2 tables, 41.3 KB, 2 files)
- **uf/** (1 files, 32.6 KB, 29 cols)
- **violencia_escola/** (1 files, 8.8 KB, 5 cols)
## br_fgv_igp/ (7 tables, 111.1 KB, 7 files)
- **igp_10_mes/** (1 files, 17.4 KB, 7 cols)
- **igp_di_ano/** (1 files, 4.2 KB, 5 cols)
- **igp_di_mes/** (1 files, 38.7 KB, 7 cols)
- **igp_m_ano/** (1 files, 2.9 KB, 5 cols)
- **igp_m_mes/** (1 files, 22.8 KB, 9 cols)
- **igp_og_ano/** (1 files, 3.3 KB, 5 cols)
- **igp_og_mes/** (1 files, 21.8 KB, 7 cols)
## br_geobr_mapas/ (25 tables, 245.7 MB, 26 files)
- **amazonia_legal/** (1 files, 212.8 KB, 1 cols)
- **area_minima_comparavel_2010/** (1 files, 12.5 MB, 3 cols)
- **area_risco_desastre/** (1 files, 1.7 MB, 8 cols)
- **arranjo_populacional/** (1 files, 2.1 MB, 8 cols)
- **bioma/** (2 files, 15.4 MB, 4 cols)
- **concentracao_urbana/** (1 files, 1.5 MB, 8 cols)
- **escola/** (1 files, 3.1 MB, 3 cols)
- **estabelecimentos_saude/** (1 files, 4.2 MB, 5 cols)
- **limite_vizinhanca/** (1 files, 5.1 MB, 12 cols)
- **mesorregiao/** (1 files, 3.4 MB, 4 cols)
- **microrregiao/** (1 files, 6.7 MB, 4 cols)
- **municipio/** (1 files, 17.2 MB, 3 cols)
- **pais/** (1 files, 455.7 KB, 1 cols)
- **pegada_urbana/** (1 files, 6.3 MB, 6 cols)
- **regiao/** (1 files, 884.8 KB, 3 cols)
- **regiao_imediata/** (1 files, 6.1 MB, 4 cols)
- **regiao_intermediaria/** (1 files, 3.9 MB, 4 cols)
- **regiao_metropolitana_2017/** (1 files, 2.8 MB, 8 cols)
- **saude/** (1 files, 2.1 MB, 4 cols)
- **sede_municipal/** (1 files, 384.0 KB, 8 cols)
- **semiarido/** (1 files, 2.0 MB, 3 cols)
- **setor_censitario_2010/** (1 files, 139.3 MB, 13 cols)
- **terra_indigena/** (1 files, 3.1 MB, 15 cols)
- **uf/** (1 files, 1.4 MB, 3 cols)
- **unidade_conservacao/** (1 files, 3.8 MB, 14 cols)
## br_ibge_censo_2022/ (16 tables, 4.1 GB, 512 files)
- **alfabetizacao_grupo_idade_sexo_raca/** (1 files, 1.7 MB, 6 cols)
- **cadastro_enderecos/** (425 files, 2.8 GB, 35 cols)
- **caracteristica_domicilio_grupo_idade_raca_destino_lixo/** (11 files, 9.9 MB, 6 cols)
- **caracteristica_domicilio_grupo_idade_raca_esgotamento_sanitario/** (14 files, 9.4 MB, 6 cols)
- **caracteristica_domicilio_grupo_idade_raca_ligacao_abastecimento_agua/** (20 files, 22.7 MB, 6 cols)
- **caracteristica_domicilio_grupo_idade_raca_tipo_domicilio/** (9 files, 5.4 MB, 6 cols)
- **dicionario/** (1 files, 2.2 KB, 5 cols)
- **domicilio_recenseado/** (1 files, 218.4 KB, 3 cols)
- **indice_envelhecimento_raca/** (1 files, 342.8 KB, 6 cols)
- **municipio/** (1 files, 177.5 KB, 13 cols)
- **populacao_grupo_idade_sexo_raca/** (5 files, 5.4 MB, 6 cols)
- **populacao_grupo_idade_uf/** (1 files, 2.9 KB, 3 cols)
- **populacao_idade_sexo/** (7 files, 5.7 MB, 7 cols)
- **setor_censitario/** (13 files, 1.3 GB, 1423 cols)
- **terra_indigena/** (1 files, 14.7 KB, 6 cols)
- **territorio_quilombola/** (1 files, 11.8 KB, 6 cols)
## br_ibge_censo_demografico/ (33 tables, 4.2 GB, 1058 files)
- **dicionario/** (1 files, 31.0 KB, 5 cols)
- **microdados_domicilio_1970/** (17 files, 37.7 MB, 26 cols)
- **microdados_domicilio_1980/** (17 files, 27.9 MB, 26 cols)
- **microdados_domicilio_1991/** (17 files, 97.5 MB, 43 cols)
- **microdados_domicilio_2000/** (25 files, 130.2 MB, 56 cols)
- **microdados_domicilio_2010/** (50 files, 176.0 MB, 76 cols)
- **microdados_pessoa_1970/** (100 files, 308.6 MB, 41 cols)
- **microdados_pessoa_1980/** (150 files, 380.6 MB, 64 cols)
- **microdados_pessoa_1991/** (150 files, 636.0 MB, 100 cols)
- **microdados_pessoa_2000/** (150 files, 838.2 MB, 110 cols)
- **microdados_pessoa_2010/** (350 files, 1.0 GB, 244 cols)
- **setor_censitario_alfabetizacao_homens_mulheres_2010/** (1 files, 26.9 MB, 172 cols)
- **setor_censitario_alfabetizacao_total_2010/** (1 files, 16.6 MB, 87 cols)
- **setor_censitario_basico_2010/** (1 files, 16.7 MB, 14 cols)
- **setor_censitario_domicilio_caracteristicas_gerais_2010/** (2 files, 32.4 MB, 243 cols)
- **setor_censitario_domicilio_moradores_2010/** (1 files, 32.8 MB, 134 cols)
- **setor_censitario_domicilio_renda_2010/** (1 files, 5.9 MB, 16 cols)
- **setor_censitario_entorno_2010/** (5 files, 175.9 MB, 1064 cols)
- **setor_censitario_idade_homens_2010/** (1 files, 17.4 MB, 136 cols)
- **setor_censitario_idade_mulheres_2010/** (1 files, 17.8 MB, 136 cols)
- **setor_censitario_idade_total_2010/** (1 files, 21.7 MB, 136 cols)
- **setor_censitario_pessoa_renda_2010/** (1 files, 55.6 MB, 134 cols)
- **setor_censitario_raca_alfabetizacao_idade_genero_2010/** (1 files, 18.8 MB, 157 cols)
- **setor_censitario_raca_idade_0_4_genero_2010/** (1 files, 2.5 MB, 12 cols)
- **setor_censitario_raca_idade_genero_2010/** (2 files, 33.1 MB, 253 cols)
- **setor_censitario_registro_civil_2010/** (1 files, 1.4 MB, 5 cols)
- **setor_censitario_relacao_parentesco_conjuges_2010/** (1 files, 18.5 MB, 215 cols)
- **setor_censitario_relacao_parentesco_filhos_2010/** (1 files, 18.7 MB, 206 cols)
- **setor_censitario_relacao_parentesco_filhos_enteados_2010/** (2 files, 15.9 MB, 256 cols)
- **setor_censitario_relacao_parentesco_outros_2010/** (2 files, 12.4 MB, 242 cols)
- **setor_censitario_responsavel_domicilios_homens_total_2010/** (2 files, 24.7 MB, 218 cols)
- **setor_censitario_responsavel_domicilios_mulheres_2010/** (1 files, 9.9 MB, 110 cols)
- **setor_censitario_responsavel_renda_2010/** (1 files, 48.2 MB, 134 cols)
## br_ibge_estadic/ (1 tables, 4.5 KB, 1 files)
- **dicionario/** (1 files, 4.5 KB, 5 cols)
## br_ibge_inpc/ (4 tables, 3.6 MB, 4 files)
- **mes_brasil/** (1 files, 17.8 KB, 8 cols)
- **mes_categoria_brasil/** (1 files, 283.8 KB, 8 cols)
- **mes_categoria_municipio/** (1 files, 1.5 MB, 10 cols)
- **mes_categoria_rm/** (1 files, 1.8 MB, 10 cols)
## br_ibge_ipca/ (4 tables, 3.6 MB, 4 files)
- **mes_brasil/** (1 files, 17.3 KB, 8 cols)
- **mes_categoria_brasil/** (1 files, 286.3 KB, 8 cols)
- **mes_categoria_municipio/** (1 files, 1.5 MB, 10 cols)
- **mes_categoria_rm/** (1 files, 1.8 MB, 10 cols)
## br_ibge_ipca15/ (4 tables, 2.3 MB, 4 files)
- **mes_brasil/** (1 files, 9.4 KB, 8 cols)
- **mes_categoria_brasil/** (1 files, 279.2 KB, 8 cols)
- **mes_categoria_municipio/** (1 files, 411.5 KB, 10 cols)
- **mes_categoria_rm/** (1 files, 1.6 MB, 10 cols)
## br_ibge_pam/ (2 tables, 67.3 MB, 148 files)
- **lavoura_permanente/** (75 files, 32.9 MB, 9 cols)
- **lavoura_temporaria/** (73 files, 34.4 MB, 9 cols)
## br_ibge_pevs/ (2 tables, 3.3 MB, 74 files)
- **producao_extracao_vegetal/** (37 files, 2.5 MB, 7 cols)
- **producao_silvicultura/** (37 files, 805.8 KB, 9 cols)
## br_ibge_pib/ (2 tables, 3.6 MB, 2 files)
- **gini/** (1 files, 24.8 KB, 7 cols)
- **municipio/** (1 files, 3.6 MB, 9 cols)
## br_ibge_pnad/ (3 tables, 116.5 MB, 57 files)
- **dicionario/** (1 files, 3.3 KB, 5 cols)
- **microdados_compatibilizados_domicilio/** (17 files, 19.0 MB, 39 cols)
- **microdados_compatibilizados_pessoa/** (39 files, 97.5 MB, 70 cols)
## br_ibge_pnad_covid/ (1 tables, 7.4 KB, 1 files)
- **dicionario/** (1 files, 7.4 KB, 5 cols)
## br_ibge_pnadc/ (4 tables, 27.4 GB, 1377 files)
- **dicionario/** (1 files, 22.4 KB, 5 cols)
- **educacao/** (39 files, 1.1 GB, 279 cols)
- **microdados/** (1287 files, 25.6 GB, 424 cols)
- **rendimentos_outras_fontes/** (50 files, 707.1 MB, 293 cols)
## br_ibge_pof/ (1 tables, 38.7 KB, 1 files)
- **dicionario/** (1 files, 38.7 KB, 5 cols)
## br_ibge_populacao/ (3 tables, 646.5 KB, 3 files)
- **brasil/** (1 files, 1.1 KB, 2 cols)
- **municipio/** (1 files, 639.4 KB, 4 cols)
- **uf/** (1 files, 6.1 KB, 3 cols)
## br_ibge_ppm/ (4 tables, 12.7 MB, 160 files)
- **efetivo_rebanhos/** (52 files, 5.8 MB, 5 cols)
- **producao_aquicultura/** (10 files, 578.7 KB, 6 cols)
- **producao_origem_animal/** (49 files, 4.5 MB, 7 cols)
- **producao_pecuaria/** (49 files, 1.9 MB, 5 cols)
## br_inep_ana/ (1 tables, 4.1 KB, 1 files)
- **dicionario/** (1 files, 4.1 KB, 5 cols)
## br_inep_avaliacao_alfabetizacao/ (7 tables, 57.2 MB, 17 files)
- **alunos/** (11 files, 56.6 MB, 12 cols)
- **dicionario/** (1 files, 1.8 KB, 5 cols)
- **meta_alfabetizacao_brasil/** (1 files, 4.0 KB, 11 cols)
- **meta_alfabetizacao_municipio/** (1 files, 159.7 KB, 13 cols)
- **meta_alfabetizacao_uf/** (1 files, 6.4 KB, 12 cols)
- **municipio/** (1 files, 411.7 KB, 15 cols)
- **uf/** (1 files, 10.6 KB, 15 cols)
## br_inep_censo_educacao_superior/ (3 tables, 190.4 MB, 129 files)
- **curso/** (111 files, 186.0 MB, 193 cols)
- **dicionario/** (1 files, 2.1 KB, 5 cols)
- **ies/** (17 files, 4.4 MB, 71 cols)
## br_inep_censo_escolar/ (3 tables, 684.8 MB, 340 files)
- **dicionario/** (1 files, 5.1 KB, 5 cols)
- **escola/** (112 files, 243.3 MB, 455 cols)
- **turma/** (227 files, 441.5 MB, 76 cols)
## br_inep_educacao_especial/ (15 tables, 30.7 MB, 145 files)
- **brasil_distorcao_idade_serie/** (1 files, 1.4 KB, 3 cols)
- **brasil_taxa_rendimento/** (1 files, 2.5 KB, 5 cols)
- **distorcao_idade_serie/** (1 files, 5.6 KB, 4 cols)
- **docente_aee/** (1 files, 105.5 KB, 7 cols)
- **docente_formacao/** (1 files, 247.6 KB, 5 cols)
- **etapa_ensino/** (24 files, 6.6 MB, 6 cols)
- **faixa_etaria/** (21 files, 3.5 MB, 6 cols)
- **localizacao/** (22 files, 4.1 MB, 7 cols)
- **matricula_aee/** (1 files, 4.8 KB, 5 cols)
- **sexo_raca_cor/** (23 files, 6.1 MB, 7 cols)
- **taxa_rendimento/** (1 files, 11.0 KB, 6 cols)
- **tempo_ensino/** (22 files, 4.1 MB, 7 cols)
- **tipo_deficiencia/** (24 files, 6.0 MB, 6 cols)
- **uf_distorcao_idade_serie/** (1 files, 4.8 KB, 4 cols)
- **uf_taxa_rendimento/** (1 files, 8.9 KB, 6 cols)
## br_inep_enem/ (28 tables, 7.6 GB, 1631 files)
- **dicionario/** (1 files, 50.9 KB, 5 cols)
- **microdados/** (845 files, 5.8 GB, 63 cols)
- **questionario_socioeconomico_1998/** (1 files, 3.8 MB, 138 cols)
- **questionario_socioeconomico_1999/** (1 files, 7.5 MB, 130 cols)
- **questionario_socioeconomico_2000/** (1 files, 8.3 MB, 128 cols)
- **questionario_socioeconomico_2001/** (50 files, 74.3 MB, 243 cols)
- **questionario_socioeconomico_2002/** (22 files, 68.8 MB, 220 cols)
- **questionario_socioeconomico_2003/** (19 files, 55.7 MB, 189 cols)
- **questionario_socioeconomico_2004/** (17 files, 45.0 MB, 206 cols)
- **questionario_socioeconomico_2005/** (50 files, 95.8 MB, 224 cols)
- **questionario_socioeconomico_2006/** (50 files, 122.5 MB, 224 cols)
- **questionario_socioeconomico_2007/** (50 files, 135.1 MB, 224 cols)
- **questionario_socioeconomico_2008/** (50 files, 125.6 MB, 224 cols)
- **questionario_socioeconomico_2009/** (100 files, 145.6 MB, 294 cols)
- **questionario_socioeconomico_2010/** (17 files, 44.6 MB, 58 cols)
- **questionario_socioeconomico_2011/** (21 files, 65.8 MB, 76 cols)
- **questionario_socioeconomico_2012/** (20 files, 66.1 MB, 63 cols)
- **questionario_socioeconomico_2013/** (50 files, 98.8 MB, 77 cols)
- **questionario_socioeconomico_2014/** (50 files, 115.2 MB, 77 cols)
- **questionario_socioeconomico_2015/** (50 files, 103.8 MB, 51 cols)
- **questionario_socioeconomico_2016/** (50 files, 114.9 MB, 51 cols)
- **questionario_socioeconomico_2017/** (17 files, 58.3 MB, 28 cols)
- **questionario_socioeconomico_2018/** (17 files, 49.8 MB, 28 cols)
- **questionario_socioeconomico_2019/** (17 files, 43.3 MB, 26 cols)
- **questionario_socioeconomico_2020/** (17 files, 47.8 MB, 26 cols)
- **questionario_socioeconomico_2021/** (15 files, 29.2 MB, 26 cols)
- **questionario_socioeconomico_2022/** (16 files, 30.2 MB, 26 cols)
- **questionario_socioeconomico_2023/** (17 files, 34.6 MB, 26 cols)
## br_inep_formacao_docente/ (1 tables, 1.9 KB, 1 files)
- **dicionario/** (1 files, 1.9 KB, 5 cols)
## br_inep_ideb/ (5 tables, 27.0 MB, 9 files)
- **brasil/** (1 files, 8.6 KB, 11 cols)
- **escola/** (5 files, 21.3 MB, 14 cols)
- **municipio/** (1 files, 5.5 MB, 13 cols)
- **regiao/** (1 files, 21.0 KB, 12 cols)
- **uf/** (1 files, 87.7 KB, 12 cols)
## br_inep_indicador_nivel_socioeconomico/ (2 tables, 5.8 MB, 2 files)
- **dicionario/** (1 files, 3.5 KB, 5 cols)
- **escola/** (1 files, 5.8 MB, 18 cols)
## br_inep_indicadores_educacionais/ (11 tables, 395.6 MB, 107 files)
- **brasil/** (19 files, 1.5 MB, 214 cols)
- **brasil_remuneracao_docentes/** (1 files, 11.7 KB, 12 cols)
- **brasil_taxa_transicao/** (1 files, 42.5 KB, 67 cols)
- **escola/** (34 files, 218.8 MB, 208 cols)
- **municipio/** (32 files, 152.6 MB, 215 cols)
- **municipio_taxa_transicao/** (15 files, 19.9 MB, 68 cols)
- **regiao/** (1 files, 526.3 KB, 215 cols)
- **regiao_taxa_transicao/** (1 files, 80.2 KB, 68 cols)
- **uf/** (1 files, 1.8 MB, 215 cols)
- **uf_remuneracao_docentes/** (1 files, 106.9 KB, 13 cols)
- **uf_taxa_transicao/** (1 files, 172.7 KB, 68 cols)
## br_inep_saeb/ (11 tables, 7.9 GB, 1025 files)
- **aluno_ef_2ano/** (3 files, 8.7 MB, 38 cols)
- **aluno_ef_5ano/** (321 files, 2.4 GB, 243 cols)
- **aluno_ef_9ano/** (386 files, 2.6 GB, 267 cols)
- **aluno_em_34ano/** (50 files, 241.0 MB, 105 cols)
- **brasil/** (1 files, 59.0 KB, 17 cols)
- **brasil_taxa_alfabetizacao/** (1 files, 2.1 KB, 5 cols)
- **dicionario/** (1 files, 19.2 KB, 5 cols)
- **municipio/** (11 files, 32.1 MB, 19 cols)
- **proficiencia/** (249 files, 2.6 GB, 21 cols)
- **uf/** (1 files, 1.1 MB, 18 cols)
- **uf_taxa_alfabetizacao/** (1 files, 7.4 KB, 6 cols)
## br_inep_sinopse_estatistica_educacao_basica/ (18 tables, 256.2 MB, 559 files)
- **dicionario/** (1 files, 2.3 KB, 5 cols)
- **docente_deficiencia/** (16 files, 2.8 MB, 6 cols)
- **docente_escolaridade/** (36 files, 19.6 MB, 6 cols)
- **docente_etapa_ensino/** (52 files, 34.4 MB, 7 cols)
- **docente_faixa_etaria_sexo/** (54 files, 36.6 MB, 7 cols)
- **docente_localizacao/** (61 files, 43.0 MB, 7 cols)
- **docente_regime_contrato/** (38 files, 21.5 MB, 7 cols)
- **educacao_especial_etapa_ensino/** (23 files, 2.8 MB, 6 cols)
- **educacao_especial_faixa_etaria/** (19 files, 1.7 MB, 6 cols)
- **educacao_especial_localizacao/** (20 files, 3.8 MB, 7 cols)
- **educacao_especial_sexo_raca_cor/** (22 files, 5.5 MB, 7 cols)
- **educacao_especial_tempo_ensino/** (20 files, 3.8 MB, 7 cols)
- **educacao_especial_tipo_deficiencia/** (22 files, 3.0 MB, 6 cols)
- **etapa_ensino_serie/** (38 files, 23.1 MB, 7 cols)
- **faixa_etaria/** (27 files, 7.0 MB, 6 cols)
- **localizacao/** (35 files, 17.9 MB, 7 cols)
- **sexo_raca_cor/** (43 files, 15.8 MB, 7 cols)
- **tempo_ensino/** (32 files, 14.0 MB, 7 cols)
## br_inmet_bdmep/ (1 tables, 1.3 GB, 210 files)
- **microdados/** (210 files, 1.3 GB, 22 cols)
## br_inpe_prodes/ (1 tables, 862.4 KB, 1 files)
- **municipio_bioma/** (1 files, 862.4 KB, 8 cols)
## br_inpe_queimadas/ (1 tables, 268.1 MB, 65 files)
- **microdados/** (65 files, 268.1 MB, 13 cols)
## br_inpe_sisam/ (1 tables, 1.5 GB, 417 files)
- **microdados/** (417 files, 1.5 GB, 14 cols)
## br_ipea_avs/ (1 tables, 35.5 MB, 1 files)
- **municipio/** (1 files, 35.5 MB, 92 cols)
## br_mdr_snis/ (2 tables, 63.0 MB, 2 files)
- **municipio_agua_esgoto/** (1 files, 31.3 MB, 133 cols)
- **prestador_agua_esgoto/** (1 files, 31.7 MB, 144 cols)
## br_me_caged/ (4 tables, 1.6 GB, 705 files)
- **dicionario/** (1 files, 38.0 KB, 5 cols)
- **microdados_movimentacao/** (689 files, 1.5 GB, 25 cols)
- **microdados_movimentacao_excluida/** (1 files, 5.2 MB, 30 cols)
- **microdados_movimentacao_fora_prazo/** (14 files, 71.1 MB, 27 cols)
## br_me_cno/ (1 tables, 1.9 KB, 1 files)
- **dicionario/** (1 files, 1.9 KB, 5 cols)
## br_me_cnpj/ (5 tables, 194.1 GB, 8473 files)
- **dicionario/** (1 files, 8.2 KB, 5 cols)
- **empresas/** (3070 files, 45.0 GB, 10 cols)
- **estabelecimentos/** (3353 files, 128.6 GB, 35 cols)
- **simples/** (100 files, 283.9 MB, 7 cols)
- **socios/** (1949 files, 20.2 GB, 14 cols)
## br_me_comex_stat/ (5 tables, 1.1 GB, 445 files)
- **dicionario/** (1 files, 11.4 KB, 5 cols)
- **municipio_exportacao/** (83 files, 154.6 MB, 9 cols)
- **municipio_importacao/** (113 files, 217.7 MB, 9 cols)
- **ncm_exportacao/** (106 files, 264.4 MB, 12 cols)
- **ncm_importacao/** (142 files, 501.1 MB, 14 cols)
## br_me_rais/ (3 tables, 51.9 GB, 3541 files)
- **dicionario/** (1 files, 54.5 KB, 5 cols)
- **microdados_estabelecimentos/** (566 files, 803.2 MB, 26 cols)
- **microdados_vinculos/** (2974 files, 51.1 GB, 66 cols)
## br_me_sic/ (2 tables, 200.4 KB, 2 files)
- **dicionario/** (1 files, 11.7 KB, 5 cols)
- **transferencia/** (1 files, 188.7 KB, 17 cols)
## br_me_siconfi/ (7 tables, 441.6 MB, 281 files)
- **municipio_balanco_patrimonial/** (27 files, 51.4 MB, 8 cols)
- **municipio_despesas_funcao/** (60 files, 127.7 MB, 10 cols)
- **municipio_despesas_orcamentarias/** (89 files, 143.9 MB, 10 cols)
- **municipio_receitas_orcamentarias/** (72 files, 115.4 MB, 10 cols)
- **uf_despesas_funcao/** (11 files, 1.4 MB, 10 cols)
- **uf_despesas_orcamentarias/** (11 files, 1.0 MB, 10 cols)
- **uf_receitas_orcamentarias/** (11 files, 792.3 KB, 10 cols)
## br_mec_prouni/ (1 tables, 1.7 KB, 1 files)
- **dicionario/** (1 files, 1.7 KB, 5 cols)
## br_mec_sisu/ (1 tables, 1.4 GB, 314 files)
- **microdados/** (314 files, 1.4 GB, 52 cols)
## br_mg_belohorizonte_smfa_iptu/ (2 tables, 2.4 GB, 221 files)
- **dicionario/** (1 files, 1.7 KB, 5 cols)
- **iptu/** (220 files, 2.4 GB, 26 cols)
## br_mme_consumo_energia_eletrica/ (1 tables, 370.2 KB, 1 files)
- **uf/** (1 files, 370.2 KB, 6 cols)
## br_mp_pep/ (1 tables, 7.4 MB, 7 files)
- **cargos_funcoes/** (7 files, 7.4 MB, 16 cols)
## br_ms_cnes/ (14 tables, 24.4 GB, 1424 files)
- **dados_complementares/** (52 files, 18.5 MB, 94 cols)
- **dicionario/** (1 files, 25.1 KB, 5 cols)
- **equipamento/** (52 files, 863.1 MB, 11 cols)
- **equipe/** (41 files, 105.0 MB, 24 cols)
- **estabelecimento/** (214 files, 1.7 GB, 204 cols)
- **estabelecimento_ensino/** (13 files, 135.6 KB, 14 cols)
- **estabelecimento_filantropico/** (19 files, 320.4 KB, 14 cols)
- **gestao_metas/** (19 files, 746.3 KB, 15 cols)
- **habilitacao/** (41 files, 15.6 MB, 16 cols)
- **incentivos/** (24 files, 3.0 MB, 15 cols)
- **leito/** (41 files, 15.7 MB, 10 cols)
- **profissional/** (820 files, 21.0 GB, 23 cols)
- **regra_contratual/** (17 files, 4.3 MB, 15 cols)
- **servico_especializado/** (70 files, 788.0 MB, 15 cols)
## br_ms_pns/ (3 tables, 51.9 MB, 8 files)
- **dicionario/** (1 files, 36.3 KB, 5 cols)
- **microdados_2013/** (3 files, 20.6 MB, 1000 cols)
- **microdados_2019/** (4 files, 31.2 MB, 1087 cols)
## br_ms_populacao/ (1 tables, 16.3 MB, 9 files)
- **municipio/** (9 files, 16.3 MB, 5 cols)
## br_ms_sia/ (3 tables, 46.2 GB, 7629 files)
- **dicionario/** (1 files, 129.3 KB, 5 cols)
- **producao_ambulatorial/** (7159 files, 45.3 GB, 59 cols)
- **psicossocial/** (469 files, 962.4 MB, 41 cols)
## br_ms_sih/ (3 tables, 31.6 GB, 5824 files)
- **aihs_reduzidas/** (1794 files, 7.6 GB, 109 cols)
- **dicionario/** (1 files, 206.7 KB, 5 cols)
- **servicos_profissionais/** (4029 files, 23.9 GB, 37 cols)
## br_ms_sim/ (2 tables, 872.2 MB, 138 files)
- **dicionario/** (1 files, 6.6 KB, 5 cols)
- **microdados/** (137 files, 872.2 MB, 92 cols)
## br_ms_sinan/ (3 tables, 616.2 MB, 215 files)
- **dicionario/** (1 files, 7.2 KB, 5 cols)
- **microdados_dengue/** (179 files, 503.7 MB, 151 cols)
- **microdados_influenza_srag/** (35 files, 112.5 MB, 205 cols)
## br_ms_sinasc/ (2 tables, 1.4 GB, 352 files)
- **dicionario/** (1 files, 6.4 KB, 5 cols)
- **microdados/** (351 files, 1.4 GB, 66 cols)
## br_ms_sisvan/ (2 tables, 19.2 GB, 1540 files)
- **dicionario/** (1 files, 2.3 KB, 5 cols)
- **microdados/** (1539 files, 19.2 GB, 28 cols)
## br_ms_vacinacao_covid19/ (1 tables, 3.8 KB, 1 files)
- **dicionario/** (1 files, 3.8 KB, 5 cols)
## br_poder360_pesquisas/ (1 tables, 1.3 MB, 1 files)
- **microdados/** (1 files, 1.3 MB, 24 cols)
## br_rf_arrecadacao/ (5 tables, 5.9 MB, 57 files)
- **cnae/** (9 files, 352.9 KB, 20 cols)
- **ir_ipi/** (6 files, 41.0 KB, 10 cols)
- **itr/** (8 files, 3.3 MB, 5 cols)
- **natureza_juridica/** (9 files, 577.9 KB, 20 cols)
- **uf/** (25 files, 1.7 MB, 45 cols)
## br_rf_cafir/ (2 tables, 3.6 GB, 450 files)
- **dicionario/** (1 files, 2.1 KB, 5 cols)
- **imoveis_rurais/** (449 files, 3.6 GB, 14 cols)
## br_rf_cno/ (5 tables, 32.5 GB, 2110 files)
- **areas/** (1236 files, 4.6 GB, 8 cols)
- **cnaes/** (489 files, 2.3 GB, 4 cols)
- **dicionario/** (1 files, 2.1 KB, 5 cols)
- **microdados/** (193 files, 25.1 GB, 25 cols)
- **vinculos/** (191 files, 497.9 MB, 7 cols)
## br_rj_isp_estatisticas_seguranca/ (14 tables, 2.8 MB, 14 files)
- **armas_apreendidas_mensal/** (1 files, 229.4 KB, 42 cols)
- **armas_fogo_apreendidas_mensal/** (1 files, 11.1 KB, 7 cols)
- **evolucao_mensal_cisp/** (1 files, 1005.0 KB, 61 cols)
- **evolucao_mensal_municipio/** (1 files, 361.0 KB, 58 cols)
- **evolucao_mensal_uf/** (1 files, 60.2 KB, 56 cols)
- **evolucao_mensal_upp/** (1 files, 89.2 KB, 38 cols)
- **evolucao_policial_morto_servico_mensal/** (1 files, 50.4 KB, 5 cols)
- **feminicidio_mensal_cisp/** (1 files, 17.4 KB, 9 cols)
- **relacao_cisp_aisp_risp/** (1 files, 7.2 KB, 6 cols)
- **taxa_evolucao_anual_municipio/** (1 files, 75.7 KB, 56 cols)
- **taxa_evolucao_anual_uf/** (1 files, 29.5 KB, 55 cols)
- **taxa_evolucao_mensal_municipio/** (1 files, 852.5 KB, 58 cols)
- **taxa_evolucao_mensal_uf/** (1 files, 59.9 KB, 56 cols)
- **taxa_letalidade/** (1 files, 6.6 KB, 6 cols)
## br_seeg_emissoes/ (3 tables, 2.5 GB, 557 files)
- **dicionario/** (1 files, 13.7 KB, 5 cols)
- **municipio/** (457 files, 2.5 GB, 17 cols)
- **uf/** (99 files, 87.7 MB, 13 cols)
## br_sfb_sicar/ (2 tables, 28.3 GB, 947 files)
- **area_imovel/** (946 files, 28.3 GB, 11 cols)
- **dicionario/** (1 files, 1.7 KB, 5 cols)
## br_simet_educacao_conectada/ (1 tables, 10.0 MB, 1 files)
- **escola/** (1 files, 10.0 MB, 54 cols)
## br_sp_saopaulo_geosampa_iptu/ (1 tables, 2.3 GB, 447 files)
- **iptu/** (447 files, 2.3 GB, 27 cols)
## br_stf_corte_aberta/ (2 tables, 87.3 MB, 33 files)
- **decisoes/** (32 files, 87.3 MB, 17 cols)
- **dicionario/** (1 files, 2.3 KB, 5 cols)
## br_trase_supply_chain/ (6 tables, 59.9 MB, 24 files)
- **beef/** (3 files, 44.2 MB, 22 cols)
- **beef_slaughterhouses/** (1 files, 718.9 KB, 19 cols)
- **soy_beans/** (17 files, 14.8 MB, 25 cols)
- **soy_beans_crushing_facilities/** (1 files, 19.6 KB, 13 cols)
- **soy_beans_refining_facilities/** (1 files, 9.4 KB, 9 cols)
- **soy_beans_storage_facilities/** (1 files, 225.1 KB, 12 cols)
## br_tse_eleicoes/ (22 tables, 8.2 GB, 4324 files)
- **bens_candidato/** (18 files, 134.1 MB, 10 cols)
- **candidatos/** (29 files, 149.3 MB, 28 cols)
- **despesas_candidato/** (255 files, 1.5 GB, 45 cols)
- **detalhes_votacao_municipio/** (16 files, 17.4 MB, 25 cols)
- **detalhes_votacao_municipio_zona/** (16 files, 19.7 MB, 26 cols)
- **detalhes_votacao_secao/** (90 files, 401.2 MB, 24 cols)
- **dicionario/** (1 files, 2.3 KB, 5 cols)
- **partidos/** (21 files, 7.9 MB, 21 cols)
- **perfil_eleitorado_local_votacao/** (15 files, 106.1 MB, 23 cols)
- **perfil_eleitorado_municipio_zona/** (98 files, 128.2 MB, 13 cols)
- **perfil_eleitorado_secao/** (1425 files, 1.9 GB, 15 cols)
- **receitas_candidato/** (108 files, 806.4 MB, 55 cols)
- **receitas_comite/** (7 files, 7.9 MB, 36 cols)
- **receitas_orgao_partidario/** (7 files, 9.0 MB, 50 cols)
- **resultados_candidato/** (37 files, 43.7 MB, 16 cols)
- **resultados_candidato_municipio/** (99 files, 126.6 MB, 16 cols)
- **resultados_candidato_municipio_zona/** (125 files, 187.0 MB, 17 cols)
- **resultados_candidato_secao/** (1367 files, 2.0 GB, 17 cols)
- **resultados_partido_municipio/** (25 files, 17.4 MB, 13 cols)
- **resultados_partido_municipio_zona/** (26 files, 19.6 MB, 14 cols)
- **resultados_partido_secao/** (523 files, 608.5 MB, 15 cols)
- **vagas/** (16 files, 594.7 KB, 9 cols)
## br_tse_filiacao_partidaria/ (2 tables, 941.9 MB, 106 files)
- **microdados/** (44 files, 483.5 MB, 22 cols)
- **microdados_antigos/** (62 files, 458.4 MB, 16 cols)
## dataset_new_arch/ (1 tables, 1.6 KB, 1 files)
- **tabela_new_arch/** (1 files, 1.6 KB, 5 cols)
## logs/ (2 tables, 5.5 GB, 4675 files)
- **cloudaudit_googleapis_com_activity/** (1826 files, 570.1 MB, 18 cols)
- **cloudaudit_googleapis_com_data_access/** (2849 files, 4.9 GB, 18 cols)
## mundo_transfermarkt_competicoes/ (2 tables, 529.9 KB, 25 files)
- **brasileirao_serie_a/** (22 files, 458.6 KB, 35 cols)
- **copa_brasil/** (3 files, 71.3 KB, 38 cols)
## mundo_transfermarkt_competicoes_internacionais/ (1 tables, 114.2 KB, 1 files)
- **champions_league/** (1 files, 114.2 KB, 55 cols)
## test_dataset/ (1 tables, 1.3 KB, 1 files)
- **test_table/** (1 files, 1.3 KB, 4 cols)
## us_harvard_ned/ (2 tables, 26.8 MB, 423 files)
- **parliamentary_elections/** (233 files, 13.4 MB, 238 cols)
- **presidential_elections/** (190 files, 13.4 MB, 325 cols)
## world_ampas_oscar/ (1 tables, 16.7 KB, 1 files)
- **winner_demographics/** (1 files, 16.7 KB, 10 cols)
## world_iea_pirls/ (8 tables, 310.5 MB, 8 files)
- **dictionary/** (1 files, 27.0 KB, 5 cols)
- **home_context/** (1 files, 9.7 MB, 120 cols)
- **school_context/** (1 files, 680.6 KB, 103 cols)
- **student_achievement/** (1 files, 106.5 MB, 864 cols)
- **student_context/** (1 files, 104.2 MB, 157 cols)
- **student_teacher_link/** (1 files, 81.1 MB, 51 cols)
- **teacher_context/** (1 files, 1003.3 KB, 186 cols)
- **within_country_scoring_reliability/** (1 files, 7.2 MB, 1057 cols)
## world_iea_timss/ (11 tables, 484.4 MB, 11 files)
- **dictionary/** (1 files, 15.8 KB, 5 cols)
- **home_context_grade_4/** (1 files, 8.9 MB, 114 cols)
- **school_context_grade_4/** (1 files, 672.5 KB, 111 cols)
- **school_context_grade_8/** (1 files, 478.4 KB, 103 cols)
- **student_achievement_grade_4/** (1 files, 236.3 MB, 110 cols)
- **student_achievement_grade_8/** (1 files, 213.0 MB, 135 cols)
- **student_context_grade_4/** (1 files, 13.0 MB, 127 cols)
- **student_context_grade_8/** (1 files, 9.4 MB, 115 cols)
- **teacher_context_grade_4/** (1 files, 680.6 KB, 87 cols)
- **teacher_mathematics_grade_8/** (1 files, 627.1 KB, 168 cols)
- **teacher_science_grade_8/** (1 files, 1.3 MB, 217 cols)
## world_imdb_movies/ (1 tables, 3.7 MB, 1 files)
- **top_movies_per_year/** (1 files, 3.7 MB, 23 cols)
## world_oecd_pisa/ (1 tables, 742.4 MB, 28 files)
- **student/** (28 files, 742.4 MB, 250 cols)
## world_oecd_public_finance/ (1 tables, 709.9 KB, 1 files)
- **country/** (1 files, 709.9 KB, 163 cols)
## world_olympedia_olympics/ (6 tables, 23.5 MB, 6 files)
- **athlete_bio/** (1 files, 16.5 MB, 11 cols)
- **athlete_event_result/** (1 files, 3.9 MB, 11 cols)
- **country/** (1 files, 3.2 KB, 2 cols)
- **game/** (1 files, 5.7 KB, 10 cols)
- **game_medal_tally/** (1 files, 14.6 KB, 9 cols)
- **result/** (1 files, 3.0 MB, 11 cols)
## world_sofascore_competicoes_futebol/ (2 tables, 1.4 MB, 27 files)
- **brasileirao_serie_a/** (9 files, 603.4 KB, 85 cols)
- **uefa_champions_league/** (18 files, 811.5 KB, 85 cols)
## world_wb_mides/ (9 tables, 37.4 GB, 4645 files)
- **dicionario/** (1 files, 16.5 KB, 5 cols)
- **empenho/** (1488 files, 13.1 GB, 25 cols)
- **licitacao/** (22 files, 160.9 MB, 32 cols)
- **licitacao_item/** (210 files, 2.3 GB, 24 cols)
- **licitacao_participante/** (22 files, 100.5 MB, 17 cols)
- **liquidacao/** (1346 files, 7.2 GB, 20 cols)
- **orgao_unidade_gestora/** (1 files, 2.1 MB, 8 cols)
- **pagamento/** (1522 files, 14.3 GB, 25 cols)
- **relacionamentos/** (33 files, 132.7 MB, 5 cols)
## world_wwf_hydrosheds/ (3 tables, 9.1 GB, 735 files)
- **basins_atlas/** (279 files, 5.6 GB, 296 cols)
- **lakes_atlas/** (132 files, 1.5 GB, 307 cols)
- **rivers_atlas/** (324 files, 2.0 GB, 297 cols)
---
**Total: 533 tables · 675.4 GB · 77885 parquet files**

gera_schemas.py

View File

@@ -1,268 +0,0 @@
import os
import json
import sys
import pyarrow.parquet as pq
import s3fs
import boto3
import duckdb
from dotenv import load_dotenv

load_dotenv()

S3_ENDPOINT = os.environ["HETZNER_S3_ENDPOINT"]
S3_BUCKET = os.environ["HETZNER_S3_BUCKET"]
ACCESS_KEY = os.environ["AWS_ACCESS_KEY_ID"]
SECRET_KEY = os.environ["AWS_SECRET_ACCESS_KEY"]

s3_host = S3_ENDPOINT.removeprefix("https://").removeprefix("http://")

# --- boto3 client (listing only, zero egress) ---
boto = boto3.client(
    "s3",
    endpoint_url=S3_ENDPOINT,
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
)

# --- s3fs filesystem (footer-only reads via pyarrow) ---
fs = s3fs.S3FileSystem(
    client_kwargs={"endpoint_url": S3_ENDPOINT},
    key=ACCESS_KEY,
    secret=SECRET_KEY,
)

# ------------------------------------------------------------------ #
# Phase 1: File inventory via S3 List API (zero data egress)
# ------------------------------------------------------------------ #
print("Phase 1: listing S3 objects...")
paginator = boto.get_paginator("list_objects_v2")
inventory = {}  # "dataset/table" -> {files: [...], total_size: int}
for page in paginator.paginate(Bucket=S3_BUCKET):
    for obj in page.get("Contents", []):
        key = obj["Key"]
        if not key.endswith(".parquet"):
            continue
        parts = key.split("/")
        if len(parts) < 3:
            continue
        dataset, table = parts[0], parts[1]
        dt = f"{dataset}/{table}"
        if dt not in inventory:
            inventory[dt] = {"files": [], "total_size_bytes": 0}
        inventory[dt]["files"].append(key)
        inventory[dt]["total_size_bytes"] += obj["Size"]
print(f" Found {len(inventory)} tables across {S3_BUCKET}")

# ------------------------------------------------------------------ #
# Phase 2: Schema reads — footer only (~30 KB per table)
# ------------------------------------------------------------------ #
print("Phase 2: reading parquet footers...")

def fmt_size(b):
    for unit in ("B", "KB", "MB", "GB", "TB"):
        if b < 1024 or unit == "TB":
            return f"{b:.1f} {unit}"
        b /= 1024

def extract_col_descriptions(schema):
    """Try to pull per-column descriptions from Arrow metadata."""
    descriptions = {}
    meta = schema.metadata or {}
    # BigQuery exports embed a JSON blob under b'pandas' with column_info
    pandas_meta_raw = meta.get(b"pandas") or meta.get(b"pandas_metadata")
    if pandas_meta_raw:
        try:
            pm = json.loads(pandas_meta_raw)
            for col in pm.get("columns", []):
                name = col.get("name")
                desc = col.get("metadata", {}) or {}
                if isinstance(desc, dict) and "description" in desc:
                    descriptions[name] = desc["description"]
        except Exception:
            pass
    # Also try top-level b'description' or b'schema'
    for key in (b"description", b"schema", b"BigQuery:description"):
        val = meta.get(key)
        if val:
            try:
                descriptions["__table__"] = val.decode("utf-8", errors="replace")
            except Exception:
                pass
    return descriptions

schemas = {}
errors = []
for i, (dt, info) in enumerate(sorted(inventory.items())):
    dataset, table = dt.split("/", 1)
    first_file = info["files"][0]
    s3_path = f"{S3_BUCKET}/{first_file}"
    try:
        schema = pq.read_schema(fs.open(s3_path))
        col_descs = extract_col_descriptions(schema)
        # Build raw metadata dict (decode bytes keys/values)
        raw_meta = {}
        if schema.metadata:
            for k, v in schema.metadata.items():
                try:
                    dk = k.decode("utf-8", errors="replace")
                    dv = v.decode("utf-8", errors="replace")
                    # Try to parse JSON values
                    try:
                        dv = json.loads(dv)
                    except Exception:
                        pass
                    raw_meta[dk] = dv
                except Exception:
                    pass
        columns = []
        for field in schema:
            col = {
                "name": field.name,
                "type": str(field.type),
                "nullable": field.nullable,
            }
            if field.name in col_descs:
                col["description"] = col_descs[field.name]
            # Check field-level metadata
            if field.metadata:
                for k, v in field.metadata.items():
                    try:
                        dk = k.decode("utf-8", errors="replace")
                        dv = v.decode("utf-8", errors="replace")
                        if dk in ("description", "DESCRIPTION", "comment"):
                            col["description"] = dv
                    except Exception:
                        pass
            columns.append(col)
        schemas[f"{dataset}.{table}"] = {
            "path": f"s3://{S3_BUCKET}/{dataset}/{table}/",
            "file_count": len(info["files"]),
            "total_size_bytes": info["total_size_bytes"],
            "total_size_human": fmt_size(info["total_size_bytes"]),
            "columns": columns,
            "metadata": raw_meta,
        }
        print(f" [{i+1}/{len(inventory)}] ✓ {dataset}.{table} ({len(columns)} cols, {fmt_size(info['total_size_bytes'])})")
    except Exception as e:
        errors.append({"table": f"{dataset}.{table}", "error": str(e)})
        print(f" [{i+1}/{len(inventory)}] ✗ {dataset}.{table}: {e}", file=sys.stderr)

# ------------------------------------------------------------------ #
# Phase 3: Enrich from br_bd_metadados.bigquery_tables (small table)
# ------------------------------------------------------------------ #
META_TABLE = "br_bd_metadados.bigquery_tables"
meta_dt = "br_bd_metadados/bigquery_tables"
if meta_dt in inventory:
    print(f"Phase 3: enriching from {META_TABLE}...")
    try:
        con = duckdb.connect()
        con.execute("INSTALL httpfs; LOAD httpfs;")
        con.execute(f"""
            SET s3_endpoint='{s3_host}';
            SET s3_access_key_id='{ACCESS_KEY}';
            SET s3_secret_access_key='{SECRET_KEY}';
            SET s3_url_style='path';
        """)
        meta_path = f"s3://{S3_BUCKET}/br_bd_metadados/bigquery_tables/*.parquet"
        # Peek at available columns
        available = [r[0] for r in con.execute(f"DESCRIBE SELECT * FROM '{meta_path}' LIMIT 1").fetchall()]
        print(f" Metadata columns: {available}")
        # Try to find dataset/table description columns
        desc_col = next((c for c in available if "description" in c.lower()), None)
        ds_col = next((c for c in available if c.lower() in ("dataset_id", "dataset", "schema_name")), None)
        tbl_col = next((c for c in available if c.lower() in ("table_id", "table_name", "table")), None)
        if desc_col and ds_col and tbl_col:
            rows = con.execute(f"""
                SELECT {ds_col}, {tbl_col}, {desc_col}
                FROM '{meta_path}'
            """).fetchall()
            for ds, tbl, desc in rows:
                key = f"{ds}.{tbl}"
                if key in schemas and desc:
                    schemas[key]["table_description"] = desc
            print(f" Enriched {len(rows)} table descriptions")
        else:
            print(f" Could not find expected columns (dataset_id, table_id, description) — skipping enrichment")
        con.close()
    except Exception as e:
        print(f" Enrichment failed: {e}", file=sys.stderr)
else:
    print("Phase 3: br_bd_metadados.bigquery_tables not in S3 — skipping enrichment")

# ------------------------------------------------------------------ #
# Phase 4a: Write schemas.json
# ------------------------------------------------------------------ #
print("Phase 4: writing outputs...")
output = {
    "_meta": {
        "bucket": S3_BUCKET,
        "total_tables": len(schemas),
        "total_size_bytes": sum(v["total_size_bytes"] for v in schemas.values()),
        "total_size_human": fmt_size(sum(v["total_size_bytes"] for v in schemas.values())),
        "errors": errors,
    },
    "tables": dict(sorted(schemas.items())),
}
with open("schemas.json", "w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=2)
print(f" ✓ schemas.json ({len(schemas)} tables)")

# ------------------------------------------------------------------ #
# Phase 4b: Write file_tree.md
# ------------------------------------------------------------------ #
lines = [
    f"# S3 File Tree: {S3_BUCKET}",
    "",
]
# Group by dataset
datasets_map = {}
for dt_key, info in sorted(inventory.items()):
    dataset, table = dt_key.split("/", 1)
    datasets_map.setdefault(dataset, []).append((table, info))
total_files = sum(len(v["files"]) for v in inventory.values())
total_bytes = sum(v["total_size_bytes"] for v in inventory.values())
for dataset, tables in sorted(datasets_map.items()):
    ds_bytes = sum(i["total_size_bytes"] for _, i in tables)
    ds_files = sum(len(i["files"]) for _, i in tables)
    lines.append(f"## {dataset}/ ({len(tables)} tables, {fmt_size(ds_bytes)}, {ds_files} files)")
    lines.append("")
    for table, info in sorted(tables):
        schema_entry = schemas.get(f"{dataset}.{table}", {})
        ncols = len(schema_entry.get("columns", []))
        col_str = f", {ncols} cols" if ncols else ""
        table_desc = schema_entry.get("table_description", "")
        desc_str = f"{table_desc}" if table_desc else ""
        lines.append(f" - **{table}/** ({len(info['files'])} files, {fmt_size(info['total_size_bytes'])}{col_str}){desc_str}")
    lines.append("")
lines += [
    "---",
    f"**Total: {len(inventory)} tables · {fmt_size(total_bytes)} · {total_files} parquet files**",
]
with open("file_tree.md", "w", encoding="utf-8") as f:
    f.write("\n".join(lines) + "\n")
print(f" ✓ file_tree.md ({len(inventory)} tables)")
print()
print("Done!")
print(f" schemas.json — full column-level schema dump")
print(f" file_tree.md — bucket tree with sizes")
if errors:
    print(f" {len(errors)} tables failed (see schemas.json _meta.errors)")

View File

@@ -1,10 +1,25 @@
name: basedosdados
server: 89.167.95.136
server: haloy.xn--2dk.xyz
api_token:
  from:
    env: HALOY_TOKEN
domains:
  - domain: db.xn--2dk.xyz
port: 8080
health_check_path: /health
env:
  - HETZNER_S3_ENDPOINT
  - AWS_ACCESS_KEY_ID
  - AWS_SECRET_ACCESS_KEY
  - BASIC_AUTH_HASH
  - name: HETZNER_S3_ENDPOINT
    from:
      env: HETZNER_S3_ENDPOINT
  - name: AWS_ACCESS_KEY_ID
    from:
      env: AWS_ACCESS_KEY_ID
  - name: AWS_SECRET_ACCESS_KEY
    from:
      env: AWS_SECRET_ACCESS_KEY
  - name: BASIC_AUTH_PASSWORD
    from:
      env: BASIC_AUTH_PASSWORD
  - name: BUCKET_REGION
    from:
      env: BUCKET_REGION
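
With this config, a deploy is just a matter of having the referenced variables in the environment (values below are placeholders):

```bash
export HALOY_TOKEN='<token>'
export HETZNER_S3_ENDPOINT='<endpoint>' BUCKET_REGION='<region>'
export AWS_ACCESS_KEY_ID='<key>' AWS_SECRET_ACCESS_KEY='<secret>'
export BASIC_AUTH_PASSWORD='<password>'
haloy deploy
```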

open_gui.sh

View File

@@ -1,6 +0,0 @@
#!/bin/bash
cd "$(dirname "$0")"
INIT=$(mktemp /tmp/duckdb_init_XXXX)
printf "LOAD httpfs;\nATTACH 'basedosdados.duckdb' AS bd (READ_ONLY);\n" > "$INIT"
duckdb --ui ui.duckdb -init "$INIT"
rm -f "$INIT"

View File

@@ -1,6 +1,7 @@
import os
import duckdb
import boto3
from collections import defaultdict
from dotenv import load_dotenv
load_dotenv()
@@ -13,46 +14,51 @@ SECRET_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
# DuckDB expects the endpoint without scheme
s3_endpoint = ENDPOINT_URL.removeprefix('https://').removeprefix('http://')
# List every prefix in the bucket (dataset/table)
# List every object in the bucket in one pass, grouping by dataset/table
s3 = boto3.client('s3',
                  endpoint_url=ENDPOINT_URL,
                  aws_access_key_id=ACCESS_KEY,
                  aws_secret_access_key=SECRET_KEY)
paginator = s3.get_paginator('list_objects_v2')
datasets = {}
for page in paginator.paginate(Bucket=BUCKET, Delimiter='/'):
    for prefix in page.get('CommonPrefixes', []):
        dataset = prefix['Prefix'].rstrip('/')
        datasets[dataset] = []
        for page2 in paginator.paginate(Bucket=BUCKET,
                                        Prefix=dataset+'/',
                                        Delimiter='/'):
            for p in page2.get('CommonPrefixes', []):
                table = p['Prefix'].rstrip('/').split('/')[-1]
                datasets[dataset].append(table)
table_files = defaultdict(lambda: defaultdict(list))
for page in paginator.paginate(Bucket=BUCKET):
    for obj in page.get('Contents', []):
        key = obj['Key']
        if not key.endswith('.parquet'):
            continue
        parts = key.split('/')
        if len(parts) >= 3:
            dataset, table = parts[0], parts[1]
            table_files[dataset][table].append(f"s3://{BUCKET}/{key}")
# Create the DuckDB connection and configure S3
con = duckdb.connect('basedosdados3.duckdb')
con = duckdb.connect('basedosdados.duckdb')
con.execute("INSTALL httpfs; LOAD httpfs;")
con.execute(f"""
    SET s3_endpoint='{s3_endpoint}';
    SET s3_access_key_id='{ACCESS_KEY}';
    SET s3_secret_access_key='{SECRET_KEY}';
    SET s3_url_style='path';
    SET enable_object_cache=true;
    SET threads=4;
    SET memory_limit='6GB';
    SET preserve_insertion_order=false;
    SET http_keep_alive=true;
    SET http_retries=3;
""")
# Create schemas and views
# Create schemas and views with an explicit file list
for dataset, tables in datasets.items():
for dataset, tables in table_files.items():
    con.execute(f"CREATE SCHEMA IF NOT EXISTS {dataset}")
    for table in tables:
        path = f"s3://{BUCKET}/{dataset}/{table}/*.parquet"
    for table, files in tables.items():
        file_list = ", ".join(f"'{f}'" for f in sorted(files))
        try:
            con.execute(f"""
                CREATE OR REPLACE VIEW {dataset}.{table} AS
                SELECT * FROM read_parquet('{path}', hive_partitioning=true)
                SELECT * FROM read_parquet([{file_list}], hive_partitioning=true, union_by_name=true)
            """)
            print(f"{dataset}.{table}")
            print(f"{dataset}.{table} ({len(files)} files)")
        except Exception as e:
            if 'Geoparquet' in str(e) or 'geometria' in str(e) or 'geometry' in str(e).lower():
                print(f" skip (geoparquet) {dataset}.{table}")
@@ -60,4 +66,4 @@ for dataset, tables in datasets.items():
                raise
con.close()
print("Done! Open with: duckdb --ui basedosdados3.duckdb")
print("Done! Open with: duckdb --ui basedosdados.duckdb")

147562
schemas.json

File diff suppressed because one or more lines are too long

start.sh

View File

@@ -1,20 +1,28 @@
#!/bin/bash
set -euo pipefail
# DuckDB init: load S3 credentials from env at session start
INIT=$(mktemp /tmp/duckdb_init_XXXX.sql)
S3_ENDPOINT="${HETZNER_S3_ENDPOINT#https://}"
S3_ENDPOINT="${S3_ENDPOINT#http://}"
cat > "$INIT" <<SQL
INSTALL httpfs; LOAD httpfs;
# Init SQL for the web terminal (credentials are not exposed as env vars)
cat > /app/ssh_init.sql <<SQL
LOAD httpfs;
SET s3_endpoint='${S3_ENDPOINT}';
SET s3_access_key_id='${AWS_ACCESS_KEY_ID}';
SET s3_secret_access_key='${AWS_SECRET_ACCESS_KEY}';
SET s3_region='${BUCKET_REGION}';
SET s3_url_style='path';
SET enable_object_cache=true;
SET threads=4;
SET memory_limit='4GB';
SQL
chmod 600 /app/ssh_init.sql
echo "[start] Starting ttyd terminal..."
ttyd --port 7681 --writable duckdb -readonly --init /app/ssh_init.sql /app/basedosdados.duckdb &
echo "[start] Starting auth service..."
python3 /app/auth.py &
echo "[start] Starting Caddy..."
caddy start --config /app/Caddyfile --adapter caddyfile
echo "[start] Starting DuckDB UI..."
exec duckdb --ui -init "$INIT" basedosdados.duckdb
exec caddy run --config /app/Caddyfile --adapter caddyfile
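
Once the container is up, a minimal end-to-end smoke test against the deployed stack:

```bash
curl -s -o /dev/null -w '%{http_code}\n' https://db.xn--2dk.xyz/health   # expect 200
curl -s -X POST https://db.xn--2dk.xyz/query \
  -H "X-Password: <password>" \
  --data-binary "SELECT 1;"
```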