docs: add census documentation and final research report
- Add 37 census documentation files for IBGE census datasets (1970-2010)
- Add dataviz wordcloud scripts and images
- Add relatorio_final.md with research findings on households and living conditions

New data from DuckDB queries:
- 90.7M households, 203M population
- 53.2% Black population
- 27.9% female-headed households
- 46.6% urban sewage without collection/treatment
- 15,816 favela sectors (2010)
- 68% Black population in Fortaleza
This commit is contained in:
BIN
docs/dataviz/wordcloud_attributes.png
Normal file
BIN
docs/dataviz/wordcloud_attributes.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.8 MiB |
45
docs/dataviz/wordcloud_attributes.py
Normal file
45
docs/dataviz/wordcloud_attributes.py
Normal file
@@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env python3
"""Generate a word cloud of column names and description terms.

Reads the Base dos Dados schema dump (context/basedosdados-schema.json),
collects every column name plus the non-stopword terms of every column
description, and renders a frequency-weighted word cloud to
docs/wordcloud_attributes.png.
"""
import json
import re
from collections import Counter

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Portuguese and English stopwords excluded from description terms.
# (Deduplicated: the original literal repeated 'de', 'do', 'da', 'uma'.)
STOPWORDS = {
    'de', 'do', 'da', 'a', 'ou', 'em', 'e', 'o', 'que', 'das', 'dos',
    'nos', 'nas', 'um', 'uma', 'para', 'com', 'não', 'à', 'ao', 'os',
    'as', 'se', 'na', 'no', 'é', 'ser', 'seu', 'sua', 'isso',
    'the', 'of', 'and', 'in', 'to', 'is', 'for', 'on', 'with', 'at',
    'by', 'from',
}

# Strip everything except lowercase Latin letters (including Portuguese
# accented characters). Compiled once here instead of inside the word loop.
_NON_LETTER_RE = re.compile(r'[^a-záàâãéèêíìîóòôõúùûç]')

# Explicit encoding: the schema contains Portuguese text, so relying on the
# platform default encoding would break on e.g. Windows/cp1252.
with open('context/basedosdados-schema.json', encoding='utf-8') as f:
    schema = json.load(f)

words = []
# Schema shape observed here: {dataset: {table: [column, ...]}} where each
# column is a dict with optional 'name' and 'description' keys.
for dataset, tables in schema.items():
    for table, cols in tables.items():
        for col in cols:
            name = col.get('name', '').lower()
            desc = col.get('description', '').lower()
            # Column names count as-is (short names like 'id' are skipped).
            if name and len(name) >= 3:
                words.append(name)
            if desc:
                for w in desc.split():
                    w = _NON_LETTER_RE.sub('', w)
                    if len(w) >= 3 and w not in STOPWORDS:
                        words.append(w)

word_freq = Counter(words)

wc = WordCloud(
    width=1600,
    height=800,
    background_color='white',
    max_words=200,
    colormap='viridis',
    min_font_size=8
).generate_from_frequencies(word_freq)

plt.figure(figsize=(20, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.savefig('docs/wordcloud_attributes.png', dpi=150, bbox_inches='tight')
print("Saved docs/wordcloud_attributes.png")
print(f"Total unique words: {len(word_freq)}")
print("Top 30:", word_freq.most_common(30))
|
||||
BIN
docs/dataviz/wordcloud_datasets.png
Normal file
BIN
docs/dataviz/wordcloud_datasets.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.3 MiB |
33
docs/dataviz/wordcloud_datasets.py
Normal file
33
docs/dataviz/wordcloud_datasets.py
Normal file
@@ -0,0 +1,33 @@
|
||||
#!/usr/bin/env python3
"""Generate a word cloud of dataset-name tokens.

Reads the Base dos Dados schema dump (context/basedosdados-schema.json),
splits each dataset identifier into underscore-separated tokens (after
removing the regional markers 'br_', 'mundo_', 'eu_'), and renders a
frequency-weighted word cloud to docs/wordcloud_datasets.png.
"""
import json
from collections import Counter

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Explicit encoding: dataset names/descriptions contain Portuguese text, so
# relying on the platform default encoding would break on e.g. Windows/cp1252.
with open('context/basedosdados-schema.json', encoding='utf-8') as f:
    schema = json.load(f)

dataset_names = []
for dataset in schema:
    # NOTE(review): str.replace removes these markers anywhere in the name,
    # not only as a leading prefix — acceptable here since the markers end
    # in '_' and dataset ids are snake_case, but confirm if ids can embed
    # them mid-name.
    parts = dataset.replace('br_', '').replace('mundo_', '').replace('eu_', '').split('_')
    # Drop very short tokens (e.g. 'rj', 'sp') that add noise to the cloud.
    dataset_names.extend([p for p in parts if len(p) >= 3])

word_freq = Counter(dataset_names)

wc = WordCloud(
    width=1600,
    height=800,
    background_color='white',
    max_words=100,
    colormap='plasma',
    min_font_size=10
).generate_from_frequencies(word_freq)

plt.figure(figsize=(20, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.savefig('docs/wordcloud_datasets.png', dpi=150, bbox_inches='tight')
print("Saved docs/wordcloud_datasets.png")
print(f"Total unique words: {len(word_freq)}")
print("Top 30:", word_freq.most_common(30))
|
||||
Reference in New Issue
Block a user