Files
baseldosdados/docs/dataviz/wordcloud_attributes.py
rafapolo 2446a0f78d docs: add census documentation and final research report
- Add 37 census documentation files for IBGE census datasets (1970-2010)
- Add dataviz wordcloud scripts and images
- Add relatorio_final.md with research findings on households and living conditions

New data from DuckDB queries:
- 90.7M households, 203M population
- 53.2% Black population
- 27.9% female-headed households
- 46.6% urban sewage without collection/treatment
- 15,816 favela sectors (2010)
- 68% Black population in Fortaleza
2026-03-30 22:03:55 +02:00

46 lines
1.6 KiB
Python

#!/usr/bin/env python3
import json
import re
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
STOPWORDS = {'de', 'do', 'da', 'a', 'ou', 'em', 'e', 'o', 'que', 'das', 'dos', 'nos', 'nas', 'um', 'uma', 'para', 'com', 'não', 'uma', 'à', 'ao', 'os', 'as', 'se', 'na', 'no', 'de', 'do', 'da', 'é', 'ser', 'seu', 'sua', 'isso', 'the', 'of', 'and', 'in', 'to', 'is', 'for', 'on', 'with', 'at', 'by', 'from'}
with open('context/basedosdados-schema.json') as f:
schema = json.load(f)
words = []
for dataset, tables in schema.items():
for table, cols in tables.items():
for col in cols:
name = col.get('name', '').lower()
desc = col.get('description', '').lower()
if name and len(name) >= 3:
words.append(name)
if desc:
for w in desc.split():
w = re.sub(r'[^a-záàâãéèêíìîóòôõúùûç]', '', w)
if len(w) >= 3 and w not in STOPWORDS:
words.append(w)
word_freq = Counter(words)
wc = WordCloud(
width=1600,
height=800,
background_color='white',
max_words=200,
colormap='viridis',
min_font_size=8
).generate_from_frequencies(word_freq)
plt.figure(figsize=(20, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.savefig('docs/wordcloud_attributes.png', dpi=150, bbox_inches='tight')
print("Saved docs/wordcloud_attributes.png")
print(f"Total unique words: {len(word_freq)}")
print("Top 30:", word_freq.most_common(30))