refactor: reorganize project structure and fix broken references
- Move scripts to scripts/ directory (roda.sh, prepara_db.py, etc.)
- Move shell config to shell/ directory (Caddyfile, auth.py, haloy.yml)
- Move basedosdados.duckdb to data/ directory
- Update Dockerfile and start.sh with new file paths
- Update README.md with correct script paths
- Remove Python ask.py (replaced by Rust binary in ask/ask)
- Add Rust source files (schema_filter.rs, sql_generator.rs, table_selector.rs)
- Remove sentence-transformer dependencies from ask
- Move docs and context artifacts to their directories
This commit is contained in:
45
docs/wordcloud_attributes.py
Normal file
45
docs/wordcloud_attributes.py
Normal file
@@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env python3
|
||||
import json
|
||||
import re
|
||||
from collections import Counter
|
||||
from wordcloud import WordCloud
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# Common Portuguese and English function words that are excluded from the
# description text (column names are NOT filtered against this set).
STOPWORDS = {
    'de', 'do', 'da', 'a', 'ou', 'em', 'e', 'o', 'que', 'das', 'dos', 'nos',
    'nas', 'um', 'uma', 'para', 'com', 'não', 'à', 'ao', 'os', 'as', 'se',
    'na', 'no', 'é', 'ser', 'seu', 'sua', 'isso',
    'the', 'of', 'and', 'in', 'to', 'is', 'for', 'on', 'with', 'at', 'by',
    'from',
}

SCHEMA_PATH = 'context/basedosdados-schema.json'
OUTPUT_PATH = 'docs/wordcloud_attributes.png'

# Words shorter than this are ignored, for both names and description tokens.
MIN_WORD_LENGTH = 3

# Matches every character that is not a lowercase (optionally accented)
# letter; compiled once because it is applied to every description token.
NON_LETTER_RE = re.compile(r'[^a-záàâãéèêíìîóòôõúùûç]')


def collect_words(schema: dict) -> list:
    """Return the words extracted from every column of *schema*.

    *schema* maps dataset -> table -> list of column dicts. For each column:

    * the lowercased ``name`` is kept as-is when it has at least
      ``MIN_WORD_LENGTH`` characters;
    * the lowercased ``description`` is split on whitespace, every token is
      stripped of non-letter characters, and tokens that are long enough and
      not in ``STOPWORDS`` are kept.

    Missing or ``null`` ``name``/``description`` values are treated as empty
    strings. Words are returned in encounter order, duplicates included.
    """
    words = []
    for tables in schema.values():
        for cols in tables.values():
            for col in cols:
                # `or ''` also covers keys that are present but null in the
                # JSON, which `.get(key, '')` alone would pass on as None.
                name = (col.get('name') or '').lower()
                desc = (col.get('description') or '').lower()

                if len(name) >= MIN_WORD_LENGTH:
                    words.append(name)

                for token in desc.split():
                    token = NON_LETTER_RE.sub('', token)
                    if len(token) >= MIN_WORD_LENGTH and token not in STOPWORDS:
                        words.append(token)
    return words


def main() -> None:
    """Build the attribute word cloud and save it as a PNG.

    Reads the schema from ``SCHEMA_PATH``, counts the collected words,
    renders the 200 most frequent ones and writes the image to
    ``OUTPUT_PATH``, then prints a short summary.
    """
    # The schema contains accented Portuguese text; do not rely on the
    # platform's default encoding.
    with open(SCHEMA_PATH, encoding='utf-8') as f:
        schema = json.load(f)

    word_freq = Counter(collect_words(schema))
    if not word_freq:
        # An empty frequency table cannot be rendered; fail with a clear
        # message instead of an error from inside the plotting library.
        raise SystemExit(f"No words found in {SCHEMA_PATH}")

    wc = WordCloud(
        width=1600,
        height=800,
        background_color='white',
        max_words=200,
        colormap='viridis',
        min_font_size=8,
    ).generate_from_frequencies(word_freq)

    plt.figure(figsize=(20, 10))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.savefig(OUTPUT_PATH, dpi=150, bbox_inches='tight')

    print(f"Saved {OUTPUT_PATH}")
    print(f"Total unique words: {len(word_freq)}")
    print("Top 30:", word_freq.most_common(30))


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user