feat(german): add german wordlist with data from SUBTLEX-DE
cargo devel CI / cargo CI (push) Has been cancelled Details

This commit is contained in:
Christoph J. Scherr 2024-07-25 16:31:42 +02:00
parent 82e0b49434
commit d651756265
3 changed files with 215174 additions and 0 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,18 @@
"""Build the small German wordlist from the full SUBTLEX-DE frequency data.

Loads the full word -> frequency mapping from JSON, keeps only the entries
whose frequency is at least ``FREQ_THRESHOLD`` and whose word is at most
``MAX_WORD_LENGTH`` characters long, and writes the result to a second JSON
file. Both paths are relative to the current working directory.
"""
import json
from typing import Dict

FULL_WORDLIST_PATH = '../data/wordlists/german_SUBTLEX-DE_full.json'
SMALL_WORDLIST_PATH = '../data/wordlists/german_SUBTLEX-DE_small.json'

# Minimum frequency a word must have to be kept (inclusive).
FREQ_THRESHOLD = 0.000001
# Maximum word length, as measured by len(), to be kept (inclusive).
MAX_WORD_LENGTH = 10


def filter_word_freqs(
    word_freqs: Dict[str, float],
    freq_threshold: float = FREQ_THRESHOLD,
    max_word_length: int = MAX_WORD_LENGTH,
) -> Dict[str, float]:
    """Return the entries of *word_freqs* that pass both filters.

    Args:
        word_freqs: Mapping of word to its frequency value.
        freq_threshold: Entries with a frequency below this are dropped.
        max_word_length: Entries whose word is longer than this are dropped.

    Returns:
        A new dict, in the input's iteration order, containing only the
        entries with ``freq >= freq_threshold`` and
        ``len(word) <= max_word_length``. The input is not modified.
    """
    return {
        word: freq
        for word, freq in word_freqs.items()
        if freq >= freq_threshold and len(word) <= max_word_length
    }


def main() -> None:
    """Read the full wordlist, filter it, and write the small wordlist."""
    # Pin UTF-8: the wordlist is German text, and the platform default
    # encoding is not UTF-8 everywhere.
    with open(FULL_WORDLIST_PATH, 'r', encoding='utf-8') as f:
        word_freqs = json.load(f)

    filtered_word_freqs = filter_word_freqs(word_freqs)

    # json.dump keeps its default ensure_ascii=True, so the output format is
    # the same as before: non-ASCII characters are written as \uXXXX escapes.
    with open(SMALL_WORDLIST_PATH, 'w', encoding='utf-8') as f:
        json.dump(filtered_word_freqs, f, indent=4)


# Only touch the filesystem when run as a script, not on import.
if __name__ == '__main__':
    main()