generated from PlexSheep/rs-base
19 lines
642 B
Python
19 lines
642 B
Python
|
import json
|
||
|
|
||
|
# Minimum frequency a word must have to be kept (inclusive).
freq_threshold = 0.000001

# Maximum number of characters a word may have to be kept (inclusive).
max_word_length = 10


def filter_word_freqs(word_freqs, min_freq=freq_threshold, max_length=max_word_length):
    """Return the entries of *word_freqs* that are frequent and short enough.

    Args:
        word_freqs: Mapping of word -> numeric frequency.
        min_freq: Entries with a frequency below this value are dropped.
        max_length: Entries whose word has more characters than this are
            dropped.

    Returns:
        A new dict with the surviving ``word: frequency`` pairs, in the
        iteration order of *word_freqs*. The input mapping is not modified.
    """
    return {
        word: freq
        for word, freq in word_freqs.items()
        if freq >= min_freq and len(word) <= max_length
    }


def main():
    """Load the full wordlist, filter it, and write the small wordlist.

    Both paths are relative to the current working directory.
    """
    # Explicit UTF-8: without it open() falls back to the locale's preferred
    # encoding, which can mis-decode non-ASCII words (umlauts, eszett) on
    # platforms where that is not UTF-8.
    # NOTE(review): assumes the input file is UTF-8 encoded - confirm.
    with open('../data/wordlists/german_SUBTLEX-DE_full.json', 'r', encoding='utf-8') as f:
        word_freqs = json.load(f)

    filtered_word_freqs = filter_word_freqs(word_freqs)

    # json.dump escapes non-ASCII characters by default, so the output bytes
    # are the same as before; the encoding is spelled out for symmetry.
    with open('../data/wordlists/german_SUBTLEX-DE_small.json', 'w', encoding='utf-8') as f:
        json.dump(filtered_word_freqs, f, indent=4)


if __name__ == "__main__":
    main()
|