refactor(wordlist): filter out all words not of specified length

This commit is contained in:
Christoph J. Scherr 2024-07-25 17:36:35 +02:00
parent 8f96f6662a
commit 47c2c9e892
3 changed files with 60 additions and 34 deletions

View File

@ -5,7 +5,7 @@ use serde_json;
use crate::error::WordlistError;
use super::{Word, WordList};
use super::{Word, WordList, WordMapInner};
/// Raw JSON text of the bundled English word list, embedded into the binary at
/// compile time via `include_str!`.
pub const RAW_WORDLIST_BUNDLED_ENGLISH: &str =
include_str!("../../data/wordlists/en_US_3b1b_freq_map.json");
@ -18,6 +18,7 @@ pub const RAW_WORDLIST_PATH_GERMAN_SMALL: &str = "../../data/wordlists/german_SU
/// Word list backed by a [`super::WordMap`], together with a display name.
#[derive(Clone)]
pub struct BuiltinWList {
/// Words of this list and their frequencies.
words: super::WordMap,
/// Label telling where the list came from: the file name of a loaded list or
/// a fixed "(builtin ...)" tag for the bundled ones.
name: String,
}
impl BuiltinWList {
@ -33,31 +34,48 @@ impl BuiltinWList {
///
/// Where the number is the frequency. Higher/Lower case is ignored.
///
/// Only words with the specified length will be included.
///
/// ## Errors
///
/// Will fail if the file path cannot be read or the format is wrong.
// NOTE(review): this view interleaves pre-change and post-change lines of the
// commit without +/- markers. The signature without `len`, the immutable
// `words` binding and `Ok(Self { words })` appear to be the removed lines.
pub fn load<P: AsRef<std::path::Path>>(wl_path: P) -> Result<Self, WordlistError> {
pub fn load<P: AsRef<std::path::Path>>(wl_path: P, len: usize) -> Result<Self, WordlistError> {
let path: &Path = wl_path.as_ref();
let file = std::fs::File::open(path)?;
// don't load the whole string into memory
let reader = std::io::BufReader::new(file);
let words: super::WordMap = serde_json::from_reader(reader)?;
let mut words: super::WordMap = serde_json::from_reader(reader)?;
// Drop every entry whose word length differs from `len`.
words.only_words_with_len(len);
Ok(Self { words })
// The display name is the last path component; "(no name)" is used when the
// path has no file name or the file name is not valid UTF-8.
let name: String = if let Some(osstr) = path.file_name() {
osstr.to_str().unwrap_or("(no name)").to_string()
} else {
"(no name)".to_string()
};
Ok(Self { words, name })
}
/// Builds the list from the bundled English JSON (`RAW_WORDLIST_BUNDLED_ENGLISH`).
/// The variant taking `len` keeps only the words of that length and names the
/// list "(builtin english)".
///
/// ## Panics
///
/// Panics if the embedded JSON does not deserialize into a word map.
// NOTE(review): pre-change and post-change lines are interleaved here without
// +/- markers; the zero-argument signature, the immutable `words` binding and
// the bare `Self { words }` appear to be the removed lines.
pub fn english() -> Self {
let words: super::WordMap = serde_json::from_str(RAW_WORDLIST_BUNDLED_ENGLISH).unwrap();
pub fn english(len: usize) -> Self {
let mut words: super::WordMap = serde_json::from_str(RAW_WORDLIST_BUNDLED_ENGLISH).unwrap();
// Drop every entry whose word length differs from `len`.
words.only_words_with_len(len);
Self { words }
Self {
words,
name: "(builtin english)".to_string(),
}
}
/// Builds the list from the bundled German JSON
/// (`RAW_WORDLIST_BUNDLED_GERMAN_SMALL`). The variant taking `len` keeps only
/// the words of that length and names the list "(builtin german)".
///
/// ## Panics
///
/// Panics if the embedded JSON does not deserialize into a word map.
// NOTE(review): pre-change and post-change lines are interleaved here without
// +/- markers; the zero-argument signature, the immutable `words` binding and
// the bare `Self { words }` appear to be the removed lines.
pub fn german() -> Self {
let words: super::WordMap =
pub fn german(len: usize) -> Self {
let mut words: super::WordMap =
serde_json::from_str(RAW_WORDLIST_BUNDLED_GERMAN_SMALL).unwrap();
// Drop every entry whose word length differs from `len`.
words.only_words_with_len(len);
Self { words }
Self {
words,
name: "(builtin german)".to_string(),
}
}
}
@ -75,36 +93,27 @@ impl super::WordList for BuiltinWList {
/// The default list is the bundled English one; the `english(5)` call limits it
/// to words of length 5.
// NOTE(review): `Self::english()` appears to be the pre-change line and
// `Self::english(5)` its replacement; both are shown because this view
// interleaves the diff without +/- markers.
impl Default for BuiltinWList {
fn default() -> Self {
Self::english()
Self::english(5)
}
}
/// Debug formatting for the list.
// NOTE(review): two implementations are interleaved here without +/- markers.
// The `write(f, format_args!(..))` call appears to be the removed hand-written
// summary (amount, total frequency, common count, threshold, frequency range,
// entries over the threshold); the `debug_struct` chain after it appears to be
// its replacement.
impl Debug for BuiltinWList {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write(
f,
format_args!(
"BuiltinWList {{ \n\
\tamount: {}, \n\
\ttotal_freq: {}, \n\
\tcommon: {}, \n\
\tthreshold: {}, \n\
\tfreq_range: {:?}, \n\
\tover_threshold: {:#?}, \n\
}}",
self.amount(),
self.total_freq(),
self.wordmap().n_common(),
self.wordmap().threshold(),
self.wordmap().freq_range(),
self.over_threashold()
),
)
// Prints the `name` and `words` fields through their own `Debug` impls.
f.debug_struct("BuiltinWList")
.field("name", &self.name)
.field("words", &self.words)
.finish()
}
}
/// User-facing summary of the list.
// NOTE(review): `write!(f, "{self:#?}")` appears to be the removed line that
// forwarded to the pretty `Debug` output; the `writeln!` call appears to be its
// replacement. Both are shown because the diff is interleaved.
impl Display for BuiltinWList {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{self:#?}")
// Three lines, terminated by a newline: the list name, the word count from
// `len()`, and the `Debug` rendering of `n_most_likely(5)`.
writeln!(
f,
"{}:\nwords:\t{}\ntop 5:\t{:?}",
self.name,
self.len(),
self.n_most_likely(5)
)
}
}

View File

@ -11,7 +11,7 @@ pub mod builtin;
pub mod word;
use word::*;
use crate::error::{Error, WResult, WordlistError};
use crate::error::{WResult, WordlistError};
pub type AnyWordlist = Box<dyn WordList>;
@ -35,9 +35,14 @@ pub trait WordList: Clone + std::fmt::Debug + Default + Sync + Display {
(w.0.clone(), *w.1)
}
fn length_range(&self) -> impl RangeBounds<usize>;
/// Number of entries returned by `solutions()`.
// NOTE(review): `amount` appears to be the pre-change name of this method and
// `len` (with `#[must_use]`) the post-change one; both header lines are shown
// because this view interleaves the diff without +/- markers.
fn amount(&self) -> usize {
#[must_use]
fn len(&self) -> usize {
self.solutions().len()
}
/// Reports whether `solutions()` yields no entries at all.
#[must_use]
fn is_empty(&self) -> bool {
    let solution_count = self.solutions().len();
    solution_count == 0
}
fn wordmap(&self) -> &WordMap;
fn total_freq(&self) -> Frequency {
self.wordmap().values().map(|a| a.to_owned()).sum()

View File

@ -11,12 +11,13 @@ pub type Frequency = f64;
pub type Word = String;
pub type WordData = (Word, Frequency);
pub type WordDataRef<'wl> = (&'wl Word, &'wl Frequency);
/// Storage type used inside [`WordMap`]: each word mapped to its frequency.
pub(crate) type WordMapInner = HashMap<Word, Frequency>;
/// Wrapper around a map from word to frequency.
#[derive(Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct WordMap {
// With `flatten`, the (de)serialized form is the bare word -> frequency object
// rather than an object with an `inner` key.
// NOTE(review): the derive above is gated on feature "serde" but this
// attribute is not; confirm the crate still builds with that feature off.
#[serde(flatten)]
// NOTE(review): the two `inner` lines appear to be the pre-change and
// post-change spelling of the same field; `WordMapInner` is an alias for
// `HashMap<Word, Frequency>`.
inner: HashMap<Word, Frequency>,
inner: WordMapInner,
}
impl Default for WordMap {
@ -71,6 +72,9 @@ impl WordMap {
/// Shared reference to the underlying word -> frequency map.
pub fn inner(&self) -> &HashMap<Word, Frequency> {
&self.inner
}
/// Mutable reference to the underlying word -> frequency map, for callers that
/// need to insert, remove or adjust entries directly.
pub fn inner_mut(&mut self) -> &mut HashMap<Word, Frequency> {
&mut self.inner
}
pub fn get<I: std::fmt::Display>(&self, word: I) -> Option<WordData> {
self.inner
.get(&word.to_string())
@ -82,6 +86,14 @@ impl WordMap {
abs.into_iter().map(|p| (p.0, p.1 as f64 / n)).collect();
relative.into()
}
pub fn only_words_with_len(&mut self, len: usize) {
self.inner = self
.inner
.iter()
.filter(|a| a.0.len() == len)
.map(|a| (a.0.to_owned(), *a.1))
.collect::<WordMapInner>();
}
}
impl std::fmt::Debug for WordMap {