From 47c2c9e892cf50ed5d5d7e6ddd6c10f607647891 Mon Sep 17 00:00:00 2001 From: "Christoph J. Scherr" Date: Thu, 25 Jul 2024 17:36:35 +0200 Subject: [PATCH] refactor(wordlist): filter out all words not of specified length --- src/wlist/builtin.rs | 71 +++++++++++++++++++++++++------------------- src/wlist/mod.rs | 9 ++++-- src/wlist/word.rs | 14 ++++++++- 3 files changed, 60 insertions(+), 34 deletions(-) diff --git a/src/wlist/builtin.rs b/src/wlist/builtin.rs index bd509ed..4ee4135 100644 --- a/src/wlist/builtin.rs +++ b/src/wlist/builtin.rs @@ -5,7 +5,7 @@ use serde_json; use crate::error::WordlistError; -use super::{Word, WordList}; +use super::{Word, WordList, WordMapInner}; pub const RAW_WORDLIST_BUNDLED_ENGLISH: &str = include_str!("../../data/wordlists/en_US_3b1b_freq_map.json"); @@ -18,6 +18,7 @@ pub const RAW_WORDLIST_PATH_GERMAN_SMALL: &str = "../../data/wordlists/german_SU #[derive(Clone)] pub struct BuiltinWList { words: super::WordMap, + name: String, } impl BuiltinWList { @@ -33,31 +34,48 @@ impl BuiltinWList { /// /// Where the number is the frequency. Higher/Lower case is ignored. /// + /// Only words with the specified length will be included. + /// /// ## Errors /// /// Will fail if the file path cannot be read or the format is wrong. - pub fn load>(wl_path: P) -> Result { + pub fn load>(wl_path: P, len: usize) -> Result { let path: &Path = wl_path.as_ref(); let file = std::fs::File::open(path)?; // don't load the whole string into memory let reader = std::io::BufReader::new(file); - let words: super::WordMap = serde_json::from_reader(reader)?; + let mut words: super::WordMap = serde_json::from_reader(reader)?; + words.only_words_with_len(len); - Ok(Self { words }) + let name: String = if let Some(osstr) = path.file_name() { + osstr.to_str().unwrap_or("(no name)").to_string() + } else { + "(no name)".to_string() + }; + + Ok(Self { words, name }) } - pub fn english() -> Self { - let words: super::WordMap = serde_json::from_str(RAW_WORDLIST_BUNDLED_ENGLISH).unwrap(); + pub fn english(len: usize) -> Self { + let mut words: super::WordMap = serde_json::from_str(RAW_WORDLIST_BUNDLED_ENGLISH).unwrap(); + words.only_words_with_len(len); - Self { words } + Self { + words, + name: "(builtin english)".to_string(), + } } - pub fn german() -> Self { - let words: super::WordMap = + pub fn german(len: usize) -> Self { + let mut words: super::WordMap = serde_json::from_str(RAW_WORDLIST_BUNDLED_GERMAN_SMALL).unwrap(); + words.only_words_with_len(len); - Self { words } + Self { + words, + name: "(builtin german)".to_string(), + } } } @@ -75,36 +93,27 @@ impl super::WordList for BuiltinWList { impl Default for BuiltinWList { fn default() -> Self { - Self::english() + Self::english(5) } } impl Debug for BuiltinWList { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write( - f, - format_args!( - "BuiltinWList {{ \n\ - \tamount: {}, \n\ - \ttotal_freq: {}, \n\ - \tcommon: {}, \n\ - \tthreshold: {}, \n\ - \tfreq_range: {:?}, \n\ - \tover_threshold: {:#?}, \n\ - }}", - self.amount(), - self.total_freq(), - self.wordmap().n_common(), - self.wordmap().threshold(), - self.wordmap().freq_range(), - self.over_threashold() - ), - ) + f.debug_struct("BuiltinWList") + .field("name", &self.name) + .field("words", &self.words) + .finish() } } impl Display for BuiltinWList { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{self:#?}") + writeln!( + f, + "{}:\nwords:\t{}\ntop 5:\t{:?}", + self.name, + self.len(), + self.n_most_likely(5) + ) } } diff --git a/src/wlist/mod.rs b/src/wlist/mod.rs index fb36b70..e529758 100644 --- a/src/wlist/mod.rs +++ b/src/wlist/mod.rs @@ -11,7 +11,7 @@ pub mod builtin; pub mod word; use word::*; -use crate::error::{Error, WResult, WordlistError}; +use crate::error::{WResult, WordlistError}; pub type AnyWordlist = Box; @@ -35,9 +35,14 @@ pub trait WordList: Clone + std::fmt::Debug + Default + Sync + Display { (w.0.clone(), *w.1) } fn length_range(&self) -> impl RangeBounds; - fn amount(&self) -> usize { + #[must_use] + fn len(&self) -> usize { self.solutions().len() } + #[must_use] + fn is_empty(&self) -> bool { + self.solutions().len() == 0 + } fn wordmap(&self) -> &WordMap; fn total_freq(&self) -> Frequency { self.wordmap().values().map(|a| a.to_owned()).sum() diff --git a/src/wlist/word.rs b/src/wlist/word.rs index 22d666b..c39e51d 100644 --- a/src/wlist/word.rs +++ b/src/wlist/word.rs @@ -11,12 +11,13 @@ pub type Frequency = f64; pub type Word = String; pub type WordData = (Word, Frequency); pub type WordDataRef<'wl> = (&'wl Word, &'wl Frequency); +pub(crate) type WordMapInner = HashMap; #[derive(Clone)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct WordMap { #[serde(flatten)] - inner: HashMap, + inner: WordMapInner, } impl Default for WordMap { @@ -71,6 +72,9 @@ impl WordMap { pub fn inner(&self) -> &HashMap { &self.inner } + pub fn inner_mut(&mut self) -> &mut HashMap { + &mut self.inner + } pub fn get(&self, word: I) -> Option { self.inner .get(&word.to_string()) @@ -82,6 +86,14 @@ impl WordMap { abs.into_iter().map(|p| (p.0, p.1 as f64 / n)).collect(); relative.into() } + pub fn only_words_with_len(&mut self, len: usize) { + self.inner = self + .inner + .iter() + .filter(|a| a.0.len() == len) + .map(|a| (a.0.to_owned(), *a.1)) + .collect::(); + } } impl std::fmt::Debug for WordMap {