refactor(wordlist): filter out all words not of specified length

This commit is contained in:
Christoph J. Scherr 2024-07-25 17:36:35 +02:00
parent 8f96f6662a
commit 47c2c9e892
3 changed files with 60 additions and 34 deletions

View File

@ -5,7 +5,7 @@ use serde_json;
use crate::error::WordlistError; use crate::error::WordlistError;
use super::{Word, WordList}; use super::{Word, WordList, WordMapInner};
pub const RAW_WORDLIST_BUNDLED_ENGLISH: &str = pub const RAW_WORDLIST_BUNDLED_ENGLISH: &str =
include_str!("../../data/wordlists/en_US_3b1b_freq_map.json"); include_str!("../../data/wordlists/en_US_3b1b_freq_map.json");
@ -18,6 +18,7 @@ pub const RAW_WORDLIST_PATH_GERMAN_SMALL: &str = "../../data/wordlists/german_SU
#[derive(Clone)] #[derive(Clone)]
pub struct BuiltinWList { pub struct BuiltinWList {
words: super::WordMap, words: super::WordMap,
name: String,
} }
impl BuiltinWList { impl BuiltinWList {
@ -33,31 +34,48 @@ impl BuiltinWList {
/// ///
/// Where the number is the frequency. Upper/lower case is ignored. /// Where the number is the frequency. Upper/lower case is ignored.
/// ///
/// Only words with the specified length will be included.
///
/// ## Errors /// ## Errors
/// ///
/// Will fail if the file path cannot be read or the format is wrong. /// Will fail if the file path cannot be read or the format is wrong.
pub fn load<P: AsRef<std::path::Path>>(wl_path: P) -> Result<Self, WordlistError> { pub fn load<P: AsRef<std::path::Path>>(wl_path: P, len: usize) -> Result<Self, WordlistError> {
let path: &Path = wl_path.as_ref(); let path: &Path = wl_path.as_ref();
let file = std::fs::File::open(path)?; let file = std::fs::File::open(path)?;
// don't load the whole string into memory // don't load the whole string into memory
let reader = std::io::BufReader::new(file); let reader = std::io::BufReader::new(file);
let words: super::WordMap = serde_json::from_reader(reader)?; let mut words: super::WordMap = serde_json::from_reader(reader)?;
words.only_words_with_len(len);
Ok(Self { words }) let name: String = if let Some(osstr) = path.file_name() {
osstr.to_str().unwrap_or("(no name)").to_string()
} else {
"(no name)".to_string()
};
Ok(Self { words, name })
} }
pub fn english() -> Self { pub fn english(len: usize) -> Self {
let words: super::WordMap = serde_json::from_str(RAW_WORDLIST_BUNDLED_ENGLISH).unwrap(); let mut words: super::WordMap = serde_json::from_str(RAW_WORDLIST_BUNDLED_ENGLISH).unwrap();
words.only_words_with_len(len);
Self { words } Self {
words,
name: "(builtin english)".to_string(),
}
} }
pub fn german() -> Self { pub fn german(len: usize) -> Self {
let words: super::WordMap = let mut words: super::WordMap =
serde_json::from_str(RAW_WORDLIST_BUNDLED_GERMAN_SMALL).unwrap(); serde_json::from_str(RAW_WORDLIST_BUNDLED_GERMAN_SMALL).unwrap();
words.only_words_with_len(len);
Self { words } Self {
words,
name: "(builtin german)".to_string(),
}
} }
} }
@ -75,36 +93,27 @@ impl super::WordList for BuiltinWList {
impl Default for BuiltinWList { impl Default for BuiltinWList {
fn default() -> Self { fn default() -> Self {
Self::english() Self::english(5)
} }
} }
impl Debug for BuiltinWList { impl Debug for BuiltinWList {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write( f.debug_struct("BuiltinWList")
f, .field("name", &self.name)
format_args!( .field("words", &self.words)
"BuiltinWList {{ \n\ .finish()
\tamount: {}, \n\
\ttotal_freq: {}, \n\
\tcommon: {}, \n\
\tthreshold: {}, \n\
\tfreq_range: {:?}, \n\
\tover_threshold: {:#?}, \n\
}}",
self.amount(),
self.total_freq(),
self.wordmap().n_common(),
self.wordmap().threshold(),
self.wordmap().freq_range(),
self.over_threashold()
),
)
} }
} }
impl Display for BuiltinWList { impl Display for BuiltinWList {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{self:#?}") writeln!(
f,
"{}:\nwords:\t{}\ntop 5:\t{:?}",
self.name,
self.len(),
self.n_most_likely(5)
)
} }
} }

View File

@ -11,7 +11,7 @@ pub mod builtin;
pub mod word; pub mod word;
use word::*; use word::*;
use crate::error::{Error, WResult, WordlistError}; use crate::error::{WResult, WordlistError};
pub type AnyWordlist = Box<dyn WordList>; pub type AnyWordlist = Box<dyn WordList>;
@ -35,9 +35,14 @@ pub trait WordList: Clone + std::fmt::Debug + Default + Sync + Display {
(w.0.clone(), *w.1) (w.0.clone(), *w.1)
} }
fn length_range(&self) -> impl RangeBounds<usize>; fn length_range(&self) -> impl RangeBounds<usize>;
fn amount(&self) -> usize { #[must_use]
fn len(&self) -> usize {
self.solutions().len() self.solutions().len()
} }
/// Returns `true` when this word list contains no solutions.
///
/// Defined in terms of [`len`](Self::len) so that the two methods always agree, even when
/// an implementor overrides `len`.
#[must_use]
fn is_empty(&self) -> bool {
    self.len() == 0
}
fn wordmap(&self) -> &WordMap; fn wordmap(&self) -> &WordMap;
fn total_freq(&self) -> Frequency { fn total_freq(&self) -> Frequency {
self.wordmap().values().map(|a| a.to_owned()).sum() self.wordmap().values().map(|a| a.to_owned()).sum()

View File

@ -11,12 +11,13 @@ pub type Frequency = f64;
pub type Word = String; pub type Word = String;
pub type WordData = (Word, Frequency); pub type WordData = (Word, Frequency);
pub type WordDataRef<'wl> = (&'wl Word, &'wl Frequency); pub type WordDataRef<'wl> = (&'wl Word, &'wl Frequency);
pub(crate) type WordMapInner = HashMap<Word, Frequency>;
#[derive(Clone)] #[derive(Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct WordMap { pub struct WordMap {
#[serde(flatten)] #[serde(flatten)]
inner: HashMap<Word, Frequency>, inner: WordMapInner,
} }
impl Default for WordMap { impl Default for WordMap {
@ -71,6 +72,9 @@ impl WordMap {
pub fn inner(&self) -> &HashMap<Word, Frequency> { pub fn inner(&self) -> &HashMap<Word, Frequency> {
&self.inner &self.inner
} }
/// Returns a mutable reference to the underlying map from words to their frequencies,
/// so callers can insert, remove, or change entries in place.
pub fn inner_mut(&mut self) -> &mut HashMap<Word, Frequency> {
&mut self.inner
}
pub fn get<I: std::fmt::Display>(&self, word: I) -> Option<WordData> { pub fn get<I: std::fmt::Display>(&self, word: I) -> Option<WordData> {
self.inner self.inner
.get(&word.to_string()) .get(&word.to_string())
@ -82,6 +86,14 @@ impl WordMap {
abs.into_iter().map(|p| (p.0, p.1 as f64 / n)).collect(); abs.into_iter().map(|p| (p.0, p.1 as f64 / n)).collect();
relative.into() relative.into()
} }
pub fn only_words_with_len(&mut self, len: usize) {
self.inner = self
.inner
.iter()
.filter(|a| a.0.len() == len)
.map(|a| (a.0.to_owned(), *a.1))
.collect::<WordMapInner>();
}
} }
impl std::fmt::Debug for WordMap { impl std::fmt::Debug for WordMap {