From 92911809363aa463e28d112084b0d7e745802da6 Mon Sep 17 00:00:00 2001 From: Walter Oggioni Date: Mon, 18 Dec 2023 11:40:03 +0800 Subject: [PATCH] module refactor --- Cargo.toml | 17 +-- examples/benchmark.rs | 59 ++++++++ examples/levtree_example.rs | 50 ------- src/keychecker.rs | 26 ++++ src/levtree/keychecker.rs | 27 ---- src/levtree/levtrie.rs | 191 -------------------------- src/levtree/result.rs | 59 -------- src/levtree/trie.rs | 243 -------------------------------- src/levtrie.rs | 196 ++++++++++++++++++++++++++ src/lib.rs | 25 ++++ src/{levtree => }/mod.rs | 0 src/result.rs | 50 +++++++ src/trie.rs | 251 ++++++++++++++++++++++++++++++++++ src/{levtree => }/trienode.rs | 37 +++-- 14 files changed, 640 insertions(+), 591 deletions(-) create mode 100644 examples/benchmark.rs delete mode 100644 examples/levtree_example.rs create mode 100644 src/keychecker.rs delete mode 100644 src/levtree/keychecker.rs delete mode 100644 src/levtree/levtrie.rs delete mode 100644 src/levtree/result.rs delete mode 100644 src/levtree/trie.rs create mode 100644 src/levtrie.rs create mode 100644 src/lib.rs rename src/{levtree => }/mod.rs (100%) create mode 100644 src/result.rs create mode 100644 src/trie.rs rename src/{levtree => }/trienode.rs (52%) diff --git a/Cargo.toml b/Cargo.toml index a53a7bc..8a4a2ae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,17 +13,14 @@ sealed = "0.5" name = "levtree" crate-type = ["lib"] bench = false -path = "src/levtree/mod.rs" -[[bin]] -name = "linked_list" -path = "src/linked_list.rs" [[example]] -name = "levtree_example" -path = "examples/levtree_example.rs" - -[[bin]] -name = "test" -path = "src/test.rs" +name = "levtree_benchmark" +path = "examples/benchmark.rs" +[profile.release] +strip = true +lto = true +debug-assertions = false +codegen-units = 1 \ No newline at end of file diff --git a/examples/benchmark.rs b/examples/benchmark.rs new file mode 100644 index 0000000..e52fd66 --- /dev/null +++ b/examples/benchmark.rs @@ -0,0 +1,59 @@ +extern crate levtree; + + +use levtree::CaseSensitiveKeyChecker; +use levtree::CaseSensitiveLevTrie; +use levtree::DamerauLevenshteinDistanceCalculator; +use levtree::LevTrie; +use levtree::Trie; + +use std::io::BufRead; +use std::io::BufReader; + +trait IntoCharSlice { + fn into_char_slice(&self) -> Vec; +} + +impl IntoCharSlice for str { + fn into_char_slice(&self) -> Vec { + self.chars().into_iter().collect::>() + } +} + +fn main() { + let bytes = include_bytes!("cracklib-small"); + let reader = BufReader::new(&bytes[..]); + let mut trie: CaseSensitiveLevTrie = LevTrie::new(); + reader + .lines() + .map(|line| line.unwrap()) + .for_each(|word: String| { + trie.add(word.chars()); + }); + + let keys = [ + "camel", + "coriolis", + "mattel", + "cruzer", + "cpoper", + "roublesoot", + ]; + + for _ in 0..50 { + for key in keys { + let word = &key.into_char_slice()[..]; + trie.fuzzy_search::(word, 6); + } + } + + for key in keys { + let word = &key.into_char_slice()[..]; + let results = trie.fuzzy_search::(word, 6); + for result in results { + let word: String = trie.lineal_descendant(result.word).into_iter().collect(); + println!("distance: {}, wordkey: {}", result.distance, word); + } + println!("") + } +} diff --git a/examples/levtree_example.rs b/examples/levtree_example.rs deleted file mode 100644 index 214b710..0000000 --- a/examples/levtree_example.rs +++ /dev/null @@ -1,50 +0,0 @@ -extern crate levtree; - -use levtree::LevTrie; -use levtree::Trie; -use levtree::CaseSensitiveKeyChecker; -use levtree::CaseSensitiveLevTrie; -use levtree::DamerauLevenshteinDistanceCalculator; - -use std::io::BufReader; -use std::io::BufRead; - -trait IntoCharSlice { - fn into_char_slice(&self) -> Vec; -} - -impl IntoCharSlice for str { - fn into_char_slice(&self) -> Vec { - self.chars().into_iter().collect::>() - } -} - -fn main() { - let bytes = include_bytes!("cracklib-small"); - let reader = BufReader::new(&bytes[..]); - let mut trie : CaseSensitiveLevTrie = LevTrie::new(); - reader.lines() - .map(|line| line.unwrap()) - .for_each(|word : String| { - trie.add(word.chars()); - }); - - let keys = ["camel", "coriolis", "mattel", "cruzer", "cpoper", "roublesoot"]; - - for _ in 0..50 { - for key in keys { - let word = &key.into_char_slice()[..]; - trie.fuzzy_search::(word, 6); - } - } - - for key in keys { - let word = &key.into_char_slice()[..]; - let results = trie.fuzzy_search::(word, 6); - for result in results { - let word : String = trie.lineal_descendant(result.word).into_iter().collect(); - println!("distance: {}, wordkey: {}", result.distance, word); - } - println!("") - } -} diff --git a/src/keychecker.rs b/src/keychecker.rs new file mode 100644 index 0000000..2fad8de --- /dev/null +++ b/src/keychecker.rs @@ -0,0 +1,26 @@ +use super::trienode::TrieKey; + +pub trait KeyChecker +where + KEY: TrieKey, +{ + fn check(k1: Option, k2: Option) -> bool; +} + +pub struct CaseInsensitiveKeyChecker {} + +impl KeyChecker for CaseInsensitiveKeyChecker { + fn check(k1: Option, k2: Option) -> bool { + k1.zip(k2) + .map(|(v1, v2)| v1.to_lowercase().next() == v2.to_lowercase().next()) + .unwrap_or_else(|| k1 == k2) + } +} + +pub struct CaseSensitiveKeyChecker {} + +impl KeyChecker for CaseSensitiveKeyChecker { + fn check(k1: Option, k2: Option) -> bool { + k1 == k2 + } +} diff --git a/src/levtree/keychecker.rs b/src/levtree/keychecker.rs deleted file mode 100644 index 6f68ae9..0000000 --- a/src/levtree/keychecker.rs +++ /dev/null @@ -1,27 +0,0 @@ -use super::trienode::TrieKey; - -pub trait KeyChecker - where KEY : TrieKey { - fn check(k1 : Option, k2 : Option) -> bool; -} - -pub struct CaseInsensitiveKeyChecker {} - -impl KeyChecker for CaseInsensitiveKeyChecker { - fn check(k1 : Option, k2 : Option) -> bool { - k1.zip(k2) - .map(| (v1, v2) | { - v1.to_lowercase().next() == v2.to_lowercase().next() - }) - .unwrap_or_else(|| { k1 == k2} ) - } -} - -pub struct CaseSensitiveKeyChecker {} - -impl KeyChecker for CaseSensitiveKeyChecker { - fn check(k1 : Option, k2 : Option) -> bool { - k1 == k2 - } -} - diff --git a/src/levtree/levtrie.rs b/src/levtree/levtrie.rs deleted file mode 100644 index 162fad5..0000000 --- a/src/levtree/levtrie.rs +++ /dev/null @@ -1,191 +0,0 @@ -extern crate sealed; -use std::collections::BTreeSet; -use self::sealed::sealed; - -use super::trie::Trie; -use super::trie::VisitOutcome; -use super::trienode::TrieNode; -use super::trienode::TrieKey; -use super::keychecker::KeyChecker; -use super::result::Result; - -pub type LevTrie = Trie; -pub type LevTrieNode = TrieNode; - - -#[sealed] -pub trait DistanceCalculator - where KEY : TrieKey, - KEYCHECKER : KeyChecker, -{ - fn compute<>( - workspace: &mut Vec>, - nodes: &Vec>, - stack: &Vec, - wordkey : &[KEY], - worst_case : Option) -> VisitOutcome; -} - -impl LevTrie -where KEY : TrieKey, KEYCHECKER : KeyChecker { - pub fn new() -> LevTrie { - Trie::empty(|| {}) - } - - pub fn from_words(wordlist : U) -> LevTrie - where T : IntoIterator, U : IntoIterator - { - let mut result = LevTrie::new(); - for word in wordlist { - result.add(word); - } - result - } - - pub fn fuzzy_search(&mut self, word: &[KEY], max_result: usize) -> BTreeSet - where DC : DistanceCalculator { - let word_len = word.into_iter().count(); - let mut workspace : &mut Vec> = &mut (0..self.nodes()).map(|_| { Vec::new() }).collect(); - let mut results = BTreeSet::new(); - let required_size= word_len + 1; - let visit_pre = |stack : &Vec| -> VisitOutcome { - let stack_size = stack.len(); - let current_node_id = *stack.last().unwrap(); - let payload = &mut workspace[current_node_id]; - payload.resize(required_size, usize::default()); - if stack_size == 1 { - for i in 0..required_size { - payload[i] = i; - } - } else { - for i in 0..required_size { - payload[i] = if i == 0 { - stack_size - 1 - } else { - 0 - } - } - } - if stack_size > 1 { - let current_node = &mut self.get_node(current_node_id); - if current_node.key.is_none() { - let distance = workspace[stack[stack_size - 2]][word_len]; - results.insert(Result { - distance: distance, - word: current_node_id - }); - if results.len() > max_result { - results.pop_last(); - } - VisitOutcome::Skip - } else { - let worst_case = results.last() - .filter(|_| { results.len() == max_result }) - .map(|it| { it.distance }); - DC::compute(&mut workspace, &self.nodes, stack, word, worst_case) - } - } else { - VisitOutcome::Continue - } - }; - let visit_post = |_ : &Vec| { - }; - self.walk(visit_pre, visit_post); - results - } -} - - -pub struct LevenshteinDistanceCalculator {} - -#[sealed] -impl DistanceCalculator for LevenshteinDistanceCalculator - where - KEY : TrieKey, - KEYCHECKER : KeyChecker -{ - - fn compute<>( - workspace: &mut Vec>, - nodes: &Vec>, - stack: &Vec, - wordkey : &[KEY], - worst_case : Option) -> VisitOutcome { - let sz = stack.len(); - let key_size = wordkey.into_iter().count(); - for i in 1..=key_size { - if KEYCHECKER::check(Some(wordkey[i - 1]), nodes[stack[sz - 1]].key) { - workspace[stack[sz - 1]][i] = workspace[stack[sz - 2]][i - 1]; - } else { - workspace[stack[sz - 1]][i] = std::cmp::min( - std::cmp::min( - workspace[stack[sz - 1]][i - 1], - workspace[stack[sz - 2]][i -1] - ), - workspace[stack[sz - 2]][i] - ) + 1; - } - } - let condition = worst_case.map( - |wv| { - wv <= *workspace[stack[sz - 1]][..].into_iter().min().unwrap() - }).unwrap_or(false); - if condition { - VisitOutcome::Skip - } else { - VisitOutcome::Continue - } - } -} - -pub struct DamerauLevenshteinDistanceCalculator {} - -#[sealed] -impl DistanceCalculator for DamerauLevenshteinDistanceCalculator - where - KEY : TrieKey, - KEYCHECKER : KeyChecker -{ - - fn compute<>( - workspace: &mut Vec>, - nodes: &Vec>, - stack: &Vec, - wordkey : &[KEY], - worst_case : Option) -> VisitOutcome { - let sz = stack.len(); - let key_size = wordkey.into_iter().count(); - for i in 1..=key_size { - if KEYCHECKER::check(Some(wordkey[i - 1]), - stack.last().and_then(|it| {nodes[*it].key})) { - workspace[stack[sz - 1]][i] = workspace[stack[sz - 2]][i - 1]; - } else { - workspace[stack[sz - 1]][i] = std::cmp::min( - std::cmp::min( - workspace[stack[sz - 1]][i - 1], - workspace[stack[sz - 2]][i - 1] - ), - workspace[stack[sz - 2]][i] - ) + 1; - } - if sz > 2 && - i > 1 && - KEYCHECKER::check(Some(wordkey[i - 2]), nodes[stack[sz - 1]].key) && - KEYCHECKER::check(Some(wordkey[i - 1]), nodes[stack[sz - 2]].key) { - workspace[stack[sz - 1]][i] = std::cmp::min( - workspace[stack[sz - 1]][i], - workspace[stack[sz - 3]][i - 2] + 1, - ); - } - } - let condition = worst_case.map( - |wv| { - wv <= *workspace[stack[sz - 2]][..].into_iter().min().unwrap() - }).unwrap_or(false); - if condition { - VisitOutcome::Skip - } else { - VisitOutcome::Continue - } - } -} \ No newline at end of file diff --git a/src/levtree/result.rs b/src/levtree/result.rs deleted file mode 100644 index b50b71e..0000000 --- a/src/levtree/result.rs +++ /dev/null @@ -1,59 +0,0 @@ -use std::cmp::Ordering; - -pub struct Result { - pub word: usize, - pub distance: usize, -} - -impl PartialOrd for Result { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.distance.cmp(&other.distance)) - .filter(|it| { it != &Ordering::Equal}) - .or_else(|| { Some(self.word.cmp(&other.word)) }) - } -} - -impl PartialEq for Result { - fn eq(&self, other: &Self) -> bool { - self.distance == other.distance && self.word == other.word - } -} - -impl Eq for Result { -} - -impl Ord for Result { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - match self.distance.cmp(&other.distance) { - std::cmp::Ordering::Equal => { - self.word.cmp(&other.word) - } - std::cmp::Ordering::Greater => { - std::cmp::Ordering::Greater - } - std::cmp::Ordering::Less => { - std::cmp::Ordering::Less - } - } - } -} - - - -//struct Standing { -// size: usize, -// results: Vec, -//} -// -//impl Standing { -// pub fn new(size: usize) -> Standing { -// Standing { -// size, -// results: BTreeSet::new(), -// } -// } -// -// pub fn addResult(&mut self, res: Result) { -// self.results.push(res) -// } -//} \ No newline at end of file diff --git a/src/levtree/trie.rs b/src/levtree/trie.rs deleted file mode 100644 index 327829d..0000000 --- a/src/levtree/trie.rs +++ /dev/null @@ -1,243 +0,0 @@ -use std::collections::BTreeSet; -use std::collections::HashMap; -use std::marker::PhantomData; - -use super::trienode::TrieNode; -use super::trienode::TrieKey; -use super::levtrie::DistanceCalculator; -use super::keychecker::KeyChecker; -use super::result::Result; - -pub enum VisitOutcome { - Continue, - Skip, - EarlyExit, -} - -pub struct Trie -where KEY : TrieKey, KEYCHECKER : KeyChecker { - pub nodes : Vec>, - payload_initializer : fn() -> PAYLOAD, - tails : BTreeSet, - checker : PhantomData -} - -impl Trie -where KEY : TrieKey, KEYCHECKER : KeyChecker { - pub fn empty(initializer : fn() -> PAYLOAD) -> Trie { - Trie { - nodes: vec!(TrieNode::new0(None, initializer)), - payload_initializer: initializer, - tails : BTreeSet::new(), - checker : PhantomData::default() - } - } - - pub fn trie_from_words(initializer : fn() -> PAYLOAD, wordlist : U) -> Trie - where T : IntoIterator, U : IntoIterator - { - let mut result = Trie::empty(initializer); - for word in wordlist { - result.add(word); - } - result - } - - pub fn get_node_mut(&mut self, index : usize) -> &mut TrieNode { - &mut self.nodes[index] - } - - pub fn get_node(&self, index : usize) -> &TrieNode { - &self.nodes[index] - } - - pub fn nodes(&self) -> usize { - self.nodes.len() - } - - fn add_node(&mut self, key : Option, parent : usize, prev : Option) -> usize { - let mut result = TrieNode::new0(key, self.payload_initializer); - let result_index = self.nodes(); - result.parent = Some(parent); - match prev { - Some(prev_node) => { - self.get_node_mut(prev_node).next = Some(result_index); - result.prev = prev; - } - None => { - let parent_node = self.get_node_mut(parent); - match parent_node.child { - None => { - parent_node.child = Some(result_index); - } - Some(parent_child) => { - let mut node = parent_child; - loop { - let next = self.get_node(node).next; - match next { - Some(next_node) => { - node = next_node; - } - None => { - break; - } - } - } - self.get_node_mut(node).next = Some(result_index); - result.prev = Some(node) - } - } - } - } - self.nodes.push(result); - result_index - } - - pub fn add(&mut self, path : T) -> (bool, usize) - where T: IntoIterator - { - let mut result = false; - let mut pnode = 0; - 'wordLoop: - for key in path { - let mut cnode = self.get_node(pnode).child; - loop { - match cnode { - Some(cnode_index) => { - let cnode_node = self.get_node(cnode_index); - if KEYCHECKER::check(cnode_node.key, Some(key)) { - pnode = cnode_index; - continue 'wordLoop; - } else if self.get_node(cnode_index).next.is_none() { - break; - } else { - cnode = self.get_node(cnode_index).next; - } - } - None => { - break; - } - } - } - pnode = self.add_node(Some(key), pnode, cnode); - result = true; - } - if result { - let tail = self.add_node(None, pnode, None); - self.tails.insert(tail); - let mut node = Some(tail); - loop { - match node { - Some(n) => { - let current_node = self.get_node_mut(n); - current_node.ref_count += 1; - node = current_node.parent; - } - None => { - break; - } - } - } - (true, tail) - } else { - (false, pnode) - } - } - - pub fn search(&mut self, path : Vec) -> Option - { - let mut result : Option = None; - let visit_pre = |stack : &Vec| -> VisitOutcome { - if stack.len() == 1 { - VisitOutcome::Continue - } else { - let last = *stack.last().expect(""); - let index= stack.len() - 2; - let node = self.get_node(last); - if index < path.len() { - if KEYCHECKER::check(node.key, Some(path[index])) { - VisitOutcome::Continue - } else { - VisitOutcome::Skip - } - } else { - if node.key.is_none() { - result = Some(last); - } - VisitOutcome::EarlyExit - } - } - }; - let visit_post = |stack : &Vec| {}; - self.walk(visit_pre, visit_post); - result - } - - pub fn lineal_descendant(&self, start : usize) -> Vec<&KEY> { - let mut chars : Vec<&KEY> = vec!(); - let mut node_option = Some(start); - loop { - match node_option { - Some(node) => { - let key = &self.get_node(node).key; - match key { - Some(key) => { - chars.push(key); - } - None => { - } - } - node_option = self.get_node(node).parent; - } - None => { - break; - } - } - } - chars.reverse(); - chars - } - - pub fn walk(&self, mut visit_pre : CB1, mut visit_post : CB2) - where CB1: FnMut(&Vec) -> VisitOutcome, - CB2: FnMut(&Vec) { - let mut stack : Vec<(usize, Option)> = vec!(); - let mut public_stack : Vec = vec!(); - let root_node = self.get_node(0); - stack.push((0, root_node.child)); - public_stack.push(0); - visit_pre(&public_stack); - while !stack.is_empty() { - let last = &mut stack.last_mut().unwrap(); - match last.1 { - Some(child_node_id) => { - let child_node = self.get_node(child_node_id); - last.1 = child_node.next; - public_stack.push(child_node_id); - let visit_pre_outcome = visit_pre(&public_stack); - match visit_pre_outcome { - VisitOutcome::Continue => { - stack.push((child_node_id, child_node.child)); - } - VisitOutcome::Skip => { - stack.push((child_node_id, None)); - } - VisitOutcome::EarlyExit => { - return - } - } - } - None => { - visit_post(&public_stack); - stack.pop(); - public_stack.pop(); - } - } - } - } - - pub fn tails(&self) -> &BTreeSet { - &self.tails - } -} - diff --git a/src/levtrie.rs b/src/levtrie.rs new file mode 100644 index 0000000..e2b8abd --- /dev/null +++ b/src/levtrie.rs @@ -0,0 +1,196 @@ +extern crate sealed; +use self::sealed::sealed; +use std::collections::BTreeSet; + +use super::keychecker::KeyChecker; +use super::result::Result; +use super::trie::Trie; +use super::trie::VisitOutcome; +use super::trienode::TrieKey; +use super::trienode::TrieNode; + +pub type LevTrie = Trie; +pub type LevTrieNode = TrieNode; + +#[sealed] +pub trait DistanceCalculator +where + KEY: TrieKey, + KEYCHECKER: KeyChecker, +{ + fn compute( + workspace: &mut Vec>, + nodes: &Vec>, + stack: &Vec, + wordkey: &[KEY], + worst_case: Option, + ) -> VisitOutcome; +} + +impl LevTrie +where + KEY: TrieKey, + KEYCHECKER: KeyChecker, +{ + pub fn new() -> LevTrie { + Trie::empty(|| {}) + } + + pub fn from_words(wordlist: U) -> LevTrie + where + T: IntoIterator, + U: IntoIterator, + { + let mut result = LevTrie::new(); + for word in wordlist { + result.add(word); + } + result + } + + pub fn fuzzy_search(&mut self, word: &[KEY], max_result: usize) -> BTreeSet + where + DC: DistanceCalculator, + { + let word_len = word.into_iter().count(); + let mut workspace: &mut Vec> = + &mut (0..self.nodes()).map(|_| Vec::new()).collect(); + let mut results = BTreeSet::new(); + let required_size = word_len + 1; + let visit_pre = |stack: &Vec| -> VisitOutcome { + let stack_size = stack.len(); + let current_node_id = *stack.last().unwrap(); + let payload = &mut workspace[current_node_id]; + payload.resize(required_size, usize::default()); + if stack_size == 1 { + for i in 0..required_size { + payload[i] = i; + } + } else { + for i in 0..required_size { + payload[i] = if i == 0 { stack_size - 1 } else { 0 } + } + } + if stack_size > 1 { + let current_node = &mut self.get_node(current_node_id); + if current_node.key.is_none() { + let distance = workspace[stack[stack_size - 2]][word_len]; + results.insert(Result { + distance: distance, + word: current_node_id, + }); + if results.len() > max_result { + results.pop_last(); + } + VisitOutcome::Skip + } else { + let worst_case = results + .last() + .filter(|_| results.len() == max_result) + .map(|it| it.distance); + DC::compute(&mut workspace, &self.nodes, stack, word, worst_case) + } + } else { + VisitOutcome::Continue + } + }; + let visit_post = |_: &Vec| {}; + self.walk(visit_pre, visit_post); + results + } +} + +pub struct LevenshteinDistanceCalculator {} + +#[sealed] +impl DistanceCalculator for LevenshteinDistanceCalculator +where + KEY: TrieKey, + KEYCHECKER: KeyChecker, +{ + fn compute( + workspace: &mut Vec>, + nodes: &Vec>, + stack: &Vec, + wordkey: &[KEY], + worst_case: Option, + ) -> VisitOutcome { + let sz = stack.len(); + let key_size = wordkey.into_iter().count(); + for i in 1..=key_size { + if KEYCHECKER::check(Some(wordkey[i - 1]), nodes[stack[sz - 1]].key) { + workspace[stack[sz - 1]][i] = workspace[stack[sz - 2]][i - 1]; + } else { + workspace[stack[sz - 1]][i] = std::cmp::min( + std::cmp::min( + workspace[stack[sz - 1]][i - 1], + workspace[stack[sz - 2]][i - 1], + ), + workspace[stack[sz - 2]][i], + ) + 1; + } + } + let condition = worst_case + .map(|wv| wv <= *workspace[stack[sz - 1]][..].into_iter().min().unwrap()) + .unwrap_or(false); + if condition { + VisitOutcome::Skip + } else { + VisitOutcome::Continue + } + } +} + +pub struct DamerauLevenshteinDistanceCalculator {} + +#[sealed] +impl DistanceCalculator for DamerauLevenshteinDistanceCalculator +where + KEY: TrieKey, + KEYCHECKER: KeyChecker, +{ + fn compute( + workspace: &mut Vec>, + nodes: &Vec>, + stack: &Vec, + wordkey: &[KEY], + worst_case: Option, + ) -> VisitOutcome { + let sz = stack.len(); + let key_size = wordkey.into_iter().count(); + for i in 1..=key_size { + if KEYCHECKER::check( + Some(wordkey[i - 1]), + stack.last().and_then(|it| nodes[*it].key), + ) { + workspace[stack[sz - 1]][i] = workspace[stack[sz - 2]][i - 1]; + } else { + workspace[stack[sz - 1]][i] = std::cmp::min( + std::cmp::min( + workspace[stack[sz - 1]][i - 1], + workspace[stack[sz - 2]][i - 1], + ), + workspace[stack[sz - 2]][i], + ) + 1; + } + if sz > 2 + && i > 1 + && KEYCHECKER::check(Some(wordkey[i - 2]), nodes[stack[sz - 1]].key) + && KEYCHECKER::check(Some(wordkey[i - 1]), nodes[stack[sz - 2]].key) + { + workspace[stack[sz - 1]][i] = std::cmp::min( + workspace[stack[sz - 1]][i], + workspace[stack[sz - 3]][i - 2] + 1, + ); + } + } + let condition = worst_case + .map(|wv| wv <= *workspace[stack[sz - 2]][..].into_iter().min().unwrap()) + .unwrap_or(false); + if condition { + VisitOutcome::Skip + } else { + VisitOutcome::Continue + } + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..ae4e1e4 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,25 @@ +#[macro_use] +extern crate trait_group; +mod levtrie; +// pub use self::levtrie::LevTrieNode as LevTrieNode; +pub use self::levtrie::DamerauLevenshteinDistanceCalculator; +pub use self::levtrie::DistanceCalculator; +pub use self::levtrie::LevTrie; +pub use self::levtrie::LevenshteinDistanceCalculator; + +mod trie; +pub use self::trie::Trie; + +mod trienode; +//use self::trienode::TrieNode as TrieNode; + +mod keychecker; +pub use self::keychecker::CaseInsensitiveKeyChecker; +pub use self::keychecker::CaseSensitiveKeyChecker; +pub use self::keychecker::KeyChecker; + +pub type CaseSensitiveLevTrie = LevTrie; +pub type CaseInSensitiveLevTrie = LevTrie; + +mod result; +pub use self::result::Result; diff --git a/src/levtree/mod.rs b/src/mod.rs similarity index 100% rename from src/levtree/mod.rs rename to src/mod.rs diff --git a/src/result.rs b/src/result.rs new file mode 100644 index 0000000..d78795a --- /dev/null +++ b/src/result.rs @@ -0,0 +1,50 @@ +use std::cmp::Ordering; + +pub struct Result { + pub word: usize, + pub distance: usize, +} + +impl PartialOrd for Result { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.distance.cmp(&other.distance)) + .filter(|it| it != &Ordering::Equal) + .or_else(|| Some(self.word.cmp(&other.word))) + } +} + +impl PartialEq for Result { + fn eq(&self, other: &Self) -> bool { + self.distance == other.distance && self.word == other.word + } +} + +impl Eq for Result {} + +impl Ord for Result { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + match self.distance.cmp(&other.distance) { + std::cmp::Ordering::Equal => self.word.cmp(&other.word), + std::cmp::Ordering::Greater => std::cmp::Ordering::Greater, + std::cmp::Ordering::Less => std::cmp::Ordering::Less, + } + } +} + +//struct Standing { +// size: usize, +// results: Vec, +//} +// +//impl Standing { +// pub fn new(size: usize) -> Standing { +// Standing { +// size, +// results: BTreeSet::new(), +// } +// } +// +// pub fn addResult(&mut self, res: Result) { +// self.results.push(res) +// } +//} diff --git a/src/trie.rs b/src/trie.rs new file mode 100644 index 0000000..ffe7bbb --- /dev/null +++ b/src/trie.rs @@ -0,0 +1,251 @@ +use std::collections::BTreeSet; +use std::collections::HashMap; +use std::marker::PhantomData; + +use super::keychecker::KeyChecker; +use super::levtrie::DistanceCalculator; +use super::result::Result; +use super::trienode::TrieKey; +use super::trienode::TrieNode; + +pub enum VisitOutcome { + Continue, + Skip, + EarlyExit, +} + +pub struct Trie +where + KEY: TrieKey, + KEYCHECKER: KeyChecker, +{ + pub nodes: Vec>, + payload_initializer: fn() -> PAYLOAD, + tails: BTreeSet, + checker: PhantomData, +} + +impl Trie +where + KEY: TrieKey, + KEYCHECKER: KeyChecker, +{ + pub fn empty(initializer: fn() -> PAYLOAD) -> Trie { + Trie { + nodes: vec![TrieNode::new0(None, initializer)], + payload_initializer: initializer, + tails: BTreeSet::new(), + checker: PhantomData::default(), + } + } + + pub fn trie_from_words( + initializer: fn() -> PAYLOAD, + wordlist: U, + ) -> Trie + where + T: IntoIterator, + U: IntoIterator, + { + let mut result = Trie::empty(initializer); + for word in wordlist { + result.add(word); + } + result + } + + pub fn get_node_mut(&mut self, index: usize) -> &mut TrieNode { + &mut self.nodes[index] + } + + pub fn get_node(&self, index: usize) -> &TrieNode { + &self.nodes[index] + } + + pub fn nodes(&self) -> usize { + self.nodes.len() + } + + fn add_node(&mut self, key: Option, parent: usize, prev: Option) -> usize { + let mut result = TrieNode::new0(key, self.payload_initializer); + let result_index = self.nodes(); + result.parent = Some(parent); + match prev { + Some(prev_node) => { + self.get_node_mut(prev_node).next = Some(result_index); + result.prev = prev; + } + None => { + let parent_node = self.get_node_mut(parent); + match parent_node.child { + None => { + parent_node.child = Some(result_index); + } + Some(parent_child) => { + let mut node = parent_child; + loop { + let next = self.get_node(node).next; + match next { + Some(next_node) => { + node = next_node; + } + None => { + break; + } + } + } + self.get_node_mut(node).next = Some(result_index); + result.prev = Some(node) + } + } + } + } + self.nodes.push(result); + result_index + } + + pub fn add(&mut self, path: T) -> (bool, usize) + where + T: IntoIterator, + { + let mut result = false; + let mut pnode = 0; + 'wordLoop: for key in path { + let mut cnode = self.get_node(pnode).child; + loop { + match cnode { + Some(cnode_index) => { + let cnode_node = self.get_node(cnode_index); + if KEYCHECKER::check(cnode_node.key, Some(key)) { + pnode = cnode_index; + continue 'wordLoop; + } else if self.get_node(cnode_index).next.is_none() { + break; + } else { + cnode = self.get_node(cnode_index).next; + } + } + None => { + break; + } + } + } + pnode = self.add_node(Some(key), pnode, cnode); + result = true; + } + if result { + let tail = self.add_node(None, pnode, None); + self.tails.insert(tail); + let mut node = Some(tail); + loop { + match node { + Some(n) => { + let current_node = self.get_node_mut(n); + current_node.ref_count += 1; + node = current_node.parent; + } + None => { + break; + } + } + } + (true, tail) + } else { + (false, pnode) + } + } + + pub fn search(&mut self, path: Vec) -> Option { + let mut result: Option = None; + let visit_pre = |stack: &Vec| -> VisitOutcome { + if stack.len() == 1 { + VisitOutcome::Continue + } else { + let last = *stack.last().expect(""); + let index = stack.len() - 2; + let node = self.get_node(last); + if index < path.len() { + if KEYCHECKER::check(node.key, Some(path[index])) { + VisitOutcome::Continue + } else { + VisitOutcome::Skip + } + } else { + if node.key.is_none() { + result = Some(last); + } + VisitOutcome::EarlyExit + } + } + }; + let visit_post = |stack: &Vec| {}; + self.walk(visit_pre, visit_post); + result + } + + pub fn lineal_descendant(&self, start: usize) -> Vec<&KEY> { + let mut chars: Vec<&KEY> = vec![]; + let mut node_option = Some(start); + loop { + match node_option { + Some(node) => { + let key = &self.get_node(node).key; + match key { + Some(key) => { + chars.push(key); + } + None => {} + } + node_option = self.get_node(node).parent; + } + None => { + break; + } + } + } + chars.reverse(); + chars + } + + pub fn walk(&self, mut visit_pre: CB1, mut visit_post: CB2) + where + CB1: FnMut(&Vec) -> VisitOutcome, + CB2: FnMut(&Vec), + { + let mut stack: Vec<(usize, Option)> = vec![]; + let mut public_stack: Vec = vec![]; + let root_node = self.get_node(0); + stack.push((0, root_node.child)); + public_stack.push(0); + visit_pre(&public_stack); + while !stack.is_empty() { + let last = &mut stack.last_mut().unwrap(); + match last.1 { + Some(child_node_id) => { + let child_node = self.get_node(child_node_id); + last.1 = child_node.next; + public_stack.push(child_node_id); + let visit_pre_outcome = visit_pre(&public_stack); + match visit_pre_outcome { + VisitOutcome::Continue => { + stack.push((child_node_id, child_node.child)); + } + VisitOutcome::Skip => { + stack.push((child_node_id, None)); + } + VisitOutcome::EarlyExit => return, + } + } + None => { + visit_post(&public_stack); + stack.pop(); + public_stack.pop(); + } + } + } + } + + pub fn tails(&self) -> &BTreeSet { + &self.tails + } +} diff --git a/src/levtree/trienode.rs b/src/trienode.rs similarity index 52% rename from src/levtree/trienode.rs rename to src/trienode.rs index f93e365..494d29a 100644 --- a/src/levtree/trienode.rs +++ b/src/trienode.rs @@ -1,24 +1,37 @@ - trait_group! { pub trait TrieKey : std::marker::Copy + std::fmt::Display + Sized } // pub trait KeyPath : std::ops::Index + IntoIterator {} - -pub struct TrieNode where KEY : TrieKey { +pub struct TrieNode +where + KEY: TrieKey, +{ pub key: Option, pub payload: PAYLOAD, pub prev: Option, pub next: Option, pub child: Option, pub parent: Option, - pub ref_count: usize + pub ref_count: usize, } -impl TrieNode where KEY : TrieKey { - fn new(key: Option, payload_initializer : U, prev: Option, next: Option, parent: Option, child : Option) -> TrieNode - where U : Fn() -> PAYLOAD { +impl TrieNode +where + KEY: TrieKey, +{ + fn new( + key: Option, + payload_initializer: U, + prev: Option, + next: Option, + parent: Option, + child: Option, + ) -> TrieNode + where + U: Fn() -> PAYLOAD, + { TrieNode { key, payload: payload_initializer(), @@ -26,12 +39,14 @@ impl TrieNode where KEY : TrieKey { next: next, child: child, parent: parent, - ref_count: 0 + ref_count: 0, } } - pub fn new0(key: Option, payload_initializer : U) -> TrieNode - where U : Fn() -> PAYLOAD { + pub fn new0(key: Option, payload_initializer: U) -> TrieNode + where + U: Fn() -> PAYLOAD, + { TrieNode::new(key, payload_initializer, None, None, None, None) } -} \ No newline at end of file +}