Compare commits

..

6 Commits

Author SHA1 Message Date
woggioni 9e0abb48fb added LICENSE and cleaned source folder
CI / build (push) Failing after 6s
2025-07-30 12:53:09 +08:00
woggioni 3d2a08801f added basic unit tests 2025-07-30 12:47:58 +08:00
woggioni 575281869b removed public visibility of some library internals 2023-12-18 12:42:48 +08:00
woggioni 9291180936 module refactor 2023-12-18 11:40:03 +08:00
woggioni 9a522ccb07 initial working version 2023-12-04 22:19:37 +08:00
woggioni 2188ade9c7 initial commit 2019-12-28 16:50:04 +01:00
14 changed files with 55619 additions and 2 deletions
+18
View File
@@ -0,0 +1,18 @@
# CI workflow: on every push to `master`, run the unit tests and then publish the crate.
name: CI
on:
push:
branches: [ master ]
jobs:
build:
# Label of the runner that executes this job.
runs-on: woryzen
steps:
- name: Checkout sources
uses: actions/checkout@v4
- name: Run unit tests
run: |
cargo test
# NOTE(review): `cargo publish` is invoked without `--registry`, and Cargo.toml has no
# `publish = [...]` key; presumably the runner's Cargo configuration selects the
# registry that the token below belongs to — confirm.
- name: Publish artifacts
env:
# Cargo reads `CARGO_REGISTRIES_<NAME>_TOKEN` as the credential for the registry called <name>.
CARGO_REGISTRIES_GITEA_TOKEN: Bearer ${{ secrets.PUBLISHER_TOKEN }}
run: |
cargo publish
+2
View File
@@ -0,0 +1,2 @@
/target
**/*.rs.bk
Generated
+70
View File
@@ -0,0 +1,70 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "heck"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "levtree"
version = "0.1.1"
dependencies = [
"sealed",
"trait-group",
]
[[package]]
name = "proc-macro2"
version = "1.0.70"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39278fbbf5fb4f646ce651690877f89d1c5811a3d4acb27700c1cb3cdb78fd3b"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae"
dependencies = [
"proc-macro2",
]
[[package]]
name = "sealed"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4a8caec23b7800fb97971a1c6ae365b6239aaeddfb934d6265f8505e795699d"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "syn"
version = "2.0.39"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23e78b90f2fcf45d3e842032ce32e3f2d1545ba6636271dcbf24fa306d87be7a"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "trait-group"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1b362975c6f0f21a41fbb9ca91fe5dcb7e01e12331360374347476b45f5cb9c"
[[package]]
name = "unicode-ident"
version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
+21
View File
@@ -0,0 +1,21 @@
# Package manifest for the `levtree` library crate.
[package]
name = "levtree"
version = "0.1.1"
authors = ["Walter Oggioni <oggioni.walter@gmail.com>"]
license = "MIT"
edition = "2024"
[dependencies]
# Provides the `trait_group!` macro used to declare `TrieKey`.
trait-group = "0.1.0"
# Provides the `#[sealed]` attribute applied to `DistanceCalculator` and its implementations.
sealed = "0.5"
[lib]
name = "levtree"
crate-type = ["lib"]
# The library target is excluded from `cargo bench`.
bench = false
# Example program that loads a word list and runs repeated fuzzy searches.
[[example]]
name = "levtree_benchmark"
path = "examples/benchmark.rs"
-2
View File
@@ -1,2 +0,0 @@
# rlevtree
+57
View File
@@ -0,0 +1,57 @@
extern crate levtree;
use levtree::CaseSensitiveLevTrie;
use levtree::DamerauLevenshteinDistanceCalculator;
use levtree::LevTrie;
use std::io::BufRead;
use std::io::BufReader;
/// Converts string data into an owned sequence of `char`s.
trait IntoCharSlice {
    /// Returns every `char` of `self`, in order, as a freshly allocated `Vec<char>`.
    fn into_char_slice(&self) -> Vec<char>;
}

impl IntoCharSlice for str {
    fn into_char_slice(&self) -> Vec<char> {
        // `Chars` is already an iterator, so it can be collected directly.
        self.chars().collect()
    }
}
/// Loads the embedded `cracklib-small` word list into a case-sensitive trie, runs the
/// same fuzzy searches 50 times to exercise the search path, then prints the results
/// of one final search per key.
fn main() {
    let bytes = include_bytes!("cracklib-small");
    let reader = BufReader::new(&bytes[..]);
    let mut trie: CaseSensitiveLevTrie = LevTrie::new();
    for line in reader.lines() {
        // Reading from an in-memory slice can only fail on invalid UTF-8.
        let word = line.expect("embedded word list must be valid UTF-8");
        trie.add(word.chars());
    }

    let keys = [
        "camel",
        "coriolis",
        "mattel",
        "cruzer",
        "cpoper",
        "roublesoot",
    ];

    // Repeated searches whose results are discarded.
    for _ in 0..50 {
        for key in keys {
            let needle = key.into_char_slice();
            trie.fuzzy_search::<DamerauLevenshteinDistanceCalculator>(&needle, 6);
        }
    }

    for key in keys {
        let needle = key.into_char_slice();
        let results = trie.fuzzy_search::<DamerauLevenshteinDistanceCalculator>(&needle, 6);
        println!("needle: {}", key);
        for result in results {
            // Rebuild the matched word from the node id stored in the result.
            let word: String = trie.lineal_descendant(result.word).collect();
            println!("distance: {}, wordkey: {}", result.distance, word);
        }
        println!();
    }
}
File diff suppressed because it is too large Load Diff
+26
View File
@@ -0,0 +1,26 @@
use super::trienode::TrieKey;
/// Strategy that decides whether two optional trie keys count as the same key.
///
/// `check` takes no `self`, so implementors are used as type parameters only.
pub trait KeyChecker<KEY>
where
KEY: TrieKey,
{
/// Returns `true` when `k1` and `k2` should be treated as equal.
fn check(k1: Option<KEY>, k2: Option<KEY>) -> bool;
}
/// [`KeyChecker`] for `char` keys that ignores letter case.
pub struct CaseInsensitiveKeyChecker {}

impl KeyChecker<char> for CaseInsensitiveKeyChecker {
    /// Returns `true` when both keys are absent, or when both are present and have the
    /// same lowercase mapping.
    ///
    /// The complete lowercase expansions are compared: `char::to_lowercase` may yield
    /// more than one `char` (for example 'İ' lowercases to 'i' followed by a combining
    /// dot), so comparing only the first `char` would report such a key as equal to 'I'.
    fn check(k1: Option<char>, k2: Option<char>) -> bool {
        match (k1, k2) {
            (Some(v1), Some(v2)) => v1.to_lowercase().eq(v2.to_lowercase()),
            (None, None) => true,
            _ => false,
        }
    }
}
/// [`KeyChecker`] for `char` keys that uses exact equality.
pub struct CaseSensitiveKeyChecker {}

impl KeyChecker<char> for CaseSensitiveKeyChecker {
    /// Two keys match when both are absent or both hold the very same `char`.
    fn check(k1: Option<char>, k2: Option<char>) -> bool {
        match (k1, k2) {
            (Some(left), Some(right)) => left == right,
            (None, None) => true,
            _ => false,
        }
    }
}
+197
View File
@@ -0,0 +1,197 @@
extern crate sealed;
use self::sealed::sealed;
use std::collections::BinaryHeap;
use super::keychecker::KeyChecker;
use super::search_result::SearchResult;
use super::trie::Trie;
use super::trie::VisitOutcome;
use super::trienode::TrieKey;
use super::trienode::TrieNode;
pub type LevTrie<KEY, KEYCHECKER> = Trie<KEY, KEYCHECKER>;
pub type LevTrieNode<KEY> = TrieNode<KEY>;
/// Edit-distance strategy plugged into `LevTrie::fuzzy_search`.
///
/// The trait is sealed, so it can only be implemented inside this crate.
#[sealed]
pub trait DistanceCalculator<KEY, KEYCHECKER>
where
KEY: TrieKey,
KEYCHECKER: KeyChecker<KEY>,
{
/// Fills the distance row of the node on top of `stack` and tells the traversal
/// whether to descend further.
///
/// * `workspace` - one distance row per trie node, indexed by node id.
/// * `nodes` - the trie's nodes, indexed by node id.
/// * `stack` - node ids on the path from the root to the node being visited; the
///   last entry is the current node.
/// * `wordkey` - the key sequence being searched for.
/// * `worst_case` - the largest distance among the kept results once the result set
///   is full, `None` otherwise.
fn compute(
workspace: &mut Vec<Vec<usize>>,
nodes: &[LevTrieNode<KEY>],
stack: &[usize],
wordkey: &[KEY],
worst_case: Option<usize>,
) -> VisitOutcome;
}
impl<KEY, KEYCHECKER> LevTrie<KEY, KEYCHECKER>
where
    KEY: TrieKey,
    KEYCHECKER: KeyChecker<KEY>,
{
    /// Creates an empty trie (equivalent to `Trie::default()`).
    pub fn new() -> LevTrie<KEY, KEYCHECKER> {
        Trie::default()
    }

    /// Builds a trie containing every word yielded by `wordlist`.
    pub fn from_words<T, U>(wordlist: U) -> LevTrie<KEY, KEYCHECKER>
    where
        T: IntoIterator<Item = KEY>,
        U: IntoIterator<Item = T>,
    {
        let mut result = LevTrie::new();
        for word in wordlist {
            result.add(word);
        }
        result
    }

    /// Returns at most `max_result` stored words closest to `word` according to the
    /// distance calculator `DC`, sorted by ascending distance and then by node id.
    ///
    /// Each result's `word` field is the id of the word's tail node; pass it to
    /// `lineal_descendant` to rebuild the key sequence.
    pub fn fuzzy_search<DC>(&mut self, word: &[KEY], max_result: usize) -> Vec<SearchResult>
    where
        DC: DistanceCalculator<KEY, KEYCHECKER>,
    {
        let word_len = word.len();
        let row_len = word_len + 1;
        // One (initially empty) distance row per node, indexed by node id.
        let mut workspace: Vec<Vec<usize>> = vec![Vec::new(); self.nodes()];
        // The heap never holds more than one entry per stored word plus the transient
        // extra element, so bound the capacity by the word count. This keeps a huge
        // `max_result` (e.g. `usize::MAX` for "everything") from overflowing or
        // attempting an enormous allocation.
        let heap_capacity = max_result.min(self.tails().len()).saturating_add(1);
        let mut result_heap = BinaryHeap::<SearchResult>::with_capacity(heap_capacity);

        let visit_pre = |stack: &Vec<usize>| -> VisitOutcome {
            let current_node_id = *stack.last().expect("walk never passes an empty stack");
            let depth = stack.len() - 1;

            let row = &mut workspace[current_node_id];
            row.clear();
            if depth == 0 {
                // Root row: distance from the empty prefix to each prefix of `word`.
                row.extend(0..row_len);
                return VisitOutcome::Continue;
            }
            // First column: distance from this node's prefix to the empty prefix.
            row.resize(row_len, 0);
            row[0] = depth;

            if self.get_node(current_node_id).key.is_none() {
                // A key-less node below the root marks the end of a stored word; its
                // distance is the last cell of the parent's row.
                let parent_id = stack[depth - 1];
                let distance = workspace[parent_id][word_len];
                result_heap.push(SearchResult {
                    distance,
                    word: current_node_id,
                });
                if result_heap.len() > max_result {
                    // Drop the current worst result to stay within the limit.
                    result_heap.pop();
                }
                VisitOutcome::Skip
            } else {
                // Only offer a pruning bound once the result set is full.
                let worst_case = if result_heap.len() == max_result {
                    result_heap.peek().map(|it| it.distance)
                } else {
                    None
                };
                DC::compute(&mut workspace, &self.nodes, stack, word, worst_case)
            }
        };
        let visit_post = |_: &Vec<usize>| {};
        self.walk(visit_pre, visit_post);
        result_heap.into_sorted_vec()
    }
}
/// [`DistanceCalculator`] implementing the plain Levenshtein recurrence
/// (insertion, deletion, substitution).
pub struct LevenshteinDistanceCalculator {}

#[sealed]
impl<KEY, KEYCHECKER> DistanceCalculator<KEY, KEYCHECKER> for LevenshteinDistanceCalculator
where
    KEY: TrieKey,
    KEYCHECKER: KeyChecker<KEY>,
{
    /// Fills columns `1..=wordkey.len()` of the current node's row from its parent's
    /// row, then returns `Skip` when `worst_case` is set and no cell of the current
    /// row is below it, `Continue` otherwise.
    fn compute(
        workspace: &mut Vec<Vec<usize>>,
        nodes: &[LevTrieNode<KEY>],
        stack: &[usize],
        wordkey: &[KEY],
        worst_case: Option<usize>,
    ) -> VisitOutcome {
        let depth = stack.len();
        for col in 1..=wordkey.len() {
            let symbol = wordkey[col - 1];
            let cell = if KEYCHECKER::check(Some(symbol), nodes[stack[depth - 1]].key) {
                // Keys match: carry the diagonal value over unchanged.
                workspace[stack[depth - 2]][col - 1]
            } else {
                let left = workspace[stack[depth - 1]][col - 1];
                let diagonal = workspace[stack[depth - 2]][col - 1];
                let above = workspace[stack[depth - 2]][col];
                left.min(diagonal).min(above) + 1
            };
            workspace[stack[depth - 1]][col] = cell;
        }

        let prune = match worst_case {
            Some(worst) => worst <= *workspace[stack[depth - 1]].iter().min().unwrap(),
            None => false,
        };
        if prune {
            VisitOutcome::Skip
        } else {
            VisitOutcome::Continue
        }
    }
}
/// [`DistanceCalculator`] that extends the Levenshtein recurrence with the
/// transposition of two adjacent keys.
pub struct DamerauLevenshteinDistanceCalculator {}

#[sealed]
impl<KEY, KEYCHECKER> DistanceCalculator<KEY, KEYCHECKER> for DamerauLevenshteinDistanceCalculator
where
    KEY: TrieKey,
    KEYCHECKER: KeyChecker<KEY>,
{
    /// Fills columns `1..=wordkey.len()` of the current node's row from the rows of
    /// its parent and grandparent, then returns `Skip` when `worst_case` is set and no
    /// cell of the parent's row is below it, `Continue` otherwise.
    fn compute(
        workspace: &mut Vec<Vec<usize>>,
        nodes: &[LevTrieNode<KEY>],
        stack: &[usize],
        wordkey: &[KEY],
        worst_case: Option<usize>,
    ) -> VisitOutcome {
        let depth = stack.len();
        for col in 1..=wordkey.len() {
            let symbol = wordkey[col - 1];
            let node_key = stack.last().and_then(|id| nodes[*id].key);
            let cell = if KEYCHECKER::check(Some(symbol), node_key) {
                // Keys match: carry the diagonal value over unchanged.
                workspace[stack[depth - 2]][col - 1]
            } else {
                let left = workspace[stack[depth - 1]][col - 1];
                let diagonal = workspace[stack[depth - 2]][col - 1];
                let above = workspace[stack[depth - 2]][col];
                left.min(diagonal).min(above) + 1
            };
            workspace[stack[depth - 1]][col] = cell;

            // Transposition: the previous searched key equals this node's key and the
            // current searched key equals the parent's key.
            let swapped = depth > 2
                && col > 1
                && KEYCHECKER::check(Some(wordkey[col - 2]), nodes[stack[depth - 1]].key)
                && KEYCHECKER::check(Some(symbol), nodes[stack[depth - 2]].key);
            if swapped {
                let via_swap = workspace[stack[depth - 3]][col - 2] + 1;
                let slot = &mut workspace[stack[depth - 1]][col];
                *slot = (*slot).min(via_swap);
            }
        }

        // NOTE(review): the bound is taken from the parent's row (`depth - 2`), whereas
        // the Levenshtein calculator uses the current row — confirm this is intentional.
        let prune = match worst_case {
            Some(worst) => worst <= *workspace[stack[depth - 2]].iter().min().unwrap(),
            None => false,
        };
        if prune {
            VisitOutcome::Skip
        } else {
            VisitOutcome::Continue
        }
    }
}
+28
View File
@@ -0,0 +1,28 @@
// Crate root: declares the modules and re-exports the public API.
#[macro_use]
extern crate trait_group;
// Fuzzy-search layer built on top of the generic trie.
mod levtrie;
// `LevTrieNode` is deliberately not re-exported.
pub use self::levtrie::DamerauLevenshteinDistanceCalculator;
pub use self::levtrie::DistanceCalculator;
pub use self::levtrie::LevTrie;
pub use self::levtrie::LevenshteinDistanceCalculator;
// Generic trie container.
mod trie;
pub use self::trie::Trie;
// Node type and key trait; neither is re-exported from the crate root.
mod trienode;
// Key comparison strategies.
mod keychecker;
pub use self::keychecker::CaseInsensitiveKeyChecker;
pub use self::keychecker::CaseSensitiveKeyChecker;
pub use self::keychecker::KeyChecker;
// Ready-made `char` tries for the two built-in key checkers.
pub type CaseSensitiveLevTrie = LevTrie<char, CaseSensitiveKeyChecker>;
pub type CaseInSensitiveLevTrie = LevTrie<char, CaseInsensitiveKeyChecker>;
mod search_result;
pub use self::search_result::SearchResult;
#[cfg(test)]
mod tests;
+31
View File
@@ -0,0 +1,31 @@
use std::cmp::Ordering;
/// One hit produced by a fuzzy search.
///
/// Results order by `distance` first and by `word` second, so a max-heap of results
/// always exposes the worst hit at its top.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct SearchResult {
    /// Id of the trie node that identifies the matched word.
    pub word: usize,
    /// Edit distance between the searched key and the matched word.
    pub distance: usize,
}

impl PartialOrd for SearchResult {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for SearchResult {
    /// Compares by `distance`, breaking ties with `word`. This cannot be derived
    /// because the field declaration order is `word`, `distance`.
    fn cmp(&self, other: &Self) -> Ordering {
        self.distance
            .cmp(&other.distance)
            .then_with(|| self.word.cmp(&other.word))
    }
}
}
+138
View File
@@ -0,0 +1,138 @@
use super::{
CaseSensitiveLevTrie, DamerauLevenshteinDistanceCalculator, KeyChecker, LevTrie,
LevenshteinDistanceCalculator, SearchResult,
};
use std::collections::BTreeMap;
use std::fmt::Display;
use std::io::Write;
struct ExpectedResults {
data: Vec<(usize, usize)>,
}
impl ExpectedResults {
fn new(id_map: &BTreeMap<String, usize>, results: &[(String, usize)]) -> ExpectedResults {
let data = results
.iter()
.map(|(key, distance)| {
(
*id_map
.get(key)
.ok_or_else(|| format!("Id not found for key '{key}'"))
.unwrap(),
*distance,
)
})
.collect::<Vec<(usize, usize)>>();
ExpectedResults { data }
}
fn check(&self, search_results: &[SearchResult]) {
for i in 0..self.data.len() {
let SearchResult { word, distance } = search_results[i];
let data = self.data[i];
if data != (word, distance) {
panic!("({}, {}) <> ({}, {})", data.0, data.1, word, distance);
}
}
}
}
/// Prints one line per search result with its distance, the matched word rebuilt
/// from the trie, and the result's node id.
///
/// The word's keys are joined with `key_separator`.
///
/// # Errors
///
/// Returns any I/O error raised while formatting a word into the in-memory buffer.
fn print_search_results<T: Display + Copy, C: KeyChecker<T>>(
    trie: &LevTrie<T, C>,
    search_results: &[SearchResult],
    key_separator: &str,
) -> Result<(), std::io::Error> {
    for result in search_results {
        let mut word = Vec::<u8>::new();
        for (i, fragment) in trie.lineal_descendant(result.word).enumerate() {
            // `write_all`/`write!` write the whole buffer, unlike a bare `write`
            // whose returned byte count would have to be checked.
            if i > 0 {
                word.write_all(key_separator.as_bytes())?;
            }
            write!(word, "{}", fragment)?;
        }
        println!(
            "distance: {}, wordkey: {}, id: {}",
            result.distance,
            String::from_utf8(word).expect("formatted text is always valid UTF-8"),
            result.word
        );
    }
    Ok(())
}
const WORDLIST: [&str; 16] = [
"skyscraper",
"camel",
"coal",
"caos",
"copper",
"hello",
"Bugis",
"Kembangan",
"Singapore",
"Fullerton",
"Lavender",
"aircraft",
"boat",
"ship",
"cargo",
"tanker",
];
/// Searches "coat" in the sample word list using the Damerau-Levenshtein calculator
/// and checks the six closest words.
#[test]
fn test_damerau_levenshtein_strings() {
    let mut trie: CaseSensitiveLevTrie = LevTrie::new();
    let mut id_map = BTreeMap::<String, usize>::new();
    for word in WORDLIST {
        let (_, id) = trie.add(word.chars());
        id_map.insert(String::from(word), id);
    }
    let needle: Vec<char> = "coat".chars().collect();
    let results = trie.fuzzy_search::<DamerauLevenshteinDistanceCalculator>(&needle, 6);
    print_search_results(&trie, &results, "").unwrap();
    let expected_results = ExpectedResults::new(
        &id_map,
        &[
            (String::from("coal"), 1),
            (String::from("boat"), 1),
            // "coat" -> "caot" (transpose "oa") -> "caos" (substitute 't' with 's'):
            // two edits once transpositions are allowed, versus three for plain
            // Levenshtein.
            (String::from("caos"), 2),
            (String::from("camel"), 4),
            (String::from("copper"), 4),
            (String::from("ship"), 4),
        ],
    );
    expected_results.check(&results);
}
/// Searches "coat" in the sample word list using the plain Levenshtein calculator
/// and checks the six closest words.
#[test]
fn test_levenshtein_strings() {
    let mut trie: CaseSensitiveLevTrie = LevTrie::new();
    // Remember the id returned for each inserted word.
    let id_map: BTreeMap<String, usize> = WORDLIST
        .iter()
        .map(|word| {
            let (_, id) = trie.add(word.chars());
            (String::from(*word), id)
        })
        .collect();

    let needle: Vec<char> = "coat".chars().collect();
    let results = trie.fuzzy_search::<LevenshteinDistanceCalculator>(&needle, 6);
    print_search_results(&trie, &results, "").unwrap();

    let expected: Vec<(String, usize)> = [
        ("coal", 1),
        ("boat", 1),
        ("caos", 3),
        ("camel", 4),
        ("copper", 4),
        ("ship", 4),
    ]
    .into_iter()
    .map(|(word, distance)| (String::from(word), distance))
    .collect();
    ExpectedResults::new(&id_map, &expected).check(&results);
}
+225
View File
@@ -0,0 +1,225 @@
use super::keychecker::KeyChecker;
use super::trienode::TrieKey;
use super::trienode::TrieNode;
use std::collections::BTreeSet;
use std::iter::Iterator;
use std::marker::PhantomData;
/// Decision returned by the pre-order visitor of a trie traversal.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VisitOutcome {
    /// Descend into the children of the node just visited.
    Continue,
    /// Do not visit the children of the node just visited; carry on with its siblings.
    Skip,
    /// Stop the whole traversal immediately.
    EarlyExit,
}
/// Trie whose nodes live in a single vector and refer to each other by index.
///
/// `KEYCHECKER` decides when two keys are considered equal.
pub struct Trie<KEY, KEYCHECKER>
where
KEY: TrieKey,
KEYCHECKER: KeyChecker<KEY>,
{
// All nodes, indexed by node id; index 0 is the root.
pub(crate) nodes: Vec<TrieNode<KEY>>,
// Ids of the key-less nodes that mark the end of a stored word.
tails: BTreeSet<usize>,
// Ties the key checker type to the trie without storing a value of it.
checker: PhantomData<KEYCHECKER>,
}
impl<KEY, KEYCHECKER> Default for Trie<KEY, KEYCHECKER>
where
    KEY: TrieKey,
    KEYCHECKER: KeyChecker<KEY>,
{
    /// Builds an empty trie: a single key-less root node at index 0 and no tails.
    fn default() -> Self {
        let root = TrieNode::new0(None);
        Self {
            nodes: vec![root],
            tails: BTreeSet::new(),
            checker: PhantomData,
        }
    }
}
impl<KEY, KEYCHECKER> Trie<KEY, KEYCHECKER>
where
    KEY: TrieKey,
    KEYCHECKER: KeyChecker<KEY>,
{
    /// Builds a trie containing every word yielded by `wordlist`.
    pub fn trie_from_words<T, U>(wordlist: U) -> Trie<KEY, KEYCHECKER>
    where
        T: IntoIterator<Item = KEY>,
        U: IntoIterator<Item = T>,
    {
        let mut result = Trie::default();
        for word in wordlist {
            result.add(word);
        }
        result
    }

    /// Mutable access to the node with id `index`; panics when out of range.
    pub(crate) fn get_node_mut(&mut self, index: usize) -> &mut TrieNode<KEY> {
        &mut self.nodes[index]
    }

    /// Shared access to the node with id `index`; panics when out of range.
    pub(crate) fn get_node(&self, index: usize) -> &TrieNode<KEY> {
        &self.nodes[index]
    }

    /// Total number of nodes, including the root and the tail nodes.
    pub(crate) fn nodes(&self) -> usize {
        self.nodes.len()
    }

    /// Scans the children of `parent` in sibling order.
    ///
    /// With `Some(key)` the first child accepted by the key checker is searched; with
    /// `None` the first key-less (tail) child is searched. Returns `Ok(id)` on a hit,
    /// otherwise `Err(last)` where `last` is the last child of `parent`, if any.
    fn find_child(&self, parent: usize, key: Option<KEY>) -> Result<usize, Option<usize>> {
        let mut last_sibling = None;
        let mut cursor = self.get_node(parent).child;
        while let Some(index) = cursor {
            let node = self.get_node(index);
            let found = match key {
                Some(_) => KEYCHECKER::check(node.key, key),
                None => node.key.is_none(),
            };
            if found {
                return Ok(index);
            }
            last_sibling = cursor;
            cursor = node.next;
        }
        Err(last_sibling)
    }

    /// Appends a new node holding `key` as the last child of `parent` and returns its id.
    ///
    /// `prev`, when given, must be the current last child of `parent`; when `None` the
    /// last child is looked up by walking the sibling list.
    fn add_node(&mut self, key: Option<KEY>, parent: usize, prev: Option<usize>) -> usize {
        let index = self.nodes();
        let mut node = TrieNode::new0(key);
        node.parent = Some(parent);

        let last_sibling = prev.or_else(|| {
            let mut last = self.get_node(parent).child?;
            while let Some(next) = self.get_node(last).next {
                last = next;
            }
            Some(last)
        });
        match last_sibling {
            Some(sibling) => {
                self.get_node_mut(sibling).next = Some(index);
                node.prev = Some(sibling);
            }
            // First child of `parent`.
            None => self.get_node_mut(parent).child = Some(index),
        }

        self.nodes.push(node);
        index
    }

    /// Stores the word described by `path`.
    ///
    /// Returns `(true, tail)` when the word was not stored yet and `(false, tail)` when
    /// it already was; `tail` is the id of the key-less node terminating the word, so
    /// both cases yield the same id for the same word. A word that is a prefix of an
    /// already stored word is stored as well (previously it was silently dropped
    /// because no key node had to be created for it).
    ///
    /// An empty `path` stores nothing and returns `(false, 0)`.
    pub fn add<T>(&mut self, path: T) -> (bool, usize)
    where
        T: IntoIterator<Item = KEY>,
    {
        let mut parent = 0;
        for key in path {
            parent = match self.find_child(parent, Some(key)) {
                Ok(existing) => existing,
                Err(last_sibling) => self.add_node(Some(key), parent, last_sibling),
            };
        }
        // Every key moves away from the root, so still being there means `path` was empty.
        if parent == 0 {
            return (false, 0);
        }

        match self.find_child(parent, None) {
            Ok(tail) => (false, tail),
            Err(last_sibling) => {
                let tail = self.add_node(None, parent, last_sibling);
                self.tails.insert(tail);
                // Count the new word on every node from its tail up to the root.
                let mut cursor = Some(tail);
                while let Some(index) = cursor {
                    let node = self.get_node_mut(index);
                    node.ref_count += 1;
                    cursor = node.parent;
                }
                (true, tail)
            }
        }
    }

    /// Looks up the exact word `path` and returns the id of its tail node, or `None`
    /// when the word is not stored.
    ///
    /// The tail is searched among all children of the last key node, so the word is
    /// found regardless of where its tail sits in the sibling list.
    pub fn search(&mut self, path: Vec<KEY>) -> Option<usize> {
        let mut current = 0;
        for key in path {
            current = self.find_child(current, Some(key)).ok()?;
        }
        self.find_child(current, None).ok()
    }

    /// Iterates over the keys on the path from the root down to node `start`,
    /// skipping key-less nodes (the root and tails).
    pub fn lineal_descendant(&self, start: usize) -> impl Iterator<Item = &KEY> {
        // Collected bottom-up by following parent links, then reversed.
        let mut keys: Vec<&KEY> = Vec::new();
        let mut cursor = Some(start);
        while let Some(index) = cursor {
            let node = self.get_node(index);
            if let Some(key) = node.key.as_ref() {
                keys.push(key);
            }
            cursor = node.parent;
        }
        keys.into_iter().rev()
    }

    /// Depth-first traversal starting at the root.
    ///
    /// `visit_pre` is called when a node is entered and receives the ids on the path
    /// from the root to that node; its outcome decides whether the node's children are
    /// visited (`Continue`), skipped (`Skip`) or the traversal stops (`EarlyExit`).
    /// The outcome returned for the root itself is ignored. `visit_post` is called
    /// with the same path once a node's children are done.
    pub(crate) fn walk<CB1, CB2>(&self, mut visit_pre: CB1, mut visit_post: CB2)
    where
        CB1: FnMut(&Vec<usize>) -> VisitOutcome,
        CB2: FnMut(&Vec<usize>),
    {
        // Each frame is (node id, next child of that node still to visit).
        let mut frames: Vec<(usize, Option<usize>)> = vec![(0, self.get_node(0).child)];
        let mut public_stack: Vec<usize> = vec![0];
        visit_pre(&public_stack);

        while let Some(frame) = frames.last_mut() {
            match frame.1 {
                Some(child_id) => {
                    let child = self.get_node(child_id);
                    // Advance the parent's cursor before descending.
                    frame.1 = child.next;
                    public_stack.push(child_id);
                    match visit_pre(&public_stack) {
                        VisitOutcome::Continue => frames.push((child_id, child.child)),
                        VisitOutcome::Skip => frames.push((child_id, None)),
                        VisitOutcome::EarlyExit => return,
                    }
                }
                None => {
                    visit_post(&public_stack);
                    frames.pop();
                    public_stack.pop();
                }
            }
        }
    }

    /// Ids of the tail nodes, one per stored word.
    pub fn tails(&self) -> &BTreeSet<usize> {
        &self.tails
    }
}
+43
View File
@@ -0,0 +1,43 @@
// Declares `TrieKey` as shorthand for the bounds every trie key type must satisfy.
// NOTE(review): `trait_group!` presumably also generates a blanket impl for every type
// meeting these bounds — confirm against the trait-group crate documentation.
trait_group! {
pub trait TrieKey : std::marker::Copy + std::fmt::Display + Sized
}
/// Node of a trie; all links are indices into the owning trie's node vector.
pub struct TrieNode<KEY>
where
KEY: TrieKey,
{
/// Key stored in this node; `None` for the root and for word-terminating tail nodes.
pub key: Option<KEY>,
/// Previous sibling, if any.
pub prev: Option<usize>,
/// Next sibling, if any.
pub next: Option<usize>,
/// First child, if any.
pub child: Option<usize>,
/// Parent node; `None` for the root.
pub parent: Option<usize>,
// Incremented once per newly stored word on every node between that word's tail and the root.
pub (crate) ref_count: usize,
}
impl<KEY> TrieNode<KEY>
where
    KEY: TrieKey,
{
    /// Builds a node from explicit links; the reference count starts at zero.
    fn new(
        key: Option<KEY>,
        prev: Option<usize>,
        next: Option<usize>,
        parent: Option<usize>,
        child: Option<usize>,
    ) -> TrieNode<KEY> {
        Self {
            ref_count: 0,
            key,
            parent,
            child,
            prev,
            next,
        }
    }

    /// Builds a node holding `key` with no siblings, parent or children.
    pub(crate) fn new0(key: Option<KEY>) -> TrieNode<KEY> {
        let unlinked: Option<usize> = None;
        Self::new(key, unlinked, unlinked, unlinked, unlinked)
    }
}