From 8895e8635ba87592a6c1b5c6d0ad478b5235cc80 Mon Sep 17 00:00:00 2001 From: Alfan Nur Fauzan Date: Tue, 24 Sep 2024 00:27:20 +0700 Subject: [PATCH] trim basmalah and implement find --- quranize/src/lib.rs | 39 ++++++++++++++++++++++++++++++--- quranize/src/suffix_tree/mod.rs | 2 +- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/quranize/src/lib.rs b/quranize/src/lib.rs index df48220..2d094d1 100644 --- a/quranize/src/lib.rs +++ b/quranize/src/lib.rs @@ -22,7 +22,7 @@ mod normalization; mod suffix_tree; mod transliteration; -use suffix_tree::Edge; +use suffix_tree::{Edge, Index}; use transliteration::{contextual_map, harf_muqottoah_map, map}; type EncodeResults = Vec<(String, usize, Vec<&'static str>)>; @@ -47,7 +47,7 @@ pub struct Quranize { } impl Quranize { - const EXPECTED_VERTEX_COUNT: usize = 126_327; + const EXPECTED_VERTEX_COUNT: usize = 125_595; /// Create a new [`Quranize`] instance. /// @@ -61,10 +61,21 @@ impl Quranize { let mut tree = suffix_tree::SuffixTree::with_capacity(Self::EXPECTED_VERTEX_COUNT); (0..AYA_COUNT) .zip(QURAN_TXT.split_inclusive('\n')) + .map(|(i, s)| (i, Self::trim_basmalah(i, s))) .for_each(|(i, s)| tree.construct(i, s)); Self { tree } } + fn trim_basmalah(i: usize, s: &str) -> &str { + const START_ALFATIHAH: usize = SURA_STARTS[0]; + const START_ATTAUBAH: usize = SURA_STARTS[8]; + match (i, SURA_STARTS.binary_search(&i).is_ok()) { + (START_ALFATIHAH | START_ATTAUBAH, _) => s, + (_, true) => s.splitn(5, ' ').last().unwrap(), + _ => s, + } + } + pub fn encode(&self, s: &str) -> EncodeResults { let mut results: EncodeResults = match normalization::normalize(s).as_str() { "" => vec![], @@ -139,6 +150,10 @@ impl Quranize { }); results_iter.collect() } + + pub fn find(&self, s: &str) -> Vec { + self.tree.find(0, s) + } } impl Default for Quranize { @@ -262,7 +277,25 @@ mod tests { } #[test] - fn test_suffix_tree_props() { + fn test_tree_find() { + let q = Quranize::new(); + assert!(q.find("بِسمِ").contains(&(0, 0))); + assert_eq!(q.find("وَالنّاسِ").last(), Some(&(6235, 28))); + assert!(q.find("الم").contains(&(7, 0))); + assert_eq!(q.find("بِسمِ اللَّهِ الرَّحمٰنِ الرَّحيمِ").len(), 2); + assert!(q.find("").is_empty()); + assert!(q.find("نن").is_empty()); + assert!(q.find("ننن").is_empty()); + assert!(q.find("نننن").is_empty()); + assert!(q.find("2+3+4=9").is_empty()); + assert_eq!(q.find("بِسمِ اللَّهِ الرَّحمٰنِ الرَّحيم").first(), Some(&(0, 0))); + assert_eq!(q.find("الرَّحمٰنِ الرَّحيم").first(), Some(&(0, 26))); + assert_eq!(q.find("").first(), None); + assert_eq!(q.find("abc").first(), None); + } + + #[test] + fn test_tree_props() { let t = Quranize::new().tree; assert_eq!(t.vertices.len(), t.edges.len() + 1); assert_eq!(t.count_data(0), t.collect_data(0).len()); diff --git a/quranize/src/suffix_tree/mod.rs b/quranize/src/suffix_tree/mod.rs index b53e0ca..1d3e2e0 100644 --- a/quranize/src/suffix_tree/mod.rs +++ b/quranize/src/suffix_tree/mod.rs @@ -3,7 +3,7 @@ use std::collections::{btree_set::Range, BTreeSet}; mod suffix_iter; type Vertex = (Option, usize, bool); -type Index = (usize, usize); +pub(super) type Index = (usize, usize); pub(super) type Edge<'a> = (usize, usize, &'a str); pub(super) struct SuffixTree<'a> {