Skip to content

Commit

Permalink
add encoding harf muqotto'ah
Browse files Browse the repository at this point in the history
  • Loading branch information
alpancs committed Sep 23, 2024
1 parent 068139a commit 135f8f0
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 17 deletions.
58 changes: 56 additions & 2 deletions quranize/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,22 @@ mod suffix_tree;
mod transliteration;

use suffix_tree::Edge;
use transliteration::{contextual_map, map};
use transliteration::{contextual_map, harf_muqottoah_map, map};

type EncodeResults = Vec<(String, usize, Vec<&'static str>)>;
type PrevMap = (char, &'static str);

const AYA_COUNT: usize = 6236;
/// Start offsets of the 114 suras, one entry per sura.
///
/// The values look like zero-based aya indices (the last entry, 6230, stays below
/// `AYA_COUNT`) — TODO confirm against the layout of the quran text file.
/// The entries are in ascending order, and `starting_sura` in the suffix tree module
/// depends on that because it looks values up with `binary_search`.
const SURA_STARTS: [usize; 114] = [
0, 7, 293, 493, 669, 789, 954, 1160, 1235, 1364, 1473, 1596, 1707, 1750, 1802, 1901, 2029,
2140, 2250, 2348, 2483, 2595, 2673, 2791, 2855, 2932, 3159, 3252, 3340, 3409, 3469, 3503, 3533,
3606, 3660, 3705, 3788, 3970, 4058, 4133, 4218, 4272, 4325, 4414, 4473, 4510, 4545, 4583, 4612,
4630, 4675, 4735, 4784, 4846, 4901, 4979, 5075, 5104, 5126, 5150, 5163, 5177, 5188, 5199, 5217,
5229, 5241, 5271, 5323, 5375, 5419, 5447, 5475, 5495, 5551, 5591, 5622, 5672, 5712, 5758, 5800,
5829, 5848, 5884, 5909, 5931, 5948, 5967, 5993, 6023, 6043, 6058, 6079, 6090, 6098, 6106, 6125,
6130, 6138, 6146, 6157, 6168, 6176, 6179, 6188, 6193, 6197, 6204, 6207, 6213, 6216, 6221, 6225,
6230,
];
const QURAN_TXT: &str = include_str!("quran-uthmani-min.txt");

/// Struct to encode alphabetic text to quran text.
Expand Down Expand Up @@ -58,11 +68,19 @@ impl Quranize {
pub fn encode(&self, s: &str) -> EncodeResults {
// First pass: normalize the input with `normalize` and try `rev_encode` along every
// edge that leaves vertex 0 of the suffix tree.
{
let s = normalization::normalize(s);
{ self.tree.edges_from(0) }
self.tree
.edges_from(0)
.flat_map(|&e| self.rev_encode(&s, e, None))
.collect::<Vec<_>>()
}
.into_iter()
// Second pass: the same walk, but with the harf muqotto'ah normalization and
// `rev_encode_muqottoah`. Its results are appended after those of the first pass.
.chain({
let s = normalization::normalize_muqottoah(s);
self.tree
.edges_from(0)
.flat_map(|&e| self.rev_encode_muqottoah(&s, e))
.collect::<Vec<_>>()
})
// Every result arrives with its quran text and its explanation list in reverse order
// (`rev_encode_muqottoah` pushes onto them after recursing), so both are reversed here.
.map(|(q, n, e)| (q.chars().rev().collect(), n, e.into_iter().rev().collect()))
.collect()
}
Expand Down Expand Up @@ -92,6 +110,35 @@ impl Quranize {
});
results_iter.collect()
}

fn rev_encode_muqottoah(&self, s: &str, (v, w, l): Edge) -> EncodeResults {
let results_iter = l.chars().next().into_iter().flat_map(|c| -> EncodeResults {
let tsls = harf_muqottoah_map(c).iter();
let tsl_results_iter = tsls.filter_map(|&tsl| -> Option<EncodeResults> {
s.strip_prefix(tsl).map(|s| match s {
"" => match self.tree.vertices[w].2 {
true => vec![(c.to_string(), self.tree.count_data(w), vec![tsl])],
false => vec![],
},
s => match &l[c.len_utf8()..] {
"" => { self.tree.edges_from(w) }
.flat_map(|&e| self.rev_encode_muqottoah(s, e))
.collect(),
l => self.rev_encode_muqottoah(s, (v, w, l)),
}
.into_iter()
.map(|(mut q, n, mut e)| {
q.push(c);
e.push(tsl);
(q, n, e)
})
.collect(),
})
});
tsl_results_iter.flatten().collect()
});
results_iter.collect()
}
}

impl Default for Quranize {
Expand All @@ -117,11 +164,18 @@ mod tests {
assert_eq!(q.e("bismillahirrohmanirrohiim"), ["بِسمِ اللَّهِ الرَّحمٰنِ الرَّحيم"]);
}

/// Spelled-out letter names are encoded to the corresponding harf muqotto'ah.
#[test]
fn test_muqottoah() {
    let expected = ["الم"];
    assert_eq!(Quranize::new().e("alif lam mim"), expected);
}

/// Structural checks on the suffix tree built by `Quranize::new`.
#[test]
fn test_suffix_tree_props() {
    let tree = Quranize::new().tree;
    let vertex_count = tree.vertices.len();
    let edge_count = tree.edges.len();

    // One more vertex than there are edges.
    assert_eq!(vertex_count, edge_count + 1);
    // The counter reported for vertex 0 agrees with the data actually collected from it.
    assert_eq!(tree.count_data(0), tree.collect_data(0).len());
    assert_eq!(vertex_count, Quranize::EXPECTED_VERTEX_COUNT);
    // The flag of vertex 0 is set once the tree is complete.
    assert!(tree.vertices[0].2);
}
}
18 changes: 9 additions & 9 deletions quranize/src/normalization/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ pub(super) fn normalize(text: &str) -> String {
.collect()
}

pub(super) fn normalize_first_aya(text: &str) -> String {
pub(super) fn normalize_muqottoah(text: &str) -> String {
let mut chars = Vec::from_iter(text.chars().filter_map(|c| match c.to_ascii_lowercase() {
c @ ('a'..='z' | '\'' | ' ') => Some(c),
_ => None,
Expand Down Expand Up @@ -35,19 +35,19 @@ mod tests {

#[test]
fn test_normalize_first_aya() {
// Repeated letters inside a word collapse to a single one ("laaaam" becomes "lam").
assert_eq!("", normalize_first_aya(""));
assert_eq!("alif", normalize_first_aya("alif"));
assert_eq!("lam", normalize_first_aya("laam"));
assert_eq!("lam", normalize_first_aya("laaam"));
assert_eq!("lam", normalize_first_aya("laaaam"));
assert_eq!("lam", normalize_first_aya("laaaam"));
assert_eq!("", normalize_muqottoah(""));
assert_eq!("alif", normalize_muqottoah("alif"));
assert_eq!("lam", normalize_muqottoah("laam"));
assert_eq!("lam", normalize_muqottoah("laaam"));
assert_eq!("lam", normalize_muqottoah("laaaam"));
assert_eq!("lam", normalize_muqottoah("laaaam"));
// Spaces are dropped from the output, but letters that only become adjacent because a
// space was removed are kept ("yaa aiiin" yields "yaain", with both a's).
assert_eq!(
"kafhayaainshod",
normalize_first_aya("kaaaf haa yaa aiiin shoood"),
normalize_muqottoah("kaaaf haa yaa aiiin shoood"),
);
// An apostrophe in the input survives normalization.
assert_eq!(
"kafhaya'ainshod",
normalize_first_aya("kaaaf haa yaa 'aiiin shoood"),
normalize_muqottoah("kaaaf haa yaa 'aiiin shoood"),
);
}
}
23 changes: 17 additions & 6 deletions quranize/src/suffix_tree/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::collections::{btree_set::Range, BTreeSet};

mod suffix_iter;

type Vertex = (Option<Index>, usize);
type Vertex = (Option<Index>, usize, bool);
type Index = (usize, usize);
pub(super) type Edge<'a> = (usize, usize, &'a str);

Expand All @@ -14,7 +14,7 @@ pub(super) struct SuffixTree<'a> {
impl<'a> SuffixTree<'a> {
/// Builds a tree whose vertex list is pre-sized for `capacity` entries and seeded at
/// index 0, together with a default-constructed edge set.
pub(super) fn with_capacity(capacity: usize) -> Self {
let mut vertices = Vec::with_capacity(capacity);
// Seed vertex: no index, zero count (and, in the three-field form, a cleared flag).
vertices.push((None, 0));
vertices.push((None, 0, false));
let edges = Default::default();
Self { vertices, edges }
}
Expand All @@ -24,26 +24,37 @@ impl<'a> SuffixTree<'a> {
}

/// Inserts the suffix `s`, tagged with index `i`, below vertex `v`.
///
/// On every call the second field of `v` is incremented by one, and the third field
/// (a flag) of `v` is OR-ed with the flag of the child that was descended into or created.
/// New leaves get their flag from `starting_sura(i.0)`.
fn construct_suffix(&mut self, i: Index, v: usize, s: &'a str) {
match { self.edges_from(v) }.find_map(|&e| Some(e).zip(Self::longest_prefix(s, e.2))) {
// Take the first edge leaving `v` for which `longest_prefix` returns a prefix `p`
// (presumably the common prefix of `s` and the edge label — confirm in `longest_prefix`).
let edge_prefix_pair = self
.edges_from(v)
.find_map(|&(v, w, l)| Some((v, w, l)).zip(Self::longest_prefix(s, l)));
match edge_prefix_pair {
// `p` is as long as the whole label and `s` is longer: descend into `w` with the
// remainder of `s`, then pull `w`'s flag up into `v`.
Some(((_, w, l), p)) if l.len() == p.len() && s.len() > p.len() => {
self.construct_suffix(i, w, &s[p.len()..]);
self.vertices[v].2 |= self.vertices[w].2;
}
// Otherwise split the edge at `p`: `x` is the new branching vertex, `w` keeps the
// tail of the old label, and the new leaf `y` receives the tail of `s`.
Some(((v, w, l), p)) => {
self.edges.remove(&(v, w, l));
let x = self.add_vertex((None, self.vertices[w].1 + 1));
let y = self.add_vertex((Some(i), 1));
let x = self.add_vertex((None, self.vertices[w].1 + 1, false));
let y = self.add_vertex((Some(i), 1, Self::starting_sura(i.0)));
self.edges.insert((v, x, p));
self.edges.insert((x, w, &l[p.len()..]));
self.edges.insert((x, y, &s[p.len()..]));
// `x` is flagged when either of its two children is; `v` inherits that.
self.vertices[x].2 = self.vertices[w].2 || self.vertices[y].2;
self.vertices[v].2 |= self.vertices[x].2;
}
// No edge yields a prefix: attach a new leaf labelled with all of `s` under `v`.
None => {
let w = self.add_vertex((Some(i), 1));
let w = self.add_vertex((Some(i), 1, Self::starting_sura(i.0)));
self.edges.insert((v, w, s));
self.vertices[v].2 |= self.vertices[w].2;
}
}
// One more suffix now lives below `v`.
self.vertices[v].1 += 1;
}

/// Tells whether `i` is one of the offsets listed in `crate::SURA_STARTS`.
///
/// The lookup is a binary search, so it relies on that table being sorted.
fn starting_sura(i: usize) -> bool {
    matches!(crate::SURA_STARTS.binary_search(&i), Ok(_))
}

/// Returns the edges whose first component (the source vertex) is `v`.
///
/// Edges are kept in an ordered set, so all edges of `v` lie in the half-open range
/// from `(v, 0, "")` up to, but not including, `(v + 1, 0, "")`.
pub(super) fn edges_from(&self, v: usize) -> Range<Edge<'a>> {
    let lower: Edge<'a> = (v, 0, "");
    let upper: Edge<'a> = (v + 1, 0, "");
    self.edges.range(lower..upper)
}
Expand Down

0 comments on commit 135f8f0

Please sign in to comment.