Commit

lets_fucking_gooooooooo,the_special_iterator_is_fine_AF

Harsha-vardhan-R committed Sep 29, 2023
1 parent bef28f0 commit 512942f
Showing 3 changed files with 56 additions and 25 deletions.
Cargo.toml (2 changes: 1 addition & 1 deletion)

@@ -14,7 +14,7 @@ rand = "*"
 rayon = "*"
 
 [profile.dev]
-opt-level = 3
+opt-level = 0
 
 [profile.release]
 opt-level = 3
src/feature_extraction/tokenisation.rs (59 changes: 38 additions & 21 deletions)

@@ -3,8 +3,6 @@
 use std::collections::HashMap;
 use std::collections::hash_map::Entry;
-use rand::seq::index;
-
 use crate::data_frame::data_type::data_type;
 use crate::data_frame::data_frame::data_frame;

@@ -14,6 +12,8 @@ pub struct Tokens<'a> {
     token_distribution: Vec<usize>, //store the rarity of each word in a row matrix.
 }
 
+
+//special whitespace and special character dividing iterator.
 pub struct SpecialStr<'a> {
     string: &'a str,
     back: usize, //index of the back of the &str substring.
@@ -28,30 +28,43 @@ impl<'a> SpecialStr<'a> {
     }
 }
 
-//to split at all the places which are not
+//to split at all the places which are special.
 //but we need to give some special importance to the '?', '!', '', ''
 
 fn is_special(c: char) -> bool {
-    !c.is_ascii_alphanumeric()
+    !c.is_ascii_alphanumeric() && !c.is_whitespace()
 }
 
 //this iterator divides "I am an assHoLe!." into an iterator which gives out ("i", "am", "an", "asshole", "!", ".")
+//we are going to split at the whitespaces and at any special characters.
 
 impl<'a> Iterator for SpecialStr<'a> {
     type Item = &'a str;
 
     fn next(&mut self) -> Option<Self::Item> {
         let input_string: &str = self.string;
         let max_index = self.string.len();
 
-        for i in self.back..max_index {
-            if !is_special(self.string.chars().nth(i).unwrap()) {
-                for j in (i+1)..max_index {
-                    if is_special(self.string.chars().nth(j).unwrap()) || j == max_index {
-                        self.back = j;
-                        return Some(&input_string[i..j]);
+        for front in self.back..max_index {
+            //if the present char is a special character, return it by itself.
+            if is_special(self.string.chars().nth(front).unwrap()) {
+                self.back += 1;
+                return Some(&input_string[self.back-1..self.back]);
+            } else if !self.string.chars().nth(front).unwrap().is_whitespace() {
+                //if it is not special, take a substring ending one before the next special character or whitespace.
+                for back in front+1..max_index {
+                    if is_special(self.string.chars().nth(back).unwrap()) || self.string.chars().nth(back).unwrap().is_whitespace() || back == max_index-1 {
+                        self.back = back;
+                        return Some(&input_string[front..back]);
                     }
                 }
+            } else {
+                self.back += 1;
             }
         }
 
         None
     }
 }
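
Side note on the new next(): self.string.chars().nth(..) is char-based and O(n) per call, while the slice &input_string[front..back] is byte-based, so non-ASCII input can split at the wrong place or panic; and the back == max_index-1 arm returns input_string[front..back], which drops the final character of a word that runs to the end of the string ("abc" yields only "ab"). Below is a minimal sketch of the same split built on char_indices(), which hands out byte offsets directly; special_split is an illustrative name, not part of the commit:

fn is_special(c: char) -> bool {
    !c.is_ascii_alphanumeric() && !c.is_whitespace()
}

//same contract as the iterator: whitespace separates, each special char is yielded alone.
fn special_split(s: &str) -> Vec<&str> {
    let mut out = Vec::new();
    let mut iter = s.char_indices().peekable();
    while let Some((start, c)) = iter.next() {
        if c.is_whitespace() {
            continue; //whitespace only separates, it is never yielded
        }
        if is_special(c) {
            out.push(&s[start..start + c.len_utf8()]); //one special character by itself
            continue;
        }
        //alphanumeric run: extend until the next whitespace or special character
        let mut end = start + c.len_utf8();
        while let Some(&(i, c2)) = iter.peek() {
            if c2.is_whitespace() || is_special(c2) {
                break;
            }
            end = i + c2.len_utf8();
            iter.next();
        }
        out.push(&s[start..end]);
    }
    out
}

On the first test string below, "happy bday. .i.", this sketch yields ["happy", "bday", ".", ".", "i", "."].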

@@ -67,37 +80,41 @@ impl<'a> Tokens<'a> {
     }
 
     //possible only for the string data type.
-    pub fn tokenise(&mut self, frame : &data_frame, column_index : usize) {
+    pub fn tokenise(&mut self, frame : &'a data_frame, column_index : usize) {
 
         //not preallocating the memory because we do not know the number of individual words we are going to come across.
         let mut token_distribution: Vec<usize> = vec![];
         let mut count: usize = 0_usize;
-        let mut sparse: Vec<Vec<usize>> = vec![vec![]];
+        let mut sparse: Vec<Vec<usize>> = vec![vec![0_usize ; frame.number_of_samples.try_into().unwrap()]];
         let mut mapper: HashMap<&str, usize> = HashMap::new();
 
-        /* match &frame.data[column_index] {
+        match &frame.data[column_index] {
             data_type::Strings(temp) => {
                 //for each sentence in the given column.
-                for sentence in temp {
+                for (i , sentence) in temp.iter().enumerate() {
                     //comparing each word after making it lowercase.
-                    for word in sentence.split_whitespace().into_iter() {
+                    for word in SpecialStr::new(&sentence).into_iter() {
                         //if the word is already present we add 1 to the token distribution at that index,
                         //otherwise we insert it with the value 1.
                         match self.column_index.entry(word) {
                             Entry::Occupied(temp) => {
-                                let &index_of = temp.get(); //where do we need to locate the
+                                let &index_of = temp.get(); //where do we need to locate the word.
                                 token_distribution[index_of] += 1;
-                                //sparse[index_of][]
+                                sparse[index_of][i] += 1;
                             },
-                            Entry::Vacant(entry) => {
+                            Entry::Vacant(mut entry) => {
                                 entry.insert(count);
                                 token_distribution.push(1);
+                                sparse.push(vec![0_usize ; frame.number_of_samples.try_into().unwrap()]);
+                                sparse[count][i] += 1;
                                 count += 1;
                             },
                         }
                     }
                 }
             },
             _ => panic!("You cannot tokenise the float or the category data type"),
-        } */
+        }
    }
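
data_frame and data_type are not shown in this diff, so here is a self-contained sketch of the same Entry-based counting over a plain Vec<String> column. Two observations it sidesteps: the committed version seeds sparse with one row up front and also pushes a row per new token, leaving one extra all-zero row at the end, and its mapper appears unused in this hunk:

use std::collections::HashMap;
use std::collections::hash_map::Entry;

//column_index maps each token to its row, token_distribution holds the total
//count per token, and sparse[token][sample] counts that token per sample.
//A &[String] stands in for the Strings column of the data_frame.
fn tokenise(column: &[String]) -> (HashMap<&str, usize>, Vec<usize>, Vec<Vec<usize>>) {
    let samples = column.len();
    let mut column_index: HashMap<&str, usize> = HashMap::new();
    let mut token_distribution: Vec<usize> = vec![];
    let mut sparse: Vec<Vec<usize>> = vec![];
    let mut count = 0_usize;

    for (i, sentence) in column.iter().enumerate() {
        for word in sentence.split_whitespace() { //the commit uses SpecialStr::new(sentence) here
            match column_index.entry(word) {
                Entry::Occupied(e) => {
                    let &index_of = e.get();
                    token_distribution[index_of] += 1;
                    sparse[index_of][i] += 1;
                },
                Entry::Vacant(e) => {
                    e.insert(count);
                    token_distribution.push(1);
                    sparse.push(vec![0_usize; samples]); //row allocated only when a token is first seen
                    sparse[count][i] += 1;
                    count += 1;
                },
            }
        }
    }
    (column_index, token_distribution, sparse)
}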


src/feature_extraction/tokenisation_test.rs (20 changes: 17 additions & 3 deletions)

@@ -1,14 +1,28 @@
 #[cfg(test)]
 
 #[test]
 
-#[test]
 fn divide_n_print() {
     use super::tokenisation::SpecialStr;
 
-    let input = "**Tha&khva a#% vhbavbevb{}pp'llop'.";
+    let input = "happy bday. .i.";
     let new_one = SpecialStr::new(&input);
 
     let temp: Vec<&str> = new_one.into_iter().collect();
     print!("{:?}", temp);
+    //assert_eq!(temp , vec![]);
 }
 
 
+fn divide_n_print_2() {
+    use super::tokenisation::SpecialStr;
+
+    let input = "hvcvk (vuk == gvyv'' jvv( hvktvk";
+    let new_one = SpecialStr::new(&input);
+
+    let temp: Vec<&str> = new_one.into_iter().collect();
+
-    assert_eq!(temp , vec!["Tha", "khva", "a", "vhbavbevb", "pp", "llop"]);
+    println!("{:?}", temp);
+
+    //assert_eq!(temp , vec!["hvcvk", "vuk", "gvyv", "jvv", "hvktvk"]);
+}
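
As committed, divide_n_print_2 has no #[test] attribute, so cargo test compiles it but never runs it, and both tests only print rather than assert. A sketch of an asserting variant follows; the expected values are for the special_split sketch shown earlier (words kept whole, each special character yielded alone, whitespace dropped), not a recorded output of the committed iterator, whose end-of-string handling would clip the final "hvktvk" to "hvktv":

#[test]
fn divide_n_print_2_asserting() {
    let input = "hvcvk (vuk == gvyv'' jvv( hvktvk";

    //expected under the special_split sketch above (hypothetical helper),
    //not under the committed SpecialStr
    let temp: Vec<&str> = special_split(input);
    assert_eq!(
        temp,
        vec!["hvcvk", "(", "vuk", "=", "=", "gvyv", "'", "'", "jvv", "(", "hvktvk"]
    );
}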

1 comment on commit 512942f

@Harsha-vardhan-R (Owner, Author)

The modification in this commit is pretty much exactly what I want to achieve with the iterator implementation.