Commit

lets_fucking_gooooooooo,the_special_iterator_is_fine_AF

Harsha-vardhan-R committed Sep 29, 2023
1 parent bef28f0 commit 512942f
Showing 3 changed files with 56 additions and 25 deletions.
Cargo.toml (2 changes: 1 addition & 1 deletion)

@@ -14,7 +14,7 @@ rand = "*"
 rayon = "*"
 
 [profile.dev]
-opt-level = 3
+opt-level = 0
 
 [profile.release]
 opt-level = 3
src/feature_extraction/tokenisation.rs (59 changes: 38 additions & 21 deletions)

@@ -3,8 +3,6 @@
 use std::collections::HashMap;
 use std::collections::hash_map::Entry;
-use rand::seq::index;
-
 use crate::data_frame::data_type::data_type;
 use crate::data_frame::data_frame::data_frame;

@@ -14,6 +12,8 @@ pub struct Tokens<'a> {
     token_distribution: Vec<usize>, //store the rarity of each word in a row matrix.
 }
 
+
+//special whitespace and special character dividing iterator.
 pub struct SpecialStr<'a> {
     string: &'a str,
     back: usize, //index of the back of the &str substring.
@@ -28,30 +28,43 @@ impl<'a> SpecialStr<'a> {
     }
 }
 
-//to split at all the places which are not
+//to split at all the places which are special.
 //but we need to give some special importance to the '?', '!', '', ''
 
 fn is_special(c: char) -> bool {
-    !c.is_ascii_alphanumeric()
+    !c.is_ascii_alphanumeric() && !c.is_whitespace()
 }
 
 //this iterator divides "I am an assHoLe!." into an iterator which gives out ("i", "am", "an", "asshole", "!", ".")
+//we are going to split at the whitespaces and at any special characters.
 
 impl<'a> Iterator for SpecialStr<'a> {
     type Item = &'a str;
 
     fn next(&mut self) -> Option<Self::Item> {
         let input_string: &str = self.string;
         let max_index = self.string.len();
 
-        for i in self.back..max_index {
-            if !is_special(self.string.chars().nth(i).unwrap()) {
-                for j in (i+1)..max_index {
-                    if is_special(self.string.chars().nth(j).unwrap()) || j == max_index {
-                        self.back = j;
-                        return Some(&input_string[i..j]);
+        for front in self.back..max_index {
+            //if the present char is a special character, return it by itself.
+            if is_special(self.string.chars().nth(front).unwrap()) {
+                self.back += 1;
+                return Some(&input_string[self.back-1..self.back]);
+            } else if !self.string.chars().nth(front).unwrap().is_whitespace() {
+                //if it is not special, take a substring ending one before the next special character or whitespace.
+                for back in front+1..max_index {
+                    if is_special(self.string.chars().nth(back).unwrap()) || self.string.chars().nth(back).unwrap().is_whitespace() || back == max_index-1 {
+                        self.back = back;
+                        return Some(&input_string[front..back]);
                     }
                 }
+            } else {
+                self.back += 1;
             }
         }
 
         None
     }
 }
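
Side note on the new next(): self.string.chars().nth(..) is char-based and O(n) per call, while the slice &input_string[front..back] is byte-based, so non-ASCII input can split at the wrong place or panic; and the back == max_index-1 arm returns input_string[front..back], which drops the final character of a word that runs to the end of the string ("abc" yields only "ab"). Below is a minimal sketch of the same split built on char_indices(), which hands out byte offsets directly; special_split is an illustrative name, not part of the commit:

fn is_special(c: char) -> bool {
    !c.is_ascii_alphanumeric() && !c.is_whitespace()
}

//same contract as the iterator: whitespace separates, each special char is yielded alone.
fn special_split(s: &str) -> Vec<&str> {
    let mut out = Vec::new();
    let mut iter = s.char_indices().peekable();
    while let Some((start, c)) = iter.next() {
        if c.is_whitespace() {
            continue; //whitespace only separates, it is never yielded
        }
        if is_special(c) {
            out.push(&s[start..start + c.len_utf8()]); //one special character by itself
            continue;
        }
        //alphanumeric run: extend until the next whitespace or special character
        let mut end = start + c.len_utf8();
        while let Some(&(i, c2)) = iter.peek() {
            if c2.is_whitespace() || is_special(c2) {
                break;
            }
            end = i + c2.len_utf8();
            iter.next();
        }
        out.push(&s[start..end]);
    }
    out
}

On the first test string below, "happy bday. .i.", this sketch yields ["happy", "bday", ".", ".", "i", "."].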

@@ -67,37 +80,41 @@ impl<'a> Tokens<'a> {
     }
 
     //possible only for the string data type.
-    pub fn tokenise(&mut self, frame : &data_frame, column_index : usize) {
+    pub fn tokenise(&mut self, frame : &'a data_frame, column_index : usize) {
 
         //not preallocating the memory because we do not know the number of individual words we are going to come across.
         let mut token_distribution: Vec<usize> = vec![];
         let mut count: usize = 0_usize;
-        let mut sparse: Vec<Vec<usize>> = vec![vec![]];
+        let mut sparse: Vec<Vec<usize>> = vec![vec![0_usize ; frame.number_of_samples.try_into().unwrap()]];
         let mut mapper: HashMap<&str, usize> = HashMap::new();
 
-        /* match &frame.data[column_index] {
+        match &frame.data[column_index] {
             data_type::Strings(temp) => {
                 //for each sentence in the given column.
-                for sentence in temp {
+                for (i , sentence) in temp.iter().enumerate() {
                     //comparing each word after making it lowercase.
-                    for word in sentence.split_whitespace().into_iter() {
+                    for word in SpecialStr::new(&sentence).into_iter() {
                         //if the word is already present we add 1 to the token distribution at that index,
                         //otherwise we insert it with the value 1.
                         match self.column_index.entry(word) {
                             Entry::Occupied(temp) => {
-                                let &index_of = temp.get(); //where do we need to locate the
+                                let &index_of = temp.get(); //where do we need to locate the word.
                                 token_distribution[index_of] += 1;
-                                //sparse[index_of][]
+                                sparse[index_of][i] += 1;
                             },
-                            Entry::Vacant(entry) => {
+                            Entry::Vacant(mut entry) => {
                                 entry.insert(count);
                                 token_distribution.push(1);
+                                sparse.push(vec![0_usize ; frame.number_of_samples.try_into().unwrap()]);
+                                sparse[count][i] += 1;
                                 count += 1;
                             },
                         }
                     }
                 }
             },
             _ => panic!("You cannot tokenise the float or the category data type"),
-        } */
+        }
    }
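
data_frame and data_type are not shown in this diff, so here is a self-contained sketch of the same Entry-based counting over a plain Vec<String> column. Two observations it sidesteps: the committed version seeds sparse with one row up front and also pushes a row per new token, leaving one extra all-zero row at the end, and its mapper appears unused in this hunk:

use std::collections::HashMap;
use std::collections::hash_map::Entry;

//column_index maps each token to its row, token_distribution holds the total
//count per token, and sparse[token][sample] counts that token per sample.
//A &[String] stands in for the Strings column of the data_frame.
fn tokenise(column: &[String]) -> (HashMap<&str, usize>, Vec<usize>, Vec<Vec<usize>>) {
    let samples = column.len();
    let mut column_index: HashMap<&str, usize> = HashMap::new();
    let mut token_distribution: Vec<usize> = vec![];
    let mut sparse: Vec<Vec<usize>> = vec![];
    let mut count = 0_usize;

    for (i, sentence) in column.iter().enumerate() {
        for word in sentence.split_whitespace() { //the commit uses SpecialStr::new(sentence) here
            match column_index.entry(word) {
                Entry::Occupied(e) => {
                    let &index_of = e.get();
                    token_distribution[index_of] += 1;
                    sparse[index_of][i] += 1;
                },
                Entry::Vacant(e) => {
                    e.insert(count);
                    token_distribution.push(1);
                    sparse.push(vec![0_usize; samples]); //row allocated only when a token is first seen
                    sparse[count][i] += 1;
                    count += 1;
                },
            }
        }
    }
    (column_index, token_distribution, sparse)
}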


src/feature_extraction/tokenisation_test.rs (20 changes: 17 additions & 3 deletions)

@@ -1,14 +1,28 @@
 #[cfg(test)]
 
 #[test]
 
-#[test]
 fn divide_n_print() {
     use super::tokenisation::SpecialStr;
 
-    let input = "**Tha&khva a#% vhbavbevb{}pp'llop'.";
+    let input = "happy bday. .i.";
     let new_one = SpecialStr::new(&input);
 
     let temp: Vec<&str> = new_one.into_iter().collect();
     print!("{:?}", temp);
+    //assert_eq!(temp , vec![]);
 }
 
 
+fn divide_n_print_2() {
+    use super::tokenisation::SpecialStr;
+
+    let input = "hvcvk (vuk == gvyv'' jvv( hvktvk";
+    let new_one = SpecialStr::new(&input);
+
+    let temp: Vec<&str> = new_one.into_iter().collect();
+
-    assert_eq!(temp , vec!["Tha", "khva", "a", "vhbavbevb", "pp", "llop"]);
+    println!("{:?}", temp);
+
+    //assert_eq!(temp , vec!["hvcvk", "vuk", "gvyv", "jvv", "hvktvk"]);
+}
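
As committed, divide_n_print_2 has no #[test] attribute, so cargo test compiles it but never runs it, and both tests only print rather than assert. A sketch of an asserting variant follows; the expected values are for the special_split sketch shown earlier (words kept whole, each special character yielded alone, whitespace dropped), not a recorded output of the committed iterator, whose end-of-string handling would clip the final "hvktvk" to "hvktv":

#[test]
fn divide_n_print_2_asserting() {
    let input = "hvcvk (vuk == gvyv'' jvv( hvktvk";

    //expected under the special_split sketch above (hypothetical helper),
    //not under the committed SpecialStr
    let temp: Vec<&str> = special_split(input);
    assert_eq!(
        temp,
        vec!["hvcvk", "(", "vuk", "=", "=", "gvyv", "'", "'", "jvv", "(", "hvktvk"]
    );
}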

1 comment on commit 512942f

@Harsha-vardhan-R (Owner, Author)

The modification in this commit is pretty much exactly what I want to achieve with the iterator implementation.