diff --git a/.gitignore b/.gitignore index 8757073..5df95de 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,7 @@ */tmp scratch.rs data_prep -*.csv \ No newline at end of file +*.csv +.DS_Store +profile.json +/scratch \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 325d569..874bc99 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -137,6 +137,8 @@ pub fn tokenize(address: &str) -> Vec<String> { let address: String = clean_address(address); address + .replace('&', " & ") + .replace('#', " # ") .split([' ', ',', ';', ')', '\n'].as_ref()) .filter(|x| !x.is_empty()) .map(|s| s.to_string()) diff --git a/tests/test_tokenize.rs b/tests/test_tokenize.rs new file mode 100644 index 0000000..91647df --- /dev/null +++ b/tests/test_tokenize.rs @@ -0,0 +1,22 @@ +use us_addrs::tokenize; + +#[test] +fn test_tokenizing() { + let mut tokens = tokenize("# 1 abc st"); + assert_eq!(tokens, vec!["#", "1", "abc", "st"]); + + tokens = tokenize("#1 abc st"); + assert_eq!(tokens, vec!["#", "1", "abc", "st"]); + + tokens = tokenize("box # 1 abc st"); + assert_eq!(tokens, vec!["box", "#", "1", "abc", "st"]); + + tokens = tokenize("box #1 abc st"); + assert_eq!(tokens, vec!["box", "#", "1", "abc", "st"]); + + tokens = tokenize("box# 1 abc st"); + assert_eq!(tokens, vec!["box", "#", "1", "abc", "st"]); + + tokens = tokenize("box#1 abc st"); + assert_eq!(tokens, vec!["box", "#", "1", "abc", "st"]); +}