Skip to content

Commit

Permalink
add csv export
Browse files Browse the repository at this point in the history
  • Loading branch information
raphaellaude committed Nov 3, 2023
1 parent a816ab8 commit 0a56c93
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 29 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
/target
*/tmp
scratch.rs
data_prep
data_prep
*.csv
22 changes: 22 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ unicode-normalization = "0.1.22"
clap = { version = "4.4.6", features = ["derive"] }
crfsuite = "0.3.1"
xml-rs = "0.8.19"
csv = "1.3.0"

[dev-dependencies]
criterion = "0.5.1"
Expand Down
70 changes: 43 additions & 27 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,33 +10,35 @@ pub mod train;

use abbreviations::{DIRECTIONALS, STREET_NAMES};

pub enum Tag {
AddressNumberPrefix,
AddressNumber,
AddressNumberSuffix,
StreetNamePreModifier,
StreetNamePreDirectional,
StreetNamePreType,
StreetName,
StreetNamePostType,
StreetNamePostDirectional,
SubaddressType,
SubaddressIdentifier,
BuildingName,
OccupancyType,
OccupancyIdentifier,
CornerOf,
LandmarkName,
PlaceName,
StateName,
ZipCode,
USPSBoxType,
USPSBoxID,
USPSBoxGroupType,
USPSBoxGroupID,
IntersectionSeparator,
Recipient,
NotAddress,
lazy_static! {
// The 26 address-component tag names, stored as string labels.
// The `ParseFile` command in src/main.rs writes this array as the CSV header
// row and then fills one column per tag by iterating it again, so the order
// of the entries here is the column order of the exported CSV.
pub static ref TAGS: [&'static str; 26] = [
"AddressNumberPrefix",
"AddressNumber",
"AddressNumberSuffix",
"StreetNamePreModifier",
"StreetNamePreDirectional",
"StreetNamePreType",
"StreetName",
"StreetNamePostType",
"StreetNamePostDirectional",
"SubaddressType",
"SubaddressIdentifier",
"BuildingName",
"OccupancyType",
"OccupancyIdentifier",
"CornerOf",
"LandmarkName",
"PlaceName",
"StateName",
"ZipCode",
"USPSBoxType",
"USPSBoxID",
"USPSBoxGroupType",
"USPSBoxGroupID",
"IntersectionSeparator",
"Recipient",
"NotAddress",
];
}

lazy_static! {
Expand All @@ -55,6 +57,20 @@ pub fn parse(address: &str) -> Vec<(String, String)> {
zip_tokens_and_tags(tokens, tags)
}

/// Parses every address in `addresses` with [`parse`].
///
/// Returns one `Vec` of `(token, tag)` pairs per input address, in the same
/// order as the inputs.
pub fn parse_addresses(addresses: Vec<&str>) -> Vec<Vec<(String, String)>> {
    // Deliberately sequential: the original author's note records that plain
    // `.iter()` was 42% faster than `.par_iter()` for this workload.
    let mut parsed = Vec::with_capacity(addresses.len());
    for address in addresses {
        parsed.push(parse(address));
    }
    parsed
}

/// Reads the text file at `file_path` and parses each of its lines as one address.
///
/// Every line returned by `str::lines` is passed to [`parse_addresses`],
/// including empty ones, so the result has one entry per line of the file.
///
/// # Panics
///
/// Panics if the file cannot be read into a `String` (for example, it does not
/// exist or is not valid UTF-8). The panic message names the offending path and
/// the underlying I/O error.
pub fn parse_addresses_from_txt(file_path: &str) -> Vec<Vec<(String, String)>> {
    // The signature has no error channel, so a read failure still panics, but
    // with enough context for the user to see which path was at fault.
    let raw_data = std::fs::read_to_string(file_path).unwrap_or_else(|err| {
        panic!("failed to read address file `{}`: {}", file_path, err)
    });

    parse_addresses(raw_data.lines().collect())
}

/// Pairs each token with the tag at the same position.
///
/// Returns `(token, tag)` tuples in input order. If the two vectors differ in
/// length, the result stops at the end of the shorter one.
pub fn zip_tokens_and_tags(tokens: Vec<String>, tags: Vec<String>) -> Vec<(String, String)> {
    let pair_count = tokens.len().min(tags.len());
    let mut pairs = Vec::with_capacity(pair_count);
    pairs.extend(tokens.into_iter().zip(tags));
    pairs
}
Expand Down
34 changes: 33 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
use clap::Parser;
use us_addrs::parse;
use us_addrs::train::train_model;
use us_addrs::{parse, parse_addresses_from_txt, TAGS};

// use std::path::PathBuf;

// Subcommands of the command-line tool; `main` matches on the parsed variant.
// Plain `//` comments are used here on purpose: `///` doc comments on clap
// derive items are picked up as help text and would change the CLI output.
#[derive(Parser)]
enum USAddrsCli {
// Calls `train_model` with the `export_path` from the args and prints
// whether training succeeded or failed.
Train(TrainArgs),
// Parses the single `address` string from the args and prints the result
// using `{:?}` formatting.
Parse(ParseArgs),
// Parses the addresses in the file at `file_path` and writes them as a CSV
// file at `export_path`, one column per entry of `TAGS`.
ParseFile(ParseFileArgs),
}

#[derive(Parser)]
Expand All @@ -22,12 +23,43 @@ struct ParseArgs {
address: String,
}

// Arguments for the `ParseFile` subcommand.
// Plain `//` comments are used so the clap-generated help text is unchanged.
#[derive(Parser)]
struct ParseFileArgs {
// Path of the input text file; `main` passes it to `parse_addresses_from_txt`.
#[clap(short, long)]
file_path: String,
// Path of the CSV file that `main` creates with `csv::Writer::from_path`.
// NOTE(review): this field has no `#[clap(...)]` attribute, so clap should
// treat it as a positional argument while `file_path` is a flag — confirm
// that the asymmetry is intended.
export_path: String,
}

fn main() {
match USAddrsCli::parse() {
USAddrsCli::Train(args) => match train_model(&args.export_path) {
Ok(()) => println!("Trained model"),
Err(e) => println!("Error training model: {}", e),
},
USAddrsCli::ParseFile(args) => {
let parsed_addresses = parse_addresses_from_txt(&args.file_path);
// write as CSV with Tags as columns
let mut wtr = csv::Writer::from_path(&args.export_path).unwrap();

wtr.write_record(TAGS.iter()).unwrap();

for tagged_address in parsed_addresses {
let mut record = Vec::new();

for tag in TAGS.iter() {
if let Some((token, _)) = tagged_address
.iter()
.find(|&(_, token_tag)| *token_tag == *tag)
{
record.push(token.to_string());
} else {
record.push("".to_string());
}
}
wtr.write_record(&record).unwrap();
}
wtr.flush().unwrap();
}
USAddrsCli::Parse(args) => {
let parsed = parse(&args.address);
println!("{:?}", parsed);
Expand Down

0 comments on commit 0a56c93

Please sign in to comment.