From 7ba3cfccf27c5ef3c9ee54e0cf252083afc48989 Mon Sep 17 00:00:00 2001 From: Raphael Paul Laude Date: Fri, 3 Nov 2023 09:48:20 -0400 Subject: [PATCH] group by tag in csv export --- README.md | 10 ++++++++-- src/lib.rs | 18 ++++++++++++++++++ src/main.rs | 5 +++-- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 795bc07..8f6ffed 100644 --- a/README.md +++ b/README.md @@ -4,14 +4,14 @@ US Addrs is a rust crate for parsing unstructured United States address strings It is a rust implementation of the awesome [usaddress](https://github.com/datamade/usaddress/tree/master) library. Thank you to the folks at [datamade](https://datamade.us/) for releasing such a cool tool. -US Addrs is currently _79% (~5x) faster_ than usaddress, though additional optimizations should be possible. Accuracy stats TK. +US Addrs is currently _79% (~5x) faster_ than usaddress, though additional optimizations should be possible. [Accuracy](#accuracy) is close to usaddress but not quite matching yet. The goal of this implementation is to faciliate use cases requiring better performance, such as geocoding large batches of addresses. :warning: This crate is under **active development** and may not match usaddress's accuracy. US Addrs will be better tested / documented shortly. ## Examples -US Addrs can be run from the command line +US Addrs can be run from the command line to parse an address ```bash cargo run -- parse --address '33 Nassau Avenue, Brooklyn, NY' @@ -20,6 +20,12 @@ cargo run -- parse --address '33 Nassau Avenue, Brooklyn, NY' [("33", "AddressNumber"), ("Nassau", "StreetName"), ("Avenue", "StreetNamePostType"), ("Brooklyn", "PlaceName"), ("NY", "StateName")] ``` +or export a list of addresses to CSV + +```bash +cargo run -- parse-file --file-path tests/test_data/test_addrs.txt test.csv +``` + or by importing the crate and using the `parse` function ```rust diff --git a/src/lib.rs b/src/lib.rs index ed48626..325d569 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -324,3 +324,21 @@ pub fn read_xml_tagged_addresses(file_path: &str) -> (Vec, Vec) -> Vec<(String, String)> { + let mut result = Vec::new(); + let mut tokens = tokens.into_iter().peekable(); + + while let Some((mut token, tag)) = tokens.next() { + while tokens + .peek() + .map_or(false, |(_, ref next_tag)| &tag == next_tag) + { + let (next_token, _) = tokens.next().unwrap(); + token = format!("{} {}", token, next_token); + } + result.push((token, tag)); + } + + result +} diff --git a/src/main.rs b/src/main.rs index e49f361..5a63790 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,6 @@ use clap::Parser; use us_addrs::train::train_model; -use us_addrs::{parse, parse_addresses_from_txt, TAGS}; +use us_addrs::{group_by_tag, parse, parse_addresses_from_txt, TAGS}; // use std::path::PathBuf; @@ -44,10 +44,11 @@ fn main() { wtr.write_record(TAGS.iter()).unwrap(); for tagged_address in parsed_addresses { + let group_tagged_address = group_by_tag(tagged_address); let mut record = Vec::new(); for tag in TAGS.iter() { - if let Some((token, _)) = tagged_address + if let Some((token, _)) = group_tagged_address .iter() .find(|&(_, token_tag)| *token_tag == *tag) {