Skip to content

Commit

Permalink
[ucd/unihan] Add the unihan crate
Browse files Browse the repository at this point in the history
  • Loading branch information
eyeplum committed Mar 4, 2019
1 parent 89f5f55 commit fa97f5c
Show file tree
Hide file tree
Showing 38 changed files with 200,877 additions and 0 deletions.
1 change: 1 addition & 0 deletions gen/src/source/ucd/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ pub mod prop_list;
pub mod readme;
pub mod sentence_break_property;
pub mod unicode_data;
pub mod unihan;
pub mod word_break_property;

use regex::Regex;
Expand Down
65 changes: 65 additions & 0 deletions gen/src/source/ucd/unihan/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Copyright 2017 The UNIC Project Developers.
//
// See the COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

pub mod numeric_values;
pub mod readings;
pub mod variants;

use std::char;
use std::collections::BTreeMap;

use regex::Regex;

lazy_static! {
/// Regex matching a single data line of a Unihan data file.
///
/// Flags: `x` (verbose mode: whitespace inside the pattern is ignored and `#`
/// starts a comment) and `m` (multi-line mode: `^` matches at the start of
/// every line, not only at the start of the input).
///
/// Capture groups:
///
/// 1. the code point: 4 to 6 hexadecimal digits, without the `U+` prefix;
/// 2. the field key: `k` followed by ASCII letters, digits or underscores;
/// 3. the field value: everything after the second tab up to the end of the line.
///
/// Lines that do not start with `U+<hex>`, a tab, a key and another tab produce
/// no match at all.
///
/// The pattern is a constant, so the `unwrap` can only fail if the pattern
/// itself is edited into something invalid.
pub static ref UNIHAN_DATA_ENTRY_REGEX: Regex = Regex::new(
r"(?xm)^ # every line
U\+([[:xdigit:]]{4,6}) # [1]codepoint
\t # separator
(k[a-zA-Z0-9_]+) # [2]field key
\t # separator
(.*) # [3]field value
",
)
.unwrap();
}

/// A record that accumulates the Unihan fields belonging to one character.
///
/// `parse_entries_from_str` creates one value per code point with `new` and
/// then passes it every `(key, value)` pair found for that code point via
/// `update`.
pub trait DataEntry {
/// Creates the entry for `character`.
fn new(character: char) -> Self;
/// Applies one field to this entry: `key` is the field name (e.g. `kMandarin`)
/// and `value` is the raw field text taken from the data line.
fn update<'a>(&mut self, key: &'a str, value: &'a str);
}

/// Parses the text of a Unihan data file into one entry per character.
///
/// Every line matched by `UNIHAN_DATA_ENTRY_REGEX` contributes one
/// `(key, value)` pair to the entry of its code point; lines that do not match
/// are ignored. The first pair seen for a code point creates the entry with
/// `T::new`, and every pair (including the first) is passed to `T::update`.
///
/// The returned entries are sorted by code point, because they are collected
/// in a `BTreeMap` keyed by `char`.
///
/// The `Clone` bound is not needed by this implementation; it is kept so the
/// signature stays identical for existing callers.
///
/// # Panics
///
/// Panics if a matched code point is not a Unicode scalar value (a surrogate,
/// or a value above `U+10FFFF`).
pub fn parse_entries_from_str<T>(str: &str) -> Vec<T>
where
    T: DataEntry + Clone,
{
    let mut entry_map: BTreeMap<char, T> = BTreeMap::new();

    for capture in UNIHAN_DATA_ENTRY_REGEX.captures_iter(str) {
        let code_point = u32::from_str_radix(&capture[1], 16)
            .expect("the regex only captures 4 to 6 hexadecimal digits");
        let chr = char::from_u32(code_point)
            .expect("Unihan code point is not a Unicode scalar value");

        let key = &capture[2];
        // `.` matches `\r`, so a file with CRLF line endings would otherwise
        // leak the carriage return into every field value.
        let value = capture[3].trim_end_matches('\r');

        // One lookup: create the entry on first sight, then update it.
        entry_map
            .entry(chr)
            .or_insert_with(|| T::new(chr))
            .update(key, value);
    }

    // Move the entries out of the map instead of cloning each one.
    entry_map.into_iter().map(|(_, entry)| entry).collect()
}
96 changes: 96 additions & 0 deletions gen/src/source/ucd/unihan/numeric_values.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// Copyright 2017 The UNIC Project Developers.
//
// See the COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::str::FromStr;

use crate::source::utils::read;

use super::{parse_entries_from_str, DataEntry};

lazy_static! {
/// Numeric-value data parsed from `Unihan_NumericValues.txt`; see [Numeric values].
///
/// The text returned by `read` is parsed through the `FromStr` impl of
/// `NumericValuesData`, which always returns `Ok`, so the `unwrap` cannot
/// fail on the parse step.
///
/// NOTE(review): `read` comes from `crate::source::utils` and is not visible
/// here; presumably it returns the file contents as a string — confirm.
///
/// [Numeric values]: http://www.unicode.org/reports/tr38/#N1024D
pub static ref UNIHAN_NUMERIC_VALUES_DATA: NumericValuesData = {
read("external/unicode/ucd/data/Unihan/Unihan_NumericValues.txt").parse().unwrap()
};
}

/// The numeric fields recorded for one character.
///
/// Each field is `None` until `update` is called with the matching key, and
/// becomes `None` again if the most recent value for that key is not a valid
/// `u64`.
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct NumericValuesDataEntry {
    /// The character this entry describes.
    pub character: char,
    /// Value of the `kAccountingNumeric` field.
    pub accounting_numeric: Option<u64>,
    /// Value of the `kOtherNumeric` field.
    pub other_numeric: Option<u64>,
    /// Value of the `kPrimaryNumeric` field.
    pub primary_numeric: Option<u64>,
}

impl DataEntry for NumericValuesDataEntry {
    /// Creates an entry for `character` with every numeric field unset.
    fn new(character: char) -> NumericValuesDataEntry {
        NumericValuesDataEntry {
            character,
            accounting_numeric: None,
            other_numeric: None,
            primary_numeric: None,
        }
    }

    /// Stores `value` in the field selected by `key`; unknown keys are ignored.
    ///
    /// The value is parsed as a decimal `u64`. A value that does not parse
    /// sets the field to `None`.
    fn update<'a>(&mut self, key: &'a str, value: &'a str) {
        let field = match key {
            "kAccountingNumeric" => &mut self.accounting_numeric,
            "kOtherNumeric" => &mut self.other_numeric,
            "kPrimaryNumeric" => &mut self.primary_numeric,
            _ => return,
        };

        // `u64::from_str` rejects any surrounding whitespace, so a stray `\r`
        // (CRLF input) or trailing blank would silently turn a valid number
        // into `None`. Trim first.
        *field = value.trim().parse::<u64>().ok();
    }
}

/// The collection of numeric-value entries parsed from Unihan data text.
#[derive(Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct NumericValuesData {
    /// The parsed entries, in the order returned by `parse_entries_from_str`.
    pub entries: Box<[NumericValuesDataEntry]>,
}

impl FromStr for NumericValuesData {
    type Err = ();

    /// Builds the data set from the text of a Unihan data file.
    ///
    /// This never returns `Err`.
    fn from_str(str: &str) -> Result<Self, Self::Err> {
        let entries = parse_entries_from_str(str).into_boxed_slice();
        Ok(Self { entries })
    }
}

#[cfg(test)]
mod test {
    use super::super::DataEntry;
    use super::{NumericValuesData, NumericValuesDataEntry};

    /// Parses a three-line sample and checks that each line fills the numeric
    /// field named by its key.
    #[test]
    fn data_entry_parse() {
        let mut entry1 = NumericValuesDataEntry::new('\u{3405}');
        entry1.other_numeric = Some(5);

        let mut entry2 = NumericValuesDataEntry::new('\u{4EDF}');
        entry2.accounting_numeric = Some(1000);

        let mut entry3 = NumericValuesDataEntry::new('\u{5146}');
        entry3.primary_numeric = Some(1000000000000);

        let entries = vec![entry1, entry2, entry3];

        // The line regex requires a tab between code point, key and value.
        // Explicit `\t` escapes keep the separators visible and immune to
        // editors or tools that rewrite literal tabs as spaces.
        assert_eq!(
            "U+3405\tkOtherNumeric\t5\n\
             U+4EDF\tkAccountingNumeric\t1000\n\
             U+5146\tkPrimaryNumeric\t1000000000000\n\
             "
            .parse(),
            Ok(NumericValuesData {
                entries: entries.into_boxed_slice(),
            }),
        );
    }
}
92 changes: 92 additions & 0 deletions gen/src/source/ucd/unihan/readings.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// Copyright 2017 The UNIC Project Developers.
//
// See the COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::str::FromStr;

use crate::source::utils::read;

use super::{parse_entries_from_str, DataEntry};

lazy_static! {
/// Readings data parsed from `Unihan_Readings.txt`; see [Readings].
///
/// The text returned by `read` is parsed through the `FromStr` impl of
/// `ReadingsData`, which always returns `Ok`, so the `unwrap` cannot fail on
/// the parse step.
///
/// NOTE(review): `read` comes from `crate::source::utils` and is not visible
/// here; presumably it returns the file contents as a string — confirm.
///
/// [Readings]: http://www.unicode.org/reports/tr38/#N1019C
pub static ref UNIHAN_READINGS_DATA: ReadingsData = {
read("external/unicode/ucd/data/Unihan/Unihan_Readings.txt").parse().unwrap()
};
}

/// The reading and definition fields recorded for one character.
///
/// Every field holds the raw text of the most recent value passed to `update`
/// for its key, or `None` if that key was never seen.
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct ReadingsDataEntry {
    /// The character this entry describes.
    pub character: char,
    /// Raw text of the `kCantonese` field.
    pub cantonese: Option<String>,
    /// Raw text of the `kDefinition` field.
    pub definition: Option<String>,
    /// Raw text of the `kHangul` field.
    pub hangul: Option<String>,
    /// Raw text of the `kHanyuPinlu` field.
    pub hanyu_pinlu: Option<String>,
    /// Raw text of the `kHanyuPinyin` field.
    pub hanyu_pinyin: Option<String>,
    /// Raw text of the `kJapaneseKun` field.
    pub japanese_kun: Option<String>,
    /// Raw text of the `kJapaneseOn` field.
    pub japanese_on: Option<String>,
    /// Raw text of the `kKorean` field.
    pub korean: Option<String>,
    /// Raw text of the `kMandarin` field.
    pub mandarin: Option<String>,
    /// Raw text of the `kTang` field.
    pub tang: Option<String>,
    /// Raw text of the `kVietnamese` field.
    pub vietnamese: Option<String>,
    /// Raw text of the `kXHC1983` field.
    pub xhc_1983: Option<String>,
}

impl DataEntry for ReadingsDataEntry {
    /// Creates an entry for `character` with every field unset.
    fn new(character: char) -> ReadingsDataEntry {
        ReadingsDataEntry {
            character,
            cantonese: None,
            definition: None,
            hangul: None,
            hanyu_pinlu: None,
            hanyu_pinyin: None,
            japanese_kun: None,
            japanese_on: None,
            korean: None,
            mandarin: None,
            tang: None,
            vietnamese: None,
            xhc_1983: None,
        }
    }

    /// Stores a copy of `value` in the field selected by `key`; unknown keys
    /// are ignored.
    fn update<'a>(&mut self, key: &'a str, value: &'a str) {
        // Pick the destination first so the assignment is written only once.
        let slot = match key {
            "kCantonese" => &mut self.cantonese,
            "kDefinition" => &mut self.definition,
            "kHangul" => &mut self.hangul,
            "kHanyuPinlu" => &mut self.hanyu_pinlu,
            "kHanyuPinyin" => &mut self.hanyu_pinyin,
            "kJapaneseKun" => &mut self.japanese_kun,
            "kJapaneseOn" => &mut self.japanese_on,
            "kKorean" => &mut self.korean,
            "kMandarin" => &mut self.mandarin,
            "kTang" => &mut self.tang,
            "kVietnamese" => &mut self.vietnamese,
            "kXHC1983" => &mut self.xhc_1983,
            _ => return,
        };

        *slot = Some(String::from(value));
    }
}

/// The collection of readings entries parsed from Unihan data text.
#[derive(Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct ReadingsData {
    /// The parsed entries, in the order returned by `parse_entries_from_str`.
    pub entries: Box<[ReadingsDataEntry]>,
}

impl FromStr for ReadingsData {
    type Err = ();

    /// Builds the data set from the text of a Unihan data file.
    ///
    /// This never returns `Err`.
    fn from_str(str: &str) -> Result<Self, Self::Err> {
        let entries = parse_entries_from_str(str).into_boxed_slice();
        Ok(Self { entries })
    }
}
132 changes: 132 additions & 0 deletions gen/src/source/ucd/unihan/variants.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
// Copyright 2017 The UNIC Project Developers.
//
// See the COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::char;
use std::str::FromStr;

use regex::Regex;

use crate::source::utils::read;

use super::{parse_entries_from_str, DataEntry};

lazy_static! {
    /// Variants data parsed from `Unihan_Variants.txt`; see [Variants].
    ///
    /// The text returned by `read` is parsed through the `FromStr` impl of
    /// `VariantsData`, which always returns `Ok`.
    ///
    /// NOTE(review): `read` comes from `crate::source::utils` and is not
    /// visible here; presumably it returns the file contents as a string —
    /// confirm.
    ///
    /// [Variants]: http://www.unicode.org/reports/tr38/#N10211
    pub static ref UNIHAN_VARIANTS_DATA: VariantsData = {
        read("external/unicode/ucd/data/Unihan/Unihan_Variants.txt").parse().unwrap()
    };

    /// Regex matching one `U+XXXX` reference inside a variant field value,
    /// optionally followed by `<` and a comma-separated list of source tags.
    ///
    /// Capture group 1 is the code point as 4 to 6 hexadecimal digits, the
    /// same width accepted by `UNIHAN_DATA_ENTRY_REGEX`. The previous pattern
    /// (`2?` followed by four digits) only covered planes 0 and 2, so a
    /// reference such as `U+30EDE` was silently read as `U+30ED`.
    ///
    /// Capture group 2 is the optional additional data after `<`.
    pub static ref VALUE_REGEX: Regex = Regex::new(
        r"(?x) # extended regex syntax
        U\+([[:xdigit:]]{4,6}) # [1]codepoint
        <?( # [2]additional data
        k[[:alnum:]]+(:[TBZFJ]+)?(,k[[:alnum:]]+(:[TBZFJ]+)?)*
        )?
        ",
    ).unwrap();
}

/// The variant fields recorded for one character.
///
/// Each field is `None` until `update` is called with the matching key.
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct VariantsDataEntry {
    /// The character this entry describes.
    pub character: char,
    /// Characters listed in the `kSemanticVariant` field.
    pub semantic_variants: Option<Vec<char>>, // FIXME: handle additional data
    /// Character given by the `kSimplifiedVariant` field.
    pub simplified_variant: Option<char>,
    /// Characters listed in the `kSpecializedSemanticVariant` field.
    pub specialized_semantic_variants: Option<Vec<char>>, // FIXME: handle additional data
    /// Character given by the `kTraditionalVariant` field.
    pub traditional_variant: Option<char>,
    /// Characters listed in the `kZVariant` field.
    pub z_variants: Option<Vec<char>>, // FIXME: handle additional data
}

impl VariantsDataEntry {
    /// Returns the character named by the first `U+XXXX` reference in `str`.
    ///
    /// # Panics
    ///
    /// Panics if `str` contains no match of `VALUE_REGEX`, or if the matched
    /// code point is not a valid `char`.
    pub fn parse_value<'a>(str: &'a str) -> char {
        let first_match = VALUE_REGEX.captures(str).unwrap();
        Self::char_from_hex(&first_match[1])
    }

    /// Returns the characters named by every `U+XXXX` reference in `str`, in
    /// order of appearance. Any additional data after a reference is
    /// discarded.
    ///
    /// # Panics
    ///
    /// Panics if a matched code point is not a valid `char`.
    pub fn parse_values_with_additional_data<'a>(str: &'a str) -> Vec<char> {
        VALUE_REGEX
            .captures_iter(str)
            .map(|reference| Self::char_from_hex(&reference[1]))
            .collect()
    }

    /// Converts the hexadecimal digits of a code point into a `char`.
    fn char_from_hex(hex: &str) -> char {
        let scalar = u32::from_str_radix(hex, 16).unwrap();
        char::from_u32(scalar).unwrap()
    }
}

impl DataEntry for VariantsDataEntry {
    /// Creates an entry for `character` with every variant field unset.
    fn new(character: char) -> VariantsDataEntry {
        VariantsDataEntry {
            character,
            semantic_variants: None,
            simplified_variant: None,
            specialized_semantic_variants: None,
            traditional_variant: None,
            z_variants: None,
        }
    }

    /// Parses `value` according to `key` and stores the result in the
    /// matching field; unknown keys are ignored without parsing `value`.
    ///
    /// The simplified and traditional fields keep only the first referenced
    /// character; the other fields keep every referenced character.
    fn update<'a>(&mut self, key: &'a str, value: &'a str) {
        match key {
            "kSemanticVariant" => {
                let variants = Self::parse_values_with_additional_data(value);
                self.semantic_variants = Some(variants);
            }
            "kSimplifiedVariant" => {
                let variant = Self::parse_value(value);
                self.simplified_variant = Some(variant);
            }
            "kSpecializedSemanticVariant" => {
                let variants = Self::parse_values_with_additional_data(value);
                self.specialized_semantic_variants = Some(variants);
            }
            "kTraditionalVariant" => {
                let variant = Self::parse_value(value);
                self.traditional_variant = Some(variant);
            }
            "kZVariant" => {
                let variants = Self::parse_values_with_additional_data(value);
                self.z_variants = Some(variants);
            }
            _ => {}
        }
    }
}

/// The collection of variant entries parsed from Unihan data text.
#[derive(Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct VariantsData {
    /// The parsed entries, in the order returned by `parse_entries_from_str`.
    pub entries: Box<[VariantsDataEntry]>,
}

impl FromStr for VariantsData {
    type Err = ();

    /// Builds the data set from the text of a Unihan data file.
    ///
    /// This never returns `Err`.
    fn from_str(str: &str) -> Result<Self, Self::Err> {
        let entries = parse_entries_from_str(str).into_boxed_slice();
        Ok(Self { entries })
    }
}

#[cfg(test)]
mod test {
    use super::VariantsDataEntry;

    /// A bare `U+XXXX` reference parses to that character.
    #[test]
    fn value_parse() {
        let parsed = VariantsDataEntry::parse_value("U+54A8");
        assert_eq!(parsed, '\u{54A8}')
    }

    /// References followed by `<source,...` data yield only the characters.
    #[test]
    fn value_parse_with_additional_data() {
        let expected = vec!['\u{54A8}', '\u{8AEE}'];
        let parsed = VariantsDataEntry::parse_values_with_additional_data(
            "U+54A8<kMatthews:T,kMeyerWempe U+8AEE<kMatthews,kMeyerWempe",
        );
        assert_eq!(parsed, expected)
    }
}
Loading

0 comments on commit fa97f5c

Please sign in to comment.