-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
38 changed files
with
200,877 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
// Copyright 2017 The UNIC Project Developers. | ||
// | ||
// See the COPYRIGHT file at the top-level directory of this distribution. | ||
// | ||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | ||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | ||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | ||
// option. This file may not be copied, modified, or distributed | ||
// except according to those terms. | ||
|
||
pub mod numeric_values; | ||
pub mod readings; | ||
pub mod variants; | ||
|
||
use std::char; | ||
use std::collections::BTreeMap; | ||
|
||
use regex::Regex; | ||
|
||
lazy_static! { | ||
pub static ref UNIHAN_DATA_ENTRY_REGEX: Regex = Regex::new( | ||
r"(?xm)^ # every line | ||
U\+([[:xdigit:]]{4,6}) # [1]codepoint | ||
\t # separator | ||
(k[a-zA-Z0-9_]+) # [2]field key | ||
\t # separator | ||
(.*) # [3]field value | ||
", | ||
) | ||
.unwrap(); | ||
} | ||
|
||
/// A per-character record that is built up one Unihan field at a time.
pub trait DataEntry {
    /// Creates an entry for `character`.
    fn new(character: char) -> Self;

    /// Applies a single field to this entry.
    ///
    /// `key` is the field key and `value` is the raw field value text.
    fn update<'a>(&mut self, key: &'a str, value: &'a str);
}
|
||
pub fn parse_entries_from_str<T>(str: &str) -> Vec<T> | ||
where | ||
T: DataEntry + Clone, | ||
{ | ||
let mut entry_map: BTreeMap<char, T> = BTreeMap::default(); | ||
|
||
for capture in UNIHAN_DATA_ENTRY_REGEX.captures_iter(str) { | ||
let code_point = u32::from_str_radix(&capture[1], 16).unwrap(); | ||
let chr = char::from_u32(code_point).unwrap(); | ||
|
||
let key = &capture[2]; | ||
let value = &capture[3]; | ||
|
||
match entry_map.get(&chr) { | ||
None => { | ||
let mut entry = T::new(chr); | ||
entry.update(key, value); | ||
entry_map.insert(chr, entry); | ||
} | ||
Some(_) => { | ||
let entry = entry_map.get_mut(&chr).unwrap(); | ||
entry.update(key, value); | ||
} | ||
} | ||
} | ||
|
||
entry_map.values().cloned().collect::<Vec<T>>() | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
// Copyright 2017 The UNIC Project Developers. | ||
// | ||
// See the COPYRIGHT file at the top-level directory of this distribution. | ||
// | ||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | ||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | ||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | ||
// option. This file may not be copied, modified, or distributed | ||
// except according to those terms. | ||
|
||
use std::str::FromStr; | ||
|
||
use crate::source::utils::read; | ||
|
||
use super::{parse_entries_from_str, DataEntry}; | ||
|
||
lazy_static! { | ||
/// [Numeric values]: http://www.unicode.org/reports/tr38/#N1024D | ||
pub static ref UNIHAN_NUMERIC_VALUES_DATA: NumericValuesData = { | ||
read("external/unicode/ucd/data/Unihan/Unihan_NumericValues.txt").parse().unwrap() | ||
}; | ||
} | ||
|
||
/// Numeric-value fields collected for a single character.
///
/// Field order is significant: the derived ordering and hashing follow it.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct NumericValuesDataEntry {
    /// The character this entry describes.
    pub character: char,
    /// Value of the `kAccountingNumeric` field, if present and parseable.
    pub accounting_numeric: Option<u64>,
    /// Value of the `kOtherNumeric` field, if present and parseable.
    pub other_numeric: Option<u64>,
    /// Value of the `kPrimaryNumeric` field, if present and parseable.
    pub primary_numeric: Option<u64>,
}
|
||
impl DataEntry for NumericValuesDataEntry { | ||
fn new(character: char) -> NumericValuesDataEntry { | ||
NumericValuesDataEntry { | ||
character: character, | ||
accounting_numeric: None, | ||
other_numeric: None, | ||
primary_numeric: None, | ||
} | ||
} | ||
|
||
fn update<'a>(&mut self, key: &'a str, value: &'a str) { | ||
match key { | ||
"kAccountingNumeric" => self.accounting_numeric = value.parse::<u64>().ok(), | ||
"kOtherNumeric" => self.other_numeric = value.parse::<u64>().ok(), | ||
"kPrimaryNumeric" => self.primary_numeric = value.parse::<u64>().ok(), | ||
_ => {} | ||
} | ||
} | ||
} | ||
|
||
#[derive(Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] | ||
pub struct NumericValuesData { | ||
pub entries: Box<[NumericValuesDataEntry]>, | ||
} | ||
|
||
impl FromStr for NumericValuesData { | ||
type Err = (); | ||
|
||
fn from_str(str: &str) -> Result<Self, Self::Err> { | ||
Ok(NumericValuesData { | ||
entries: parse_entries_from_str(str).into_boxed_slice(), | ||
}) | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod test {
    use super::super::DataEntry;
    use super::{NumericValuesData, NumericValuesDataEntry};

    /// Parses three lines, one per numeric field, and checks the resulting entries.
    #[test]
    fn data_entry_parse() {
        let mut entry1 = NumericValuesDataEntry::new('\u{3405}');
        entry1.other_numeric = Some(5);

        let mut entry2 = NumericValuesDataEntry::new('\u{4EDF}');
        entry2.accounting_numeric = Some(1000);

        let mut entry3 = NumericValuesDataEntry::new('\u{5146}');
        entry3.primary_numeric = Some(1000000000000);

        let entries = vec![entry1, entry2, entry3];

        // The line regex requires tab separators; they are written as explicit
        // `\t` escapes so that whitespace conversion cannot silently break the fixture.
        assert_eq!(
            "U+3405\tkOtherNumeric\t5\n\
             U+4EDF\tkAccountingNumeric\t1000\n\
             U+5146\tkPrimaryNumeric\t1000000000000\n\
             "
            .parse(),
            Ok(NumericValuesData {
                entries: entries.into_boxed_slice(),
            }),
        );
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
// Copyright 2017 The UNIC Project Developers. | ||
// | ||
// See the COPYRIGHT file at the top-level directory of this distribution. | ||
// | ||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | ||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | ||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | ||
// option. This file may not be copied, modified, or distributed | ||
// except according to those terms. | ||
|
||
use std::str::FromStr; | ||
|
||
use crate::source::utils::read; | ||
|
||
use super::{parse_entries_from_str, DataEntry}; | ||
|
||
lazy_static! { | ||
/// [Readings]: http://www.unicode.org/reports/tr38/#N1019C | ||
pub static ref UNIHAN_READINGS_DATA: ReadingsData = { | ||
read("external/unicode/ucd/data/Unihan/Unihan_Readings.txt").parse().unwrap() | ||
}; | ||
} | ||
|
||
/// Reading fields collected for a single character.
///
/// Each optional field holds the raw text of the corresponding Unihan field.
/// Field order is significant: the derived ordering and hashing follow it.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct ReadingsDataEntry {
    /// The character this entry describes.
    pub character: char,
    /// Raw value of `kCantonese`.
    pub cantonese: Option<String>,
    /// Raw value of `kDefinition`.
    pub definition: Option<String>,
    /// Raw value of `kHangul`.
    pub hangul: Option<String>,
    /// Raw value of `kHanyuPinlu`.
    pub hanyu_pinlu: Option<String>,
    /// Raw value of `kHanyuPinyin`.
    pub hanyu_pinyin: Option<String>,
    /// Raw value of `kJapaneseKun`.
    pub japanese_kun: Option<String>,
    /// Raw value of `kJapaneseOn`.
    pub japanese_on: Option<String>,
    /// Raw value of `kKorean`.
    pub korean: Option<String>,
    /// Raw value of `kMandarin`.
    pub mandarin: Option<String>,
    /// Raw value of `kTang`.
    pub tang: Option<String>,
    /// Raw value of `kVietnamese`.
    pub vietnamese: Option<String>,
    /// Raw value of `kXHC1983`.
    pub xhc_1983: Option<String>,
}
|
||
impl DataEntry for ReadingsDataEntry { | ||
fn new(character: char) -> ReadingsDataEntry { | ||
ReadingsDataEntry { | ||
character: character, | ||
cantonese: None, | ||
definition: None, | ||
hangul: None, | ||
hanyu_pinlu: None, | ||
hanyu_pinyin: None, | ||
japanese_kun: None, | ||
japanese_on: None, | ||
korean: None, | ||
mandarin: None, | ||
tang: None, | ||
vietnamese: None, | ||
xhc_1983: None, | ||
} | ||
} | ||
|
||
fn update<'a>(&mut self, key: &'a str, value: &'a str) { | ||
match key { | ||
"kCantonese" => self.cantonese = Some(value.to_owned()), | ||
"kDefinition" => self.definition = Some(value.to_owned()), | ||
"kHangul" => self.hangul = Some(value.to_owned()), | ||
"kHanyuPinlu" => self.hanyu_pinlu = Some(value.to_owned()), | ||
"kHanyuPinyin" => self.hanyu_pinyin = Some(value.to_owned()), | ||
"kJapaneseKun" => self.japanese_kun = Some(value.to_owned()), | ||
"kJapaneseOn" => self.japanese_on = Some(value.to_owned()), | ||
"kKorean" => self.korean = Some(value.to_owned()), | ||
"kMandarin" => self.mandarin = Some(value.to_owned()), | ||
"kTang" => self.tang = Some(value.to_owned()), | ||
"kVietnamese" => self.vietnamese = Some(value.to_owned()), | ||
"kXHC1983" => self.xhc_1983 = Some(value.to_owned()), | ||
_ => {} | ||
} | ||
} | ||
} | ||
|
||
#[derive(Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] | ||
pub struct ReadingsData { | ||
pub entries: Box<[ReadingsDataEntry]>, | ||
} | ||
|
||
impl FromStr for ReadingsData { | ||
type Err = (); | ||
|
||
fn from_str(str: &str) -> Result<Self, Self::Err> { | ||
Ok(ReadingsData { | ||
entries: parse_entries_from_str(str).into_boxed_slice(), | ||
}) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
// Copyright 2017 The UNIC Project Developers. | ||
// | ||
// See the COPYRIGHT file at the top-level directory of this distribution. | ||
// | ||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | ||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | ||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | ||
// option. This file may not be copied, modified, or distributed | ||
// except according to those terms. | ||
|
||
use std::char; | ||
use std::str::FromStr; | ||
|
||
use regex::Regex; | ||
|
||
use crate::source::utils::read; | ||
|
||
use super::{parse_entries_from_str, DataEntry}; | ||
|
||
lazy_static! { | ||
/// [Variants]: http://www.unicode.org/reports/tr38/#N10211 | ||
pub static ref UNIHAN_VARIANTS_DATA: VariantsData = { | ||
read("external/unicode/ucd/data/Unihan/Unihan_Variants.txt").parse().unwrap() | ||
}; | ||
|
||
pub static ref VALUE_REGEX: Regex = Regex::new( | ||
r"(?x) # extended regex syntax | ||
U\+(2?[[:xdigit:]]{4}) # [1]codepoint | ||
<?( # [2]additional data | ||
k[[:alnum:]]+(:[TBZFJ]+)?(,k[[:alnum:]]+(:[TBZFJ]+)?)* | ||
)? | ||
", | ||
).unwrap(); | ||
} | ||
|
||
/// Variant fields collected for a single character.
///
/// Field order is significant: the derived ordering and hashing follow it.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct VariantsDataEntry {
    /// The character this entry describes.
    pub character: char,
    /// Characters listed in `kSemanticVariant`.
    pub semantic_variants: Option<Vec<char>>, // FIXME: handle additional data
    /// Character given by `kSimplifiedVariant`.
    pub simplified_variant: Option<char>,
    /// Characters listed in `kSpecializedSemanticVariant`.
    pub specialized_semantic_variants: Option<Vec<char>>, // FIXME: handle additional data
    /// Character given by `kTraditionalVariant`.
    pub traditional_variant: Option<char>,
    /// Characters listed in `kZVariant`.
    pub z_variants: Option<Vec<char>>, // FIXME: handle additional data
}
|
||
impl VariantsDataEntry { | ||
pub fn parse_value<'a>(str: &'a str) -> char { | ||
let capture = VALUE_REGEX.captures(str).unwrap(); | ||
let code_point = u32::from_str_radix(&capture[1], 16).unwrap(); | ||
char::from_u32(code_point).unwrap() | ||
} | ||
|
||
pub fn parse_values_with_additional_data<'a>(str: &'a str) -> Vec<char> { | ||
let mut chars = vec![]; | ||
for capture in VALUE_REGEX.captures_iter(str) { | ||
let code_point = u32::from_str_radix(&capture[1], 16).unwrap(); | ||
let chr = char::from_u32(code_point).unwrap(); | ||
chars.push(chr); | ||
} | ||
chars | ||
} | ||
} | ||
|
||
impl DataEntry for VariantsDataEntry { | ||
fn new(character: char) -> VariantsDataEntry { | ||
VariantsDataEntry { | ||
character: character, | ||
semantic_variants: None, | ||
simplified_variant: None, | ||
specialized_semantic_variants: None, | ||
traditional_variant: None, | ||
z_variants: None, | ||
} | ||
} | ||
|
||
fn update<'a>(&mut self, key: &'a str, value: &'a str) { | ||
match key { | ||
"kSemanticVariant" => { | ||
self.semantic_variants = | ||
Some(VariantsDataEntry::parse_values_with_additional_data(value)) | ||
} | ||
"kSimplifiedVariant" => { | ||
self.simplified_variant = Some(VariantsDataEntry::parse_value(value)) | ||
} | ||
"kSpecializedSemanticVariant" => { | ||
self.specialized_semantic_variants = | ||
Some(VariantsDataEntry::parse_values_with_additional_data(value)) | ||
} | ||
"kTraditionalVariant" => { | ||
self.traditional_variant = Some(VariantsDataEntry::parse_value(value)) | ||
} | ||
"kZVariant" => { | ||
self.z_variants = Some(VariantsDataEntry::parse_values_with_additional_data(value)) | ||
} | ||
_ => {} | ||
} | ||
} | ||
} | ||
|
||
#[derive(Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] | ||
pub struct VariantsData { | ||
pub entries: Box<[VariantsDataEntry]>, | ||
} | ||
|
||
impl FromStr for VariantsData { | ||
type Err = (); | ||
|
||
fn from_str(str: &str) -> Result<Self, Self::Err> { | ||
Ok(VariantsData { | ||
entries: parse_entries_from_str(str).into_boxed_slice(), | ||
}) | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod test {
    use super::VariantsDataEntry;

    /// A bare `U+XXXX` value yields that character.
    #[test]
    fn value_parse() {
        assert_eq!(VariantsDataEntry::parse_value("U+54A8"), '\u{54A8}');
    }

    /// Several values carrying additional data yield their characters in order.
    #[test]
    fn value_parse_with_additional_data() {
        let parsed = VariantsDataEntry::parse_values_with_additional_data(
            "U+54A8<kMatthews:T,kMeyerWempe U+8AEE<kMatthews,kMeyerWempe",
        );
        let expected = vec!['\u{54A8}', '\u{8AEE}'];
        assert_eq!(parsed, expected);
    }
}
Oops, something went wrong.