Skip to content

Commit

Permalink
chore(value): Use simdutf8 for unicode validation
Browse files Browse the repository at this point in the history
  • Loading branch information
JakubOnderka committed Feb 3, 2025
1 parent 1a45333 commit c818790
Show file tree
Hide file tree
Showing 6 changed files with 26 additions and 11 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ default = ["compiler", "value", "diagnostic", "path", "parser", "stdlib", "datad

# Main features (on by default)
compiler = ["diagnostic", "path", "parser", "value", "dep:paste", "dep:chrono", "dep:serde", "dep:regex", "dep:bytes", "dep:ordered-float", "dep:chrono-tz", "dep:snafu", "dep:thiserror", "dep:dyn-clone", "dep:indoc", "dep:thiserror", "dep:lalrpop-util"]
value = ["path", "dep:bytes", "dep:regex", "dep:ordered-float", "dep:chrono", "dep:serde_json"]
value = ["path", "dep:bytes", "dep:regex", "dep:ordered-float", "dep:chrono", "dep:serde_json", "dep:simdutf8"]
diagnostic = ["dep:codespan-reporting", "dep:termcolor"]
path = ["value", "dep:once_cell", "dep:serde", "dep:snafu", "dep:regex"]
parser = ["path", "diagnostic", "value", "dep:thiserror", "dep:ordered-float", "dep:lalrpop-util"]
Expand Down Expand Up @@ -182,6 +182,7 @@ rust_decimal = { version = "1", optional = true }
seahash = { version = "4", optional = true }
serde = { version = "1", features = ["derive"], optional = true }
serde_json = { version = "1", default-features = false, optional = true, features = ["std", "raw_value"] }
simdutf8 = { version = "0.1.5", optional = true }
fancy-regex = { version = "0.14.0", default-features = false, optional = true }
sha-1 = { version = "0.10", optional = true }
sha-2 = { package = "sha2", version = "0.10", optional = true }
Expand Down
11 changes: 4 additions & 7 deletions src/compiler/value/convert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,13 +83,10 @@ impl VrlValueConvert for Value {
}

/// Returns this value as a lossily decoded UTF-8 string.
///
/// # Errors
///
/// Yields `ValueError::Expected` (with `expected` set to `Kind::bytes()` and `got` set to
/// `self.kind()`) when the value cannot be viewed as bytes.
fn try_bytes_utf8_lossy(&self) -> Result<Cow<'_, str>, ValueError> {
// Diff view: the `match` below appears to be the body removed by this commit
// (the hunk header reports 7 deletions).
match self.as_bytes() {
Some(bytes) => Ok(String::from_utf8_lossy(bytes)),
None => Err(ValueError::Expected {
got: self.kind(),
expected: Kind::bytes(),
}),
}
// Diff view: the expression below appears to be the replacement body (4 additions),
// which delegates the byte-to-string conversion to `as_str()`.
// NOTE(review): `ok_or` evaluates its argument eagerly, so `self.kind()` and
// `Kind::bytes()` are computed even when `as_str()` returns `Some`; the removed `match`
// only built the error on the `None` path. Consider `ok_or_else(|| ValueError::Expected { .. })`.
self.as_str().ok_or(ValueError::Expected {
got: self.kind(),
expected: Kind::bytes(),
})
}

fn try_boolean(self) -> Result<bool, ValueError> {
Expand Down
16 changes: 16 additions & 0 deletions src/value/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,28 @@ pub mod value;
mod btreemap;
mod keystring;

use std::borrow::Cow;
pub use kind::Kind;

pub use self::keystring::KeyString;
pub use self::secrets::Secrets;
pub use self::value::{ObjectMap, Value, ValueRegex};

/// Decodes `v` as UTF-8, substituting U+FFFD for every invalid sequence.
///
/// The input is first checked with `simdutf8::basic::from_utf8`. When that check passes the
/// bytes are returned as a borrowed `&str` without allocating. Otherwise the slice is walked
/// chunk by chunk and an owned `String` is assembled, with each invalid run replaced by a
/// single replacement character.
pub(crate) fn simdutf_bytes_utf8_lossy(v: &[u8]) -> Cow<'_, str> {
    const REPLACEMENT: &str = "\u{FFFD}";

    // Fast path: the whole slice is valid UTF-8, so it can be borrowed as-is.
    if let Ok(valid) = simdutf8::basic::from_utf8(v) {
        return Cow::Borrowed(valid);
    }

    // Slow path: copy the valid pieces and patch over the invalid ones.
    let mut lossy = String::with_capacity(v.len());
    for piece in v.utf8_chunks() {
        lossy.push_str(piece.valid());
        // An empty `invalid()` means this chunk ended at the end of the input
        // rather than at a decoding error, so nothing needs replacing.
        if !piece.invalid().is_empty() {
            lossy.push_str(REPLACEMENT);
        }
    }
    Cow::Owned(lossy)
}

/// A macro to easily generate Values
#[macro_export]
macro_rules! value {
Expand Down
4 changes: 2 additions & 2 deletions src/value/value/convert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use bytes::Bytes;
use chrono::{DateTime, Utc};
use ordered_float::NotNan;
use regex::Regex;

use crate::value::simdutf_bytes_utf8_lossy;
use crate::value::value::regex::ValueRegex;

use super::super::{KeyString, Kind, ObjectMap, Value};
Expand Down Expand Up @@ -131,7 +131,7 @@ impl Value {
/// Returns self as `Cow<str>`, only if self is `Value::Bytes`
///
/// The bytes are decoded lossily, so the result may be an owned string rather than a
/// borrow of the underlying bytes. Returns `None` when `as_bytes()` returns `None`.
pub fn as_str(&self) -> Option<Cow<'_, str>> {
self.as_bytes()
// Diff view: this appears to be the line removed by the commit (standard-library decoding).
.map(|bytes| String::from_utf8_lossy(bytes.as_ref()))
// Diff view: this appears to be its replacement, routing through the crate's
// simdutf8-backed helper `simdutf_bytes_utf8_lossy`.
.map(|bytes| simdutf_bytes_utf8_lossy(bytes.as_ref()))
}

/// Converts the Value into a byte representation regardless of its original type.
Expand Down
2 changes: 1 addition & 1 deletion src/value/value/serde.rs
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ impl TryInto<serde_json::Value> for Value {
Self::Boolean(v) => Ok(serde_json::Value::from(v)),
Self::Integer(v) => Ok(serde_json::Value::from(v)),
Self::Float(v) => Ok(serde_json::Value::from(v.into_inner())),
Self::Bytes(v) => Ok(serde_json::Value::from(String::from_utf8(v.to_vec())?)),
Self::Bytes(v) => Ok(serde_json::Value::from(simdutf8::compat::from_utf8(&v)?)),
Self::Regex(regex) => Ok(serde_json::Value::from(regex.as_str().to_string())),
Self::Object(v) => Ok(serde_json::to_value(v)?),
Self::Array(v) => Ok(serde_json::to_value(v)?),
Expand Down

0 comments on commit c818790

Please sign in to comment.