performance(stdlib,value): Faster JSON parsing #1249

Open

JakubOnderka wants to merge 3 commits into main
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

3 changes: 2 additions & 1 deletion Cargo.toml
@@ -26,7 +26,7 @@ default = ["compiler", "value", "diagnostic", "path", "parser", "stdlib", "datad

# Main features (on by default)
compiler = ["diagnostic", "path", "parser", "value", "dep:paste", "dep:chrono", "dep:serde", "dep:regex", "dep:bytes", "dep:ordered-float", "dep:chrono-tz", "dep:snafu", "dep:thiserror", "dep:dyn-clone", "dep:indoc", "dep:thiserror", "dep:lalrpop-util"]
value = ["path", "dep:bytes", "dep:regex", "dep:ordered-float", "dep:chrono", "dep:serde_json"]
value = ["path", "dep:bytes", "dep:regex", "dep:ordered-float", "dep:chrono", "dep:serde_json", "dep:simdutf8"]
diagnostic = ["dep:codespan-reporting", "dep:termcolor"]
path = ["value", "dep:once_cell", "dep:serde", "dep:snafu", "dep:regex"]
parser = ["path", "diagnostic", "value", "dep:thiserror", "dep:ordered-float", "dep:lalrpop-util"]
@@ -182,6 +182,7 @@ rust_decimal = { version = "1", optional = true }
seahash = { version = "4", optional = true }
serde = { version = "1", features = ["derive"], optional = true }
serde_json = { version = "1", default-features = false, optional = true, features = ["std", "raw_value"] }
simdutf8 = { version = "0.1.5", optional = true }
fancy-regex = { version = "0.14.0", default-features = false, optional = true }
sha-1 = { version = "0.10", optional = true }
sha-2 = { package = "sha2", version = "0.10", optional = true }
6 changes: 6 additions & 0 deletions changelog.d/1249.feature.md
@@ -0,0 +1,6 @@
Faster conversion of bytes to Unicode strings using SIMD instructions provided by the `simdutf8` crate.
`simdutf8` is up to 23 times faster than the standard library on valid non-ASCII and up to four times
faster on pure ASCII; for invalid UTF-8 the fallback behaves the same as the lossy conversion in Rust's
standard library. This will speed up almost all VRL functions that work with strings, such as `parse_json` or `parse_regex`.

authors: JakubOnderka
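
For context (not part of the diff): a minimal sketch of the validate-then-fall-back pattern this changelog describes. The helper name `bytes_to_str_lossy` is chosen here for illustration, and the fallback reuses `String::from_utf8_lossy` for brevity, whereas the PR's actual fallback hand-rolls the replacement loop over `utf8_chunks()`.

```rust
use std::borrow::Cow;

/// Illustrative helper (hypothetical name): validate UTF-8 with the
/// SIMD-accelerated `simdutf8` crate and only fall back to a lossy,
/// allocating conversion when the input contains invalid sequences.
fn bytes_to_str_lossy(v: &[u8]) -> Cow<'_, str> {
    match simdutf8::basic::from_utf8(v) {
        // Valid UTF-8: borrow the input, no copy, no allocation.
        Ok(s) => Cow::Borrowed(s),
        // Invalid UTF-8: replace invalid sequences with U+FFFD, matching the
        // behavior of the standard library's lossy conversion.
        Err(_) => Cow::Owned(String::from_utf8_lossy(v).into_owned()),
    }
}

fn main() {
    assert_eq!(bytes_to_str_lossy("café".as_bytes()), "café");
    assert_eq!(bytes_to_str_lossy(b"caf\xff"), "caf\u{FFFD}");
}
```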
11 changes: 4 additions & 7 deletions src/compiler/value/convert.rs
@@ -83,13 +83,10 @@ impl VrlValueConvert for Value {
}

fn try_bytes_utf8_lossy(&self) -> Result<Cow<'_, str>, ValueError> {
match self.as_bytes() {
Some(bytes) => Ok(String::from_utf8_lossy(bytes)),
None => Err(ValueError::Expected {
got: self.kind(),
expected: Kind::bytes(),
}),
}
self.as_str().ok_or(ValueError::Expected {
got: self.kind(),
expected: Kind::bytes(),
})
}

fn try_boolean(self) -> Result<bool, ValueError> {
4 changes: 2 additions & 2 deletions src/datadog/grok/matchers/date.rs
@@ -255,9 +255,9 @@ pub fn time_format_to_regex(format: &str, with_captures: bool) -> Result<RegexRe
}

pub fn apply_date_filter(value: &Value, filter: &DateFilter) -> Result<Value, InternalError> {
let original_value = String::from_utf8_lossy(value.as_bytes().ok_or_else(|| {
let original_value = value.as_str().ok_or_else(|| {
InternalError::FailedToApplyFilter(filter.to_string(), value.to_string())
})?);
})?;
let (strp_format, mut datetime) =
adjust_strp_format_and_value(&filter.strp_format, &original_value);

3 changes: 1 addition & 2 deletions src/stdlib/format_timestamp.rs
@@ -8,8 +8,7 @@ use chrono::{
fn format_timestamp_with_tz(ts: Value, format: Value, timezone: Option<Value>) -> Resolved {
let ts: DateTime<Utc> = ts.try_timestamp()?;

let format_bytes = format.try_bytes()?;
let format = String::from_utf8_lossy(&format_bytes);
let format = format.try_bytes_utf8_lossy()?;

let timezone_bytes = timezone.map(VrlValueConvert::try_bytes).transpose()?;
let timezone = timezone_bytes.as_ref().map(|b| String::from_utf8_lossy(b));
3 changes: 1 addition & 2 deletions src/stdlib/parse_bytes.rs
@@ -12,8 +12,7 @@ fn parse_bytes(bytes: Value, unit: Value, base: &Bytes) -> Resolved {
b"10" => (&*DEC_UNITS, Config::new().with_decimal()),
_ => unreachable!("enum invariant"),
};
let bytes = bytes.try_bytes()?;
let value = String::from_utf8_lossy(&bytes);
let value = bytes.try_bytes_utf8_lossy()?;
let value: &str = value.as_ref();
let conversion_factor = {
let bytes = unit.try_bytes()?;
6 changes: 2 additions & 4 deletions src/stdlib/parse_duration.rs
@@ -5,12 +5,10 @@ use rust_decimal::{prelude::ToPrimitive, Decimal};
use std::{collections::HashMap, str::FromStr};

fn parse_duration(bytes: Value, unit: Value) -> Resolved {
let bytes = bytes.try_bytes()?;
let value = String::from_utf8_lossy(&bytes);
let value = bytes.try_bytes_utf8_lossy()?;
let mut value = &value[..];
let conversion_factor = {
let bytes = unit.try_bytes()?;
let string = String::from_utf8_lossy(&bytes);
let string = unit.try_bytes_utf8_lossy()?;

UNITS
.get(string.as_ref())
12 changes: 5 additions & 7 deletions src/stdlib/parse_json.rs
@@ -10,14 +10,12 @@ use crate::stdlib::json_utils::json_type_def::json_type_def;

fn parse_json(value: Value, lossy: Option<Value>) -> Resolved {
let lossy = lossy.map(Value::try_boolean).transpose()?.unwrap_or(true);
let bytes = if lossy {
value.try_bytes_utf8_lossy()?.into_owned().into()
Ok(if lossy {
serde_json::from_str(&value.try_bytes_utf8_lossy()?)
} else {
value.try_bytes()?
};
let value = serde_json::from_slice::<'_, Value>(&bytes)
.map_err(|e| format!("unable to parse json: {e}"))?;
Ok(value)
serde_json::from_slice(&value.try_bytes()?)
}
.map_err(|e| format!("unable to parse json: {e}"))?)
}

// parse_json_with_depth method recursively traverses the value and returns raw JSON-formatted bytes
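Side note on why this hunk helps (a sketch, not from the PR): in the lossy path the old code copied the converted string into owned bytes before parsing, while the new code hands the `Cow<'_, str>` straight to `serde_json::from_str`, so valid UTF-8 input is parsed without an extra allocation. A minimal standalone comparison, using plain byte slices and `String::from_utf8_lossy` as stand-ins for VRL's `Value` and its lossy conversion:

```rust
use std::borrow::Cow;

// Old shape of the lossy path: materialize an owned copy of the converted
// string, then parse that copy as a byte slice.
fn parse_lossy_old(input: &[u8]) -> Result<serde_json::Value, serde_json::Error> {
    let owned: Vec<u8> = String::from_utf8_lossy(input).into_owned().into_bytes();
    serde_json::from_slice(&owned)
}

// New shape of the lossy path: parse straight from the Cow; for valid UTF-8
// the Cow is borrowed, so no intermediate copy is made.
fn parse_lossy_new(input: &[u8]) -> Result<serde_json::Value, serde_json::Error> {
    let s: Cow<'_, str> = String::from_utf8_lossy(input);
    serde_json::from_str(&s)
}

fn main() {
    let input = br#"{"message": "hello", "count": 3}"#;
    assert_eq!(parse_lossy_old(input).unwrap(), parse_lossy_new(input).unwrap());
}
```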
7 changes: 3 additions & 4 deletions src/stdlib/parse_regex.rs
@@ -3,9 +3,8 @@ use regex::Regex;

use super::util;

fn parse_regex(value: Value, numeric_groups: bool, pattern: &Regex) -> Resolved {
let bytes = value.try_bytes()?;
let value = String::from_utf8_lossy(&bytes);
fn parse_regex(value: &Value, numeric_groups: bool, pattern: &Regex) -> Resolved {
let value = value.try_bytes_utf8_lossy()?;
let parsed = pattern
.captures(&value)
.map(|capture| util::capture_regex_to_map(pattern, &capture, numeric_groups))
@@ -109,7 +108,7 @@ impl FunctionExpression for ParseRegexFn {
let numeric_groups = self.numeric_groups.resolve(ctx)?;
let pattern = &self.pattern;

parse_regex(value, numeric_groups.try_boolean()?, pattern)
parse_regex(&value, numeric_groups.try_boolean()?, pattern)
}

fn type_def(&self, _: &state::TypeState) -> TypeDef {
3 changes: 1 addition & 2 deletions src/stdlib/parse_regex_all.rs
@@ -5,8 +5,7 @@ use crate::compiler::prelude::*;
use super::util;

fn parse_regex_all(value: Value, numeric_groups: bool, pattern: &Regex) -> Resolved {
let bytes = value.try_bytes()?;
let value = String::from_utf8_lossy(&bytes);
let value = value.try_bytes_utf8_lossy()?;
Ok(pattern
.captures_iter(&value)
.map(|capture| util::capture_regex_to_map(pattern, &capture, numeric_groups).into())
8 changes: 4 additions & 4 deletions src/stdlib/strlen.rs
@@ -1,9 +1,9 @@
use crate::compiler::prelude::*;

fn strlen(value: Value) -> Resolved {
let v = value.try_bytes()?;
fn strlen(value: &Value) -> Resolved {
let v = value.try_bytes_utf8_lossy()?;

Ok(String::from_utf8_lossy(&v).chars().count().into())
Ok(v.chars().count().into())
}

#[derive(Clone, Copy, Debug)]
@@ -51,7 +51,7 @@ impl FunctionExpression for StrlenFn {
fn resolve(&self, ctx: &mut Context) -> Resolved {
let value = self.value.resolve(ctx)?;

strlen(value)
strlen(&value)
}

fn type_def(&self, _state: &state::TypeState) -> TypeDef {
24 changes: 22 additions & 2 deletions src/value/value.rs
@@ -2,11 +2,11 @@

use bytes::{Bytes, BytesMut};
use chrono::{DateTime, SecondsFormat, Utc};
pub use iter::{IterItem, ValueIter};
use ordered_float::NotNan;
use std::borrow::Cow;
use std::{cmp::Ordering, collections::BTreeMap};

pub use iter::{IterItem, ValueIter};

pub use super::value::regex::ValueRegex;
use super::KeyString;
use crate::path::ValuePath;
@@ -191,6 +191,26 @@ impl PartialOrd for Value {
}
}

/// Converts a slice of bytes to a string, including invalid characters.
#[must_use]
pub fn simdutf_bytes_utf8_lossy(v: &[u8]) -> Cow<'_, str> {
simdutf8::basic::from_utf8(v).map_or_else(
Member:
One question that just came up, did you consider encoding_rs?

Author:
encoding_rs provides SIMD-accelerated decoding only on nightly Rust. See https://github.com/hsivonen/encoding_rs?tab=readme-ov-file#simd-accel
|_| {
const REPLACEMENT: &str = "\u{FFFD}";

let mut res = String::with_capacity(v.len());
for chunk in v.utf8_chunks() {
res.push_str(chunk.valid());
if !chunk.invalid().is_empty() {
res.push_str(REPLACEMENT);
}
}
Cow::Owned(res)
},
Cow::Borrowed,
)
}

/// Converts a timestamp to a `String`.
#[must_use]
pub fn timestamp_to_string(timestamp: &DateTime<Utc>) -> String {
6 changes: 3 additions & 3 deletions src/value/value/convert.rs
@@ -1,13 +1,13 @@
use std::borrow::Cow;
use std::sync::Arc;

use crate::value::value::regex::ValueRegex;
use crate::value::value::simdutf_bytes_utf8_lossy;
use bytes::Bytes;
use chrono::{DateTime, Utc};
use ordered_float::NotNan;
use regex::Regex;

use crate::value::value::regex::ValueRegex;

use super::super::{KeyString, Kind, ObjectMap, Value};

impl Value {
@@ -131,7 +131,7 @@ impl Value {
/// Returns self as `Cow<str>`, only if self is `Value::Bytes`
pub fn as_str(&self) -> Option<Cow<'_, str>> {
self.as_bytes()
.map(|bytes| String::from_utf8_lossy(bytes.as_ref()))
.map(|bytes| simdutf_bytes_utf8_lossy(bytes.as_ref()))
}

/// Converts the Value into a byte representation regardless of its original type.
9 changes: 4 additions & 5 deletions src/value/value/serde.rs
@@ -1,13 +1,12 @@
use std::{borrow::Cow, collections::BTreeMap, fmt};

use crate::value::value::{simdutf_bytes_utf8_lossy, timestamp_to_string, StdError, Value};
use bytes::Bytes;
use ordered_float::NotNan;
use serde::de::Error as SerdeError;
use serde::de::{MapAccess, SeqAccess, Visitor};
use serde::{Deserialize, Serialize, Serializer};

use crate::value::value::{timestamp_to_string, StdError, Value};

impl Value {
/// Converts self into a `Bytes`, using JSON for Map/Array.
///
@@ -37,7 +36,7 @@ impl Value {
/// If map or array serialization fails.
pub fn to_string_lossy(&self) -> Cow<'_, str> {
match self {
Self::Bytes(bytes) => String::from_utf8_lossy(bytes),
Self::Bytes(bytes) => simdutf_bytes_utf8_lossy(bytes),
Self::Regex(regex) => regex.as_str().into(),
Self::Timestamp(timestamp) => timestamp_to_string(timestamp).into(),
Self::Integer(num) => num.to_string().into(),
@@ -63,7 +62,7 @@ impl Serialize for Value {
Self::Integer(i) => serializer.serialize_i64(*i),
Self::Float(f) => serializer.serialize_f64(f.into_inner()),
Self::Boolean(b) => serializer.serialize_bool(*b),
Self::Bytes(b) => serializer.serialize_str(String::from_utf8_lossy(b).as_ref()),
Self::Bytes(b) => serializer.serialize_str(simdutf_bytes_utf8_lossy(b).as_ref()),
Self::Timestamp(ts) => serializer.serialize_str(&timestamp_to_string(ts)),
Self::Regex(regex) => serializer.serialize_str(regex.as_str()),
Self::Object(m) => serializer.collect_map(m),
@@ -227,7 +226,7 @@ impl TryInto<serde_json::Value> for Value {
Self::Boolean(v) => Ok(serde_json::Value::from(v)),
Self::Integer(v) => Ok(serde_json::Value::from(v)),
Self::Float(v) => Ok(serde_json::Value::from(v.into_inner())),
Self::Bytes(v) => Ok(serde_json::Value::from(String::from_utf8(v.to_vec())?)),
Self::Bytes(v) => Ok(serde_json::Value::from(simdutf8::compat::from_utf8(&v)?)),
Self::Regex(regex) => Ok(serde_json::Value::from(regex.as_str().to_string())),
Self::Object(v) => Ok(serde_json::to_value(v)?),
Self::Array(v) => Ok(serde_json::to_value(v)?),