Skip to content

Commit

Permalink
add mapping to concrete subtypes in DataFrameMapper (#343)
Browse files Browse the repository at this point in the history
  • Loading branch information
aloneguid committed Nov 16, 2023
1 parent aea1c45 commit 9dcd02e
Show file tree
Hide file tree
Showing 8 changed files with 101 additions and 60 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name: 'Build'

env:
VERSION: 4.17.0
VERSION: 4.18.0
ASM_VERSION: 4.0.0
DOC_ARTIFACT: webHelp-all.zip

Expand Down
6 changes: 5 additions & 1 deletion docs/release-history.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@

### Improvements

- Added explicit target for `.NET 8`.
- Added explicit target for `.NET 8` by @aloneguid.

### Bug fixes

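- `DataFrameMapper` returned generic `PrimitiveDataFrameColumn<T>` instances instead of the concrete `DataFrameColumn` subtypes (e.g. `Int64DataFrameColumn`), making the columns incompatible with typed accessors such as `GetInt64Column`; it now maps to the concrete subtypes, by @aloneguid (#343).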

## 4.17.0

Expand Down
2 changes: 1 addition & 1 deletion src/Parquet.PerfRunner/Parquet.PerfRunner.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="BenchmarkDotNet" Version="0.13.6" />
<PackageReference Include="BenchmarkDotNet" Version="0.13.10" />
<PackageReference Include="ParquetSharp" Version="12.0.1" />
</ItemGroup>

Expand Down
12 changes: 12 additions & 0 deletions src/Parquet.Test/DataAnalysis/DataFrameReaderTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ public class DataFrameReaderTest : TestBase {
[InlineData(typeof(int?), null, 2)]
[InlineData(typeof(bool), true, false)]
[InlineData(typeof(bool?), true, null)]
[InlineData(typeof(long), 1L, 2L)]
[InlineData(typeof(long?), 1L, 2L)]
[InlineData(typeof(ulong), 1UL, 2UL)]
[InlineData(typeof(ulong?), 1UL, 2UL)]
[InlineData(typeof(string), "1", "2")]
[InlineData(typeof(string), null, "2")]
public async Task Roundtrip_all_types(Type t, object el1, object el2) {
Expand Down Expand Up @@ -53,6 +57,14 @@ public async Task Roundtrip_all_types(Type t, object el1, object el2) {
ms1.Position = 0;
DataFrame df1 = await ms1.ReadParquetAsDataFrameAsync();

if(t == typeof(long)) {
// Int64 is a special case in DataFrame
// see https://github.com/aloneguid/parquet-dotnet/issues/343 for more info
df1.Columns.GetInt64Column(t.Name);
} else if (t == typeof(ulong)) {
df1.Columns.GetUInt64Column(t.Name);
}

Assert.Equal(df.Columns.Count, df1.Columns.Count);
for(int i = 0; i < df.Columns.Count; i++) {
Assert.Equal(df.Columns[i], df1.Columns[i]);
Expand Down
10 changes: 5 additions & 5 deletions src/Parquet.Test/Parquet.Test.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,16 @@
<ItemGroup>
<PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
<PackageReference Include="System.ValueTuple" Version="4.5.0" />
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.6.3" />
<PackageReference Include="xunit" Version="2.5.0" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.5.0">
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.8.0" />
<PackageReference Include="xunit" Version="2.6.1" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.5.3">
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
<PrivateAssets>all</PrivateAssets>
</PackageReference>
</ItemGroup>

<ItemGroup Condition="'$(TargetFramework)' == 'net7.0'">
<PackageReference Include="System.IO.Pipelines" Version="7.0.0" />
<ItemGroup Condition="('$(TargetFramework)' == 'net7.0') or ('$(TargetFramework)' == 'net8.0')">
<PackageReference Include="System.IO.Pipelines" Version="8.0.0" />
</ItemGroup>

<ItemGroup>
Expand Down
96 changes: 52 additions & 44 deletions src/Parquet/Data/Analysis/DataFrameMapper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,86 +10,107 @@ public static DataFrameColumn ToDataFrameColumn(DataColumn dc) {

if(dc.Field.ClrType == typeof(bool)) {
if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) {
return new PrimitiveDataFrameColumn<bool>(colName, (bool[])dc.Data);
return new BooleanDataFrameColumn(colName, (bool[])dc.Data);
} else {
return new PrimitiveDataFrameColumn<bool>(colName, (bool?[])dc.Data);
return new BooleanDataFrameColumn(colName, (bool?[])dc.Data);
}
}
if(dc.Field.ClrType == typeof(int)) {
if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) {
return new PrimitiveDataFrameColumn<int>(colName, (int[])dc.Data);
return new Int32DataFrameColumn(colName, (int[])dc.Data);
} else {
return new PrimitiveDataFrameColumn<int>(colName, (int?[])dc.Data);
return new Int32DataFrameColumn(colName, (int?[])dc.Data);
}
}
if(dc.Field.ClrType == typeof(uint)) {
if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) {
return new PrimitiveDataFrameColumn<uint>(colName, (uint[])dc.Data);
return new UInt32DataFrameColumn(colName, (uint[])dc.Data);
} else {
return new PrimitiveDataFrameColumn<uint>(colName, (uint?[])dc.Data);
return new UInt32DataFrameColumn(colName, (uint?[])dc.Data);
}
}
if(dc.Field.ClrType == typeof(long)) {
if(dc.Field.ClrType == typeof(double)) {
if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) {
return new PrimitiveDataFrameColumn<long>(colName, (long[])dc.Data);
return new DoubleDataFrameColumn(colName, (double[])dc.Data);
} else {
return new PrimitiveDataFrameColumn<long>(colName, (long?[])dc.Data);
return new DoubleDataFrameColumn(colName, (double?[])dc.Data);
}
}
if(dc.Field.ClrType == typeof(ulong)) {
if(dc.Field.ClrType == typeof(float)) {
if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) {
return new PrimitiveDataFrameColumn<ulong>(colName, (ulong[])dc.Data);
return new SingleDataFrameColumn(colName, (float[])dc.Data);
} else {
return new PrimitiveDataFrameColumn<ulong>(colName, (ulong?[])dc.Data);
return new SingleDataFrameColumn(colName, (float?[])dc.Data);
}
}
if(dc.Field.ClrType == typeof(byte)) {
if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) {
return new PrimitiveDataFrameColumn<byte>(colName, (byte[])dc.Data);
return new ByteDataFrameColumn(colName, (byte[])dc.Data);
} else {
return new PrimitiveDataFrameColumn<byte>(colName, (byte?[])dc.Data);
return new ByteDataFrameColumn(colName, (byte?[])dc.Data);
}
}
if(dc.Field.ClrType == typeof(sbyte)) {
if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) {
return new PrimitiveDataFrameColumn<sbyte>(colName, (sbyte[])dc.Data);
return new SByteDataFrameColumn(colName, (sbyte[])dc.Data);
} else {
return new PrimitiveDataFrameColumn<sbyte>(colName, (sbyte?[])dc.Data);
return new SByteDataFrameColumn(colName, (sbyte?[])dc.Data);
}
}
if(dc.Field.ClrType == typeof(DateTime)) {
if(dc.Field.ClrType == typeof(short)) {
if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) {
return new PrimitiveDataFrameColumn<DateTime>(colName, (DateTime[])dc.Data);
return new Int16DataFrameColumn(colName, (short[])dc.Data);
} else {
return new PrimitiveDataFrameColumn<DateTime>(colName, (DateTime?[])dc.Data);
return new Int16DataFrameColumn(colName, (short?[])dc.Data);
}
}
if(dc.Field.ClrType == typeof(TimeSpan)) {
if(dc.Field.ClrType == typeof(ushort)) {
if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) {
return new PrimitiveDataFrameColumn<TimeSpan>(colName, (TimeSpan[])dc.Data);
return new UInt16DataFrameColumn(colName, (ushort[])dc.Data);
} else {
return new PrimitiveDataFrameColumn<TimeSpan>(colName, (TimeSpan?[])dc.Data);
return new UInt16DataFrameColumn(colName, (ushort?[])dc.Data);
}
}
if(dc.Field.ClrType == typeof(decimal)) {
if(dc.Field.ClrType == typeof(long)) {
if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) {
return new PrimitiveDataFrameColumn<decimal>(colName, (decimal[])dc.Data);
return new Int64DataFrameColumn(colName, (long[])dc.Data);
} else {
return new PrimitiveDataFrameColumn<decimal>(colName, (decimal?[])dc.Data);
return new Int64DataFrameColumn(colName, (long?[])dc.Data);
}
}
if(dc.Field.ClrType == typeof(float)) {
if(dc.Field.ClrType == typeof(ulong)) {
if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) {
return new PrimitiveDataFrameColumn<float>(colName, (float[])dc.Data);
return new UInt64DataFrameColumn(colName, (ulong[])dc.Data);
} else {
return new PrimitiveDataFrameColumn<float>(colName, (float?[])dc.Data);
return new UInt64DataFrameColumn(colName, (ulong?[])dc.Data);
}
}
if(dc.Field.ClrType == typeof(double)) {
if(dc.Field.ClrType == typeof(string)) {
if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) {
return new StringDataFrameColumn(colName, (string[])dc.Data);
} else {
return new StringDataFrameColumn(colName, (string?[])dc.Data);
}
}
if(dc.Field.ClrType == typeof(DateTime)) {
if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) {
return new DateTimeDataFrameColumn(colName, (DateTime[])dc.Data);
} else {
return new DateTimeDataFrameColumn(colName, (DateTime?[])dc.Data);
}
}
if(dc.Field.ClrType == typeof(TimeSpan)) {
if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) {
return new PrimitiveDataFrameColumn<TimeSpan>(colName, (TimeSpan[])dc.Data);
} else {
return new PrimitiveDataFrameColumn<TimeSpan>(colName, (TimeSpan?[])dc.Data);
}
}
if(dc.Field.ClrType == typeof(decimal)) {
if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) {
return new PrimitiveDataFrameColumn<double>(colName, (double[])dc.Data);
return new DecimalDataFrameColumn(colName, (decimal[])dc.Data);
} else {
return new PrimitiveDataFrameColumn<double>(colName, (double?[])dc.Data);
return new DecimalDataFrameColumn(colName, (decimal?[])dc.Data);
}
}
// special case
Expand Down Expand Up @@ -271,55 +292,42 @@ public static void AppendValues(DataFrameColumn dfc, DataColumn dc) {
}

public static Array GetTypedDataFast(DataFrameColumn col) {

if(col.DataType == typeof(bool)) {
return ((PrimitiveDataFrameColumn<bool>)col).ToArray();
}

if(col.DataType == typeof(int)) {
return ((PrimitiveDataFrameColumn<int>)col).ToArray();
}

if(col.DataType == typeof(uint)) {
return ((PrimitiveDataFrameColumn<uint>)col).ToArray();
}

if(col.DataType == typeof(long)) {
return ((PrimitiveDataFrameColumn<long>)col).ToArray();
}

if(col.DataType == typeof(ulong)) {
return ((PrimitiveDataFrameColumn<ulong>)col).ToArray();
}

if(col.DataType == typeof(byte)) {
return ((PrimitiveDataFrameColumn<byte>)col).ToArray();
}

if(col.DataType == typeof(sbyte)) {
return ((PrimitiveDataFrameColumn<sbyte>)col).ToArray();
}

if(col.DataType == typeof(DateTime)) {
return ((PrimitiveDataFrameColumn<DateTime>)col).ToArray();
}

if(col.DataType == typeof(TimeSpan)) {
return ((PrimitiveDataFrameColumn<TimeSpan>)col).ToArray();
}

if(col.DataType == typeof(decimal)) {
return ((PrimitiveDataFrameColumn<decimal>)col).ToArray();
}

if(col.DataType == typeof(float)) {
return ((PrimitiveDataFrameColumn<float>)col).ToArray();
}

if(col.DataType == typeof(double)) {
return ((PrimitiveDataFrameColumn<double>)col).ToArray();
}

// special case
if(col.DataType == typeof(string)) {
return ((StringDataFrameColumn)col).ToArray();
Expand Down
31 changes: 24 additions & 7 deletions src/Parquet/Data/Analysis/DataFrameMapper.tt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,23 @@
"decimal",
"float",
"double" };
var clrToColumnType = new Dictionary<string, string> {
{ "bool", "BooleanDataFrameColumn" },
{ "int", "Int32DataFrameColumn" },
{ "uint", "UInt32DataFrameColumn" },
{ "double", "DoubleDataFrameColumn" },
{ "float", "SingleDataFrameColumn" },
{ "byte", "ByteDataFrameColumn" },
{ "sbyte", "SByteDataFrameColumn" },
{ "short", "Int16DataFrameColumn" },
{ "ushort", "UInt16DataFrameColumn" },
{ "long", "Int64DataFrameColumn" },
{ "ulong", "UInt64DataFrameColumn" },
{ "string", "StringDataFrameColumn" },
{ "DateTime", "DateTimeDataFrameColumn" },
{ "TimeSpan", "PrimitiveDataFrameColumn<TimeSpan>" },
{ "decimal", "DecimalDataFrameColumn" }
};
#>using System;
using System.Linq;
using System.Numerics;
Expand All @@ -25,11 +42,11 @@ namespace Parquet.Data.Analysis {
public static DataFrameColumn ToDataFrameColumn(DataColumn dc) {
string colName = string.Join("_", dc.Field.Path.ToList());

<# foreach(var t in valueTypes) { #>if(dc.Field.ClrType == typeof(<#= t #>)) {
<# foreach(var t in clrToColumnType) { #>if(dc.Field.ClrType == typeof(<#= t.Key #>)) {
if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) {
return new PrimitiveDataFrameColumn<<#= t #>>(colName, (<#= t #>[])dc.Data);
return new <#= t.Value #>(colName, (<#= t.Key #>[])dc.Data);
} else {
return new PrimitiveDataFrameColumn<<#= t #>>(colName, (<#= t #>?[])dc.Data);
return new <#= t.Value #>(colName, (<#= t.Key #>?[])dc.Data);
}
}
<# } #>
Expand Down Expand Up @@ -70,12 +87,12 @@ namespace Parquet.Data.Analysis {
}

public static Array GetTypedDataFast(DataFrameColumn col) {
<# foreach(var t in valueTypes) { #>
if(col.DataType == typeof(<#= t #>)) {
<# foreach(var t in valueTypes) {
#>if(col.DataType == typeof(<#= t #>)) {
return ((PrimitiveDataFrameColumn<<#= t #>>)col).ToArray();
}
<# } #>
// special case
<# }
#>// special case
if(col.DataType == typeof(string)) {
return ((StringDataFrameColumn)col).ToArray();
}
Expand Down
2 changes: 1 addition & 1 deletion src/Parquet/Parquet.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
<ItemGroup Condition="'$(TargetFramework)' == 'netstandard2.0'">
<PackageReference Include="System.Reflection.Emit.Lightweight" Version="4.7.0" />
<PackageReference Include="System.Threading.Tasks.Extensions" Version="4.5.4" />
<PackageReference Include="System.Text.Json" Version="7.0.3" />
<PackageReference Include="System.Text.Json" Version="8.0.0" />
</ItemGroup>

<ItemGroup Condition="'$(TargetFramework)' == 'netstandard2.1'">
Expand Down

0 comments on commit 9dcd02e

Please sign in to comment.