diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b7dce30f..d5e9a33e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,7 +1,7 @@ name: 'Build' env: - VERSION: 4.17.0 + VERSION: 4.18.0 ASM_VERSION: 4.0.0 DOC_ARTIFACT: webHelp-all.zip diff --git a/docs/release-history.md b/docs/release-history.md index e67d278c..d421cba1 100644 --- a/docs/release-history.md +++ b/docs/release-history.md @@ -2,7 +2,11 @@ ### Improvements -- Added explicit target for `.NET 8`. +- Added explicit target for `.NET 8` by @aloneguid. + +### Bug fixes + +- `DataFrameMapper` returns incompatible `DataFrameColumn` by @aloneguid (#343). ## 4.17.0 diff --git a/src/Parquet.PerfRunner/Parquet.PerfRunner.csproj b/src/Parquet.PerfRunner/Parquet.PerfRunner.csproj index 7de97ff5..465c41c7 100644 --- a/src/Parquet.PerfRunner/Parquet.PerfRunner.csproj +++ b/src/Parquet.PerfRunner/Parquet.PerfRunner.csproj @@ -11,7 +11,7 @@ - + diff --git a/src/Parquet.Test/DataAnalysis/DataFrameReaderTest.cs b/src/Parquet.Test/DataAnalysis/DataFrameReaderTest.cs index e0e6f227..ae8529a1 100644 --- a/src/Parquet.Test/DataAnalysis/DataFrameReaderTest.cs +++ b/src/Parquet.Test/DataAnalysis/DataFrameReaderTest.cs @@ -16,6 +16,10 @@ public class DataFrameReaderTest : TestBase { [InlineData(typeof(int?), null, 2)] [InlineData(typeof(bool), true, false)] [InlineData(typeof(bool?), true, null)] + [InlineData(typeof(long), 1L, 2L)] + [InlineData(typeof(long?), 1L, 2L)] + [InlineData(typeof(ulong), 1UL, 2UL)] + [InlineData(typeof(ulong?), 1UL, 2UL)] [InlineData(typeof(string), "1", "2")] [InlineData(typeof(string), null, "2")] public async Task Roundtrip_all_types(Type t, object el1, object el2) { @@ -53,6 +57,14 @@ public async Task Roundtrip_all_types(Type t, object el1, object el2) { ms1.Position = 0; DataFrame df1 = await ms1.ReadParquetAsDataFrameAsync(); + if(t == typeof(long)) { + // Int64 is a special case in DataFrame + // see https://github.com/aloneguid/parquet-dotnet/issues/343 for more info + df1.Columns.GetInt64Column(t.Name); + } else if (t == typeof(ulong)) { + df1.Columns.GetUInt64Column(t.Name); + } + Assert.Equal(df.Columns.Count, df1.Columns.Count); for(int i = 0; i < df.Columns.Count; i++) { Assert.Equal(df.Columns[i], df1.Columns[i]); diff --git a/src/Parquet.Test/Parquet.Test.csproj b/src/Parquet.Test/Parquet.Test.csproj index 48ef1aba..7f5b8266 100644 --- a/src/Parquet.Test/Parquet.Test.csproj +++ b/src/Parquet.Test/Parquet.Test.csproj @@ -19,16 +19,16 @@ - - - + + + runtime; build; native; contentfiles; analyzers; buildtransitive all - - + + diff --git a/src/Parquet/Data/Analysis/DataFrameMapper.cs b/src/Parquet/Data/Analysis/DataFrameMapper.cs index a2ec1dd6..6e92831a 100644 --- a/src/Parquet/Data/Analysis/DataFrameMapper.cs +++ b/src/Parquet/Data/Analysis/DataFrameMapper.cs @@ -10,86 +10,107 @@ public static DataFrameColumn ToDataFrameColumn(DataColumn dc) { if(dc.Field.ClrType == typeof(bool)) { if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) { - return new PrimitiveDataFrameColumn(colName, (bool[])dc.Data); + return new BooleanDataFrameColumn(colName, (bool[])dc.Data); } else { - return new PrimitiveDataFrameColumn(colName, (bool?[])dc.Data); + return new BooleanDataFrameColumn(colName, (bool?[])dc.Data); } } if(dc.Field.ClrType == typeof(int)) { if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) { - return new PrimitiveDataFrameColumn(colName, (int[])dc.Data); + return new Int32DataFrameColumn(colName, (int[])dc.Data); } else { - return new PrimitiveDataFrameColumn(colName, (int?[])dc.Data); + return new Int32DataFrameColumn(colName, (int?[])dc.Data); } } if(dc.Field.ClrType == typeof(uint)) { if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) { - return new PrimitiveDataFrameColumn(colName, (uint[])dc.Data); + return new UInt32DataFrameColumn(colName, (uint[])dc.Data); } else { - return new PrimitiveDataFrameColumn(colName, (uint?[])dc.Data); + return new UInt32DataFrameColumn(colName, (uint?[])dc.Data); } } - if(dc.Field.ClrType == typeof(long)) { + if(dc.Field.ClrType == typeof(double)) { if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) { - return new PrimitiveDataFrameColumn(colName, (long[])dc.Data); + return new DoubleDataFrameColumn(colName, (double[])dc.Data); } else { - return new PrimitiveDataFrameColumn(colName, (long?[])dc.Data); + return new DoubleDataFrameColumn(colName, (double?[])dc.Data); } } - if(dc.Field.ClrType == typeof(ulong)) { + if(dc.Field.ClrType == typeof(float)) { if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) { - return new PrimitiveDataFrameColumn(colName, (ulong[])dc.Data); + return new SingleDataFrameColumn(colName, (float[])dc.Data); } else { - return new PrimitiveDataFrameColumn(colName, (ulong?[])dc.Data); + return new SingleDataFrameColumn(colName, (float?[])dc.Data); } } if(dc.Field.ClrType == typeof(byte)) { if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) { - return new PrimitiveDataFrameColumn(colName, (byte[])dc.Data); + return new ByteDataFrameColumn(colName, (byte[])dc.Data); } else { - return new PrimitiveDataFrameColumn(colName, (byte?[])dc.Data); + return new ByteDataFrameColumn(colName, (byte?[])dc.Data); } } if(dc.Field.ClrType == typeof(sbyte)) { if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) { - return new PrimitiveDataFrameColumn(colName, (sbyte[])dc.Data); + return new SByteDataFrameColumn(colName, (sbyte[])dc.Data); } else { - return new PrimitiveDataFrameColumn(colName, (sbyte?[])dc.Data); + return new SByteDataFrameColumn(colName, (sbyte?[])dc.Data); } } - if(dc.Field.ClrType == typeof(DateTime)) { + if(dc.Field.ClrType == typeof(short)) { if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) { - return new PrimitiveDataFrameColumn(colName, (DateTime[])dc.Data); + return new Int16DataFrameColumn(colName, (short[])dc.Data); } else { - return new PrimitiveDataFrameColumn(colName, (DateTime?[])dc.Data); + return new Int16DataFrameColumn(colName, (short?[])dc.Data); } } - if(dc.Field.ClrType == typeof(TimeSpan)) { + if(dc.Field.ClrType == typeof(ushort)) { if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) { - return new PrimitiveDataFrameColumn(colName, (TimeSpan[])dc.Data); + return new UInt16DataFrameColumn(colName, (ushort[])dc.Data); } else { - return new PrimitiveDataFrameColumn(colName, (TimeSpan?[])dc.Data); + return new UInt16DataFrameColumn(colName, (ushort?[])dc.Data); } } - if(dc.Field.ClrType == typeof(decimal)) { + if(dc.Field.ClrType == typeof(long)) { if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) { - return new PrimitiveDataFrameColumn(colName, (decimal[])dc.Data); + return new Int64DataFrameColumn(colName, (long[])dc.Data); } else { - return new PrimitiveDataFrameColumn(colName, (decimal?[])dc.Data); + return new Int64DataFrameColumn(colName, (long?[])dc.Data); } } - if(dc.Field.ClrType == typeof(float)) { + if(dc.Field.ClrType == typeof(ulong)) { if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) { - return new PrimitiveDataFrameColumn(colName, (float[])dc.Data); + return new UInt64DataFrameColumn(colName, (ulong[])dc.Data); } else { - return new PrimitiveDataFrameColumn(colName, (float?[])dc.Data); + return new UInt64DataFrameColumn(colName, (ulong?[])dc.Data); } } - if(dc.Field.ClrType == typeof(double)) { + if(dc.Field.ClrType == typeof(string)) { + if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) { + return new StringDataFrameColumn(colName, (string[])dc.Data); + } else { + return new StringDataFrameColumn(colName, (string?[])dc.Data); + } + } + if(dc.Field.ClrType == typeof(DateTime)) { + if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) { + return new DateTimeDataFrameColumn(colName, (DateTime[])dc.Data); + } else { + return new DateTimeDataFrameColumn(colName, (DateTime?[])dc.Data); + } + } + if(dc.Field.ClrType == typeof(TimeSpan)) { + if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) { + return new PrimitiveDataFrameColumn(colName, (TimeSpan[])dc.Data); + } else { + return new PrimitiveDataFrameColumn(colName, (TimeSpan?[])dc.Data); + } + } + if(dc.Field.ClrType == typeof(decimal)) { if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) { - return new PrimitiveDataFrameColumn(colName, (double[])dc.Data); + return new DecimalDataFrameColumn(colName, (decimal[])dc.Data); } else { - return new PrimitiveDataFrameColumn(colName, (double?[])dc.Data); + return new DecimalDataFrameColumn(colName, (decimal?[])dc.Data); } } // special case @@ -271,55 +292,42 @@ public static void AppendValues(DataFrameColumn dfc, DataColumn dc) { } public static Array GetTypedDataFast(DataFrameColumn col) { - if(col.DataType == typeof(bool)) { return ((PrimitiveDataFrameColumn)col).ToArray(); } - if(col.DataType == typeof(int)) { return ((PrimitiveDataFrameColumn)col).ToArray(); } - if(col.DataType == typeof(uint)) { return ((PrimitiveDataFrameColumn)col).ToArray(); } - if(col.DataType == typeof(long)) { return ((PrimitiveDataFrameColumn)col).ToArray(); } - if(col.DataType == typeof(ulong)) { return ((PrimitiveDataFrameColumn)col).ToArray(); } - if(col.DataType == typeof(byte)) { return ((PrimitiveDataFrameColumn)col).ToArray(); } - if(col.DataType == typeof(sbyte)) { return ((PrimitiveDataFrameColumn)col).ToArray(); } - if(col.DataType == typeof(DateTime)) { return ((PrimitiveDataFrameColumn)col).ToArray(); } - if(col.DataType == typeof(TimeSpan)) { return ((PrimitiveDataFrameColumn)col).ToArray(); } - if(col.DataType == typeof(decimal)) { return ((PrimitiveDataFrameColumn)col).ToArray(); } - if(col.DataType == typeof(float)) { return ((PrimitiveDataFrameColumn)col).ToArray(); } - if(col.DataType == typeof(double)) { return ((PrimitiveDataFrameColumn)col).ToArray(); } - // special case if(col.DataType == typeof(string)) { return ((StringDataFrameColumn)col).ToArray(); diff --git a/src/Parquet/Data/Analysis/DataFrameMapper.tt b/src/Parquet/Data/Analysis/DataFrameMapper.tt index ee6426cf..71e94cce 100644 --- a/src/Parquet/Data/Analysis/DataFrameMapper.tt +++ b/src/Parquet/Data/Analysis/DataFrameMapper.tt @@ -15,6 +15,23 @@ "decimal", "float", "double" }; + var clrToColumnType = new Dictionary { + { "bool", "BooleanDataFrameColumn" }, + { "int", "Int32DataFrameColumn" }, + { "uint", "UInt32DataFrameColumn" }, + { "double", "DoubleDataFrameColumn" }, + { "float", "SingleDataFrameColumn" }, + { "byte", "ByteDataFrameColumn" }, + { "sbyte", "SByteDataFrameColumn" }, + { "short", "Int16DataFrameColumn" }, + { "ushort", "UInt16DataFrameColumn" }, + { "long", "Int64DataFrameColumn" }, + { "ulong", "UInt64DataFrameColumn" }, + { "string", "StringDataFrameColumn" }, + { "DateTime", "DateTimeDataFrameColumn" }, + { "TimeSpan", "PrimitiveDataFrameColumn" }, + { "decimal", "DecimalDataFrameColumn" } + }; #>using System; using System.Linq; using System.Numerics; @@ -25,11 +42,11 @@ namespace Parquet.Data.Analysis { public static DataFrameColumn ToDataFrameColumn(DataColumn dc) { string colName = string.Join("_", dc.Field.Path.ToList()); - <# foreach(var t in valueTypes) { #>if(dc.Field.ClrType == typeof(<#= t #>)) { + <# foreach(var t in clrToColumnType) { #>if(dc.Field.ClrType == typeof(<#= t.Key #>)) { if(dc.Field.ClrType == dc.Field.ClrNullableIfHasNullsType) { - return new PrimitiveDataFrameColumn<<#= t #>>(colName, (<#= t #>[])dc.Data); + return new <#= t.Value #>(colName, (<#= t.Key #>[])dc.Data); } else { - return new PrimitiveDataFrameColumn<<#= t #>>(colName, (<#= t #>?[])dc.Data); + return new <#= t.Value #>(colName, (<#= t.Key #>?[])dc.Data); } } <# } #> @@ -70,12 +87,12 @@ namespace Parquet.Data.Analysis { } public static Array GetTypedDataFast(DataFrameColumn col) { - <# foreach(var t in valueTypes) { #> - if(col.DataType == typeof(<#= t #>)) { + <# foreach(var t in valueTypes) { + #>if(col.DataType == typeof(<#= t #>)) { return ((PrimitiveDataFrameColumn<<#= t #>>)col).ToArray(); } - <# } #> - // special case + <# } + #>// special case if(col.DataType == typeof(string)) { return ((StringDataFrameColumn)col).ToArray(); } diff --git a/src/Parquet/Parquet.csproj b/src/Parquet/Parquet.csproj index 32fd1d0a..36011800 100644 --- a/src/Parquet/Parquet.csproj +++ b/src/Parquet/Parquet.csproj @@ -58,7 +58,7 @@ - +