diff --git a/parquet/metadata/statistics_test.go b/parquet/metadata/statistics_test.go
index d312437a..a0d107fa 100644
--- a/parquet/metadata/statistics_test.go
+++ b/parquet/metadata/statistics_test.go
@@ -603,3 +603,40 @@ func TestBooleanStatisticsUpdateFromBitmapSpaced(t *testing.T) {
 		assert.Equal(t, int64(8), stats.NullCount())
 	})
 }
+
+func TestNewStatisticsDistinctCountUnset(t *testing.T) {
+	// Regression test for issue #806: the statistics factories must not pre-set
+	// hasDistinctCount, otherwise Encode() emits distinct_count=0 for every column
+	// instead of leaving the optional Thrift field absent. IncDistinct() is
+	// responsible for marking it set when an actual distinct count is computed.
+	mk := func(n *schema.PrimitiveNode) metadata.TypedStatistics { // helper: wrap a primitive node in a column and build fresh statistics for it
+		col := schema.NewColumn(n, 0, 0)
+		return metadata.NewStatistics(col, memory.DefaultAllocator)
+	}
+	cases := []struct {
+		name string
+		s    metadata.TypedStatistics
+	}{
+		{"Int32", mk(schema.NewInt32Node("i32", parquet.Repetitions.Required, -1))},
+		{"Int64", mk(schema.NewInt64Node("i64", parquet.Repetitions.Required, -1))},
+		{"Float32", mk(schema.NewFloat32Node("f32", parquet.Repetitions.Required, -1))},
+		{"Float64", mk(schema.NewFloat64Node("f64", parquet.Repetitions.Required, -1))},
+		{"Boolean", mk(schema.NewBooleanNode("b", parquet.Repetitions.Required, -1))},
+		{"ByteArray", mk(schema.NewByteArrayNode("ba", parquet.Repetitions.Required, -1))},
+	} // NOTE(review): the Int96, FixedLenByteArray and Float16 factories are changed by the same patch but are not exercised here
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			assert.False(t, c.s.HasDistinctCount(), "fresh stats must not advertise a distinct count")
+			enc, err := c.s.Encode() // encode before any distinct count has been recorded
+			require.NoError(t, err)
+			assert.False(t, enc.HasDistinctCount, "encoded stats must leave distinct_count unset until IncDistinct is called")
+
+			c.s.IncDistinct(3) // after recording a count, both the flag and the value must be reported
+			assert.True(t, c.s.HasDistinctCount())
+			enc, err = c.s.Encode()
+			require.NoError(t, err)
+			assert.True(t, enc.HasDistinctCount)
+			assert.Equal(t, int64(3), enc.DistinctCount)
+		})
+	}
+}
diff --git 
a/parquet/metadata/statistics_types.gen.go b/parquet/metadata/statistics_types.gen.go index a00010f9..a9a19ea1 100644 --- a/parquet/metadata/statistics_types.gen.go +++ b/parquet/metadata/statistics_types.gen.go @@ -57,12 +57,11 @@ func NewInt32Statistics(descr *schema.Column, mem memory.Allocator) *Int32Statis return &Int32Statistics{ statistics: statistics{ - descr: descr, - hasNullCount: true, - hasDistinctCount: true, - order: descr.SortOrder(), - encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem), - mem: mem, + descr: descr, + hasNullCount: true, + order: descr.SortOrder(), + encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem), + mem: mem, }, } } @@ -358,12 +357,11 @@ func NewInt64Statistics(descr *schema.Column, mem memory.Allocator) *Int64Statis return &Int64Statistics{ statistics: statistics{ - descr: descr, - hasNullCount: true, - hasDistinctCount: true, - order: descr.SortOrder(), - encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem), - mem: mem, + descr: descr, + hasNullCount: true, + order: descr.SortOrder(), + encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem), + mem: mem, }, } } @@ -659,12 +657,11 @@ func NewInt96Statistics(descr *schema.Column, mem memory.Allocator) *Int96Statis return &Int96Statistics{ statistics: statistics{ - descr: descr, - hasNullCount: true, - hasDistinctCount: true, - order: descr.SortOrder(), - encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem), - mem: mem, + descr: descr, + hasNullCount: true, + order: descr.SortOrder(), + encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem), + mem: mem, }, } } @@ -938,12 +935,11 @@ func NewFloat32Statistics(descr *schema.Column, mem memory.Allocator) *Float32St return &Float32Statistics{ statistics: statistics{ - descr: descr, - 
hasNullCount: true, - hasDistinctCount: true, - order: descr.SortOrder(), - encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem), - mem: mem, + descr: descr, + hasNullCount: true, + order: descr.SortOrder(), + encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem), + mem: mem, }, } } @@ -1231,12 +1227,11 @@ func NewFloat64Statistics(descr *schema.Column, mem memory.Allocator) *Float64St return &Float64Statistics{ statistics: statistics{ - descr: descr, - hasNullCount: true, - hasDistinctCount: true, - order: descr.SortOrder(), - encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem), - mem: mem, + descr: descr, + hasNullCount: true, + order: descr.SortOrder(), + encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem), + mem: mem, }, } } @@ -1524,12 +1519,11 @@ func NewBooleanStatistics(descr *schema.Column, mem memory.Allocator) *BooleanSt return &BooleanStatistics{ statistics: statistics{ - descr: descr, - hasNullCount: true, - hasDistinctCount: true, - order: descr.SortOrder(), - encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem), - mem: mem, + descr: descr, + hasNullCount: true, + order: descr.SortOrder(), + encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem), + mem: mem, }, } } @@ -1879,12 +1873,11 @@ func NewByteArrayStatistics(descr *schema.Column, mem memory.Allocator) *ByteArr return &ByteArrayStatistics{ statistics: statistics{ - descr: descr, - hasNullCount: true, - hasDistinctCount: true, - order: descr.SortOrder(), - encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem), - mem: mem, + descr: descr, + hasNullCount: true, + order: descr.SortOrder(), + encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem), + mem: mem, }, min: 
make([]byte, 0), @@ -2187,12 +2180,11 @@ func NewFixedLenByteArrayStatistics(descr *schema.Column, mem memory.Allocator) return &FixedLenByteArrayStatistics{ statistics: statistics{ - descr: descr, - hasNullCount: true, - hasDistinctCount: true, - order: descr.SortOrder(), - encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem), - mem: mem, + descr: descr, + hasNullCount: true, + order: descr.SortOrder(), + encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem), + mem: mem, }, } } @@ -2502,12 +2494,11 @@ func NewFloat16Statistics(descr *schema.Column, mem memory.Allocator) *Float16St return &Float16Statistics{ statistics: statistics{ - descr: descr, - hasNullCount: true, - hasDistinctCount: true, - order: descr.SortOrder(), - encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem), - mem: mem, + descr: descr, + hasNullCount: true, + order: descr.SortOrder(), + encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem), + mem: mem, }, } } diff --git a/parquet/metadata/statistics_types.gen.go.tmpl b/parquet/metadata/statistics_types.gen.go.tmpl index 83493998..76657943 100644 --- a/parquet/metadata/statistics_types.gen.go.tmpl +++ b/parquet/metadata/statistics_types.gen.go.tmpl @@ -62,7 +62,6 @@ func New{{.Name}}Statistics(descr *schema.Column, mem memory.Allocator) *{{.Name statistics: statistics{ descr: descr, hasNullCount: true, - hasDistinctCount: true, order: descr.SortOrder(), encoder: encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false, descr, mem), mem: mem, diff --git a/parquet/pqarrow/file_writer_test.go b/parquet/pqarrow/file_writer_test.go index 7b98b212..7ba4b85b 100644 --- a/parquet/pqarrow/file_writer_test.go +++ b/parquet/pqarrow/file_writer_test.go @@ -171,8 +171,8 @@ func TestFileWriterTotalBytes(t *testing.T) { require.NoError(t, writer.Close()) // Verify total bytes & 
compressed bytes are correct - assert.Equal(t, int64(340), writer.TotalCompressedBytes()) - assert.Equal(t, int64(799), writer.TotalBytesWritten()) + assert.Equal(t, int64(332), writer.TotalCompressedBytes()) + assert.Equal(t, int64(783), writer.TotalBytesWritten()) } func TestFileWriterTotalBytesBuffered(t *testing.T) { @@ -205,8 +205,8 @@ func TestFileWriterTotalBytesBuffered(t *testing.T) { require.NoError(t, writer.Close()) // Verify total bytes & compressed bytes are correct - assert.Equal(t, int64(494), writer.TotalCompressedBytes()) - assert.Equal(t, int64(1139), writer.TotalBytesWritten()) + assert.Equal(t, int64(482), writer.TotalCompressedBytes()) + assert.Equal(t, int64(1115), writer.TotalBytesWritten()) } func TestWriteOnClosedFileWriter(t *testing.T) {