From 6d5ca9e09eab1869d6ac8e7438afdaf7f10fd4e8 Mon Sep 17 00:00:00 2001 From: per Date: Fri, 1 May 2026 15:22:50 +0200 Subject: [PATCH 1/2] Implement matrix-avro v0.3.0 --- docs/cookbook/matrix-avro.md | 24 ++- docs/tutorial/11b-matrix-avro.md | 11 +- matrix-avro/README.md | 24 ++- matrix-avro/build.gradle | 2 +- matrix-avro/release.md | 17 ++ matrix-avro/req/v0.3.0.md | 118 ++++++----- .../alipsa/matrix/avro/AvroReadOptions.groovy | 17 ++ .../alipsa/matrix/avro/AvroSchemaDecl.groovy | 46 +++++ .../matrix/avro/AvroWriteOptions.groovy | 16 ++ .../matrix/avro/MatrixAvroReader.groovy | 184 +++++++++++++++++- .../matrix/avro/MatrixAvroWriter.groovy | 115 ++++++----- .../alipsa/matrix/avro/SchemaCacheKey.groovy | 59 ------ .../matrix/avro/MatrixAvroReaderTest.groovy | 102 +++++++++- .../matrix/avro/MatrixAvroWriterTest.groovy | 132 ++++++++++++- matrix-bom/bom.xml | 2 +- 15 files changed, 668 insertions(+), 201 deletions(-) delete mode 100644 matrix-avro/src/main/groovy/se/alipsa/matrix/avro/SchemaCacheKey.groovy diff --git a/docs/cookbook/matrix-avro.md b/docs/cookbook/matrix-avro.md index c1a8596c..2af12f55 100644 --- a/docs/cookbook/matrix-avro.md +++ b/docs/cookbook/matrix-avro.md @@ -53,6 +53,15 @@ Matrix projected = MatrixAvroReader.read( ) ``` +## Inspect a file schema without reading rows + +```groovy +import org.apache.avro.Schema +import se.alipsa.matrix.avro.MatrixAvroReader + +Schema writerSchema = MatrixAvroReader.schema(new File('people.avro')) +``` + ## Write with the Matrix name as the default Avro schema name ```groovy @@ -82,6 +91,13 @@ MatrixAvroWriter.write(orders, new File('orders-decimal.avro'), new AvroWriteOpt Without `inferPrecisionAndScale(true)`, `BigDecimal` columns fall back to Avro `double`. +The same decimal-safe behavior is available through shortcuts: + +```groovy +MatrixAvroWriter.write(orders, new File('orders-decimal.avro'), AvroWriteOptions.exactDecimals()) +MatrixAvroWriter.writeExactDecimals(orders, new File('orders-decimal.avro')) +``` + ## Force a fixed decimal schema for one column ```groovy @@ -90,7 +106,7 @@ import se.alipsa.matrix.avro.AvroWriteOptions import se.alipsa.matrix.avro.MatrixAvroWriter MatrixAvroWriter.write(orders, new File('orders-fixed-decimal.avro'), new AvroWriteOptions() - .columnSchema('total', AvroSchemaDecl.decimal(12, 2)) + .columnSchema('total', AvroSchemaDecl.decimalColumn(12, 2)) ) ``` @@ -120,7 +136,7 @@ Matrix nested = Matrix.builder('Nested') .build() MatrixAvroWriter.write(nested, new File('nested-map.avro'), new AvroWriteOptions() - .columnSchema('props', AvroSchemaDecl.map(AvroSchemaDecl.type(Integer))) + .columnSchema('props', AvroSchemaDecl.mapOf(Integer)) ) ``` @@ -161,7 +177,7 @@ Matrix data = Matrix.builder('TagData') .build() MatrixAvroWriter.write(data, new File('tags.avro'), new AvroWriteOptions() - .columnSchema('tags', AvroSchemaDecl.array(AvroSchemaDecl.type(Long))) + .columnSchema('tags', AvroSchemaDecl.arrayOf(Long)) ) ``` @@ -189,7 +205,7 @@ source.write([ ## Common troubleshooting - UUID reads back as `String`: expected; Avro `uuid` is imported as `String` -- `BigDecimal` reads back as `Double`: expected when `inferPrecisionAndScale` is left at its default `false` +- `BigDecimal` reads back as `Double`: expected when `inferPrecisionAndScale` is left at its default `false`; use `AvroWriteOptions.exactDecimals()` or `writeExactDecimals(...)` - Nested type looks wrong: the default heuristic uses the first non-null sample for lists and map values - Map unexpectedly became a record: all non-null rows shared the 
same key set, so the writer treated it as record-like diff --git a/docs/tutorial/11b-matrix-avro.md b/docs/tutorial/11b-matrix-avro.md index 53c6fd7f..7299211e 100644 --- a/docs/tutorial/11b-matrix-avro.md +++ b/docs/tutorial/11b-matrix-avro.md @@ -7,6 +7,7 @@ This page walks through the Avro module with the typed options APIs first, then - use `AvroReadOptions` for naming and schema evolution - use `AvroWriteOptions` for schema naming, decimal behavior, compression, and explicit nested schema control - use `AvroSchemaDecl` when list or map sampling heuristics are not enough +- inspect schemas with `MatrixAvroReader.schema(...)` without reading all rows - use `Matrix.listReadOptions('avro')`, `Matrix.listWriteOptions('avro')`, `AvroReadOptions.describe()`, and `AvroWriteOptions.describe()` to inspect the current option surface at runtime ## Discover the Available Options @@ -48,6 +49,7 @@ AvroReadOptions options = new AvroReadOptions() .readerSchema(projection) Matrix people = MatrixAvroReader.read(new File('people.avro'), options) +Schema effectiveSchema = MatrixAvroReader.schema(new File('people.avro'), options) ``` Read naming precedence is: @@ -124,6 +126,7 @@ The convenience overloads are still available when you want defaults without con MatrixAvroWriter.write(orders, new File('orders.avro')) MatrixAvroWriter.write(orders, new File('orders.avro'), true) byte[] bytes = MatrixAvroWriter.writeBytes(orders) +MatrixAvroWriter.writeExactDecimals(orders, new File('orders-decimal.avro')) ``` ## Schema Evolution with `readerSchema(...)` @@ -170,9 +173,9 @@ Matrix nested = Matrix.builder("Nested") .build() MatrixAvroWriter.write(nested, new File("nested.avro"), new AvroWriteOptions() - .columnSchema('amount', AvroSchemaDecl.decimal(12, 3)) - .columnSchema('tags', AvroSchemaDecl.array(AvroSchemaDecl.type(Long))) - .columnSchema('props', AvroSchemaDecl.map(AvroSchemaDecl.type(Integer))) + .columnSchema('amount', AvroSchemaDecl.decimalColumn(12, 3)) + .columnSchema('tags', AvroSchemaDecl.arrayOf(Long)) + .columnSchema('props', AvroSchemaDecl.mapOf(Integer)) .columnSchema('person', AvroSchemaDecl.record('PersonRecord', [ name: AvroSchemaDecl.type(String), age : AvroSchemaDecl.type(Integer) @@ -218,7 +221,7 @@ The SPI maps are useful when you want format-agnostic entry points, but the type ## Troubleshooting - A UUID column reads back as `String`: this is the intended read behavior for Avro `uuid` -- A `BigDecimal` column reads back as `Double`: enable `inferPrecisionAndScale(true)` or declare the column with `AvroSchemaDecl.decimal(...)` +- A `BigDecimal` column reads back as `Double`: use `AvroWriteOptions.exactDecimals()`, `writeExactDecimals(...)`, or declare the column with `AvroSchemaDecl.decimalColumn(...)` - A map column became a record: that happens when the non-null rows share one key set; force map encoding with `AvroSchemaDecl.map(...)` - A list or map used the wrong nested type: the default inference uses the first non-null sample; use `columnSchema(...)` when the sample is misleading - Invalid `compressionLevel` or `syncInterval`: the writer validates these fail-fast when options are built or parsed from SPI maps diff --git a/matrix-avro/README.md b/matrix-avro/README.md index f994fbbc..317383b7 100644 --- a/matrix-avro/README.md +++ b/matrix-avro/README.md @@ -8,6 +8,7 @@ This module reads and writes Avro Object Container Files (`.avro`) with support ## At A Glance - read Avro from `File`, `Path`, `URL`, `InputStream`, and byte arrays +- inspect Avro schemas from `File`, `Path`, 
`URL`, `InputStream`, and byte arrays without reading all rows - write Avro to `File`, `Path`, `OutputStream`, and byte arrays - control reads with `AvroReadOptions` - control writes with `AvroWriteOptions` @@ -31,8 +32,10 @@ dependencies { Direct API entry points: - `MatrixAvroReader.read(...)` for `File`, `Path`, `URL`, `InputStream`, and `byte[]` +- `MatrixAvroReader.schema(...)` for inspecting the writer schema, or the effective reader schema when `readerSchema(...)` is set - `MatrixAvroWriter.write(...)` for `File`, `Path`, and `OutputStream` - `MatrixAvroWriter.writeBytes(...)` for in-memory export +- `MatrixAvroWriter.writeExactDecimals(...)` and `writeExactDecimalBytes(...)` for decimal-safe write shortcuts - `AvroReadOptions` for naming and schema evolution - `AvroWriteOptions` for schema naming, decimal behavior, compression, and explicit nested schema control - `AvroSchemaDecl` for per-column decimal, array, map, record, and scalar overrides @@ -81,12 +84,16 @@ AvroReadOptions readOptions = new AvroReadOptions() .readerSchema(projection) Matrix users = MatrixAvroReader.read(new File('users.avro'), readOptions) + +Schema writerSchema = MatrixAvroReader.schema(new File('users.avro')) +Schema effectiveSchema = MatrixAvroReader.schema(new File('users.avro'), readOptions) ``` Useful read options: - `matrixName(...)` overrides the resulting Matrix name - `readerSchema(...)` supplies an Avro reader schema for schema evolution or projection +- `AvroReadOptions.defaults()` and `AvroReadOptions.named(...)` are available when a factory reads better at the call site ### Convenience Shortcuts @@ -133,6 +140,7 @@ Useful write options: - `schemaName(...)` overrides the generated record name - `compression(...)`, `compressionLevel(...)`, and `syncInterval(...)` tune the container file - `columnSchema(...)` and `columnSchemas(...)` override nested schema inference per column +- `AvroWriteOptions.defaults()` and `AvroWriteOptions.exactDecimals()` are available as typed factories ### Convenience Shortcuts @@ -141,6 +149,7 @@ Convenience overloads still exist for default behavior: ```groovy MatrixAvroWriter.write(matrix, new File('data.avro')) MatrixAvroWriter.write(matrix, new File('data.avro'), true) +MatrixAvroWriter.writeExactDecimals(matrix, new File('decimal-data.avro')) byte[] bytes = MatrixAvroWriter.writeBytes(matrix) ``` @@ -155,9 +164,9 @@ import se.alipsa.matrix.avro.AvroSchemaDecl import se.alipsa.matrix.avro.AvroWriteOptions AvroWriteOptions options = new AvroWriteOptions() - .columnSchema('amount', AvroSchemaDecl.decimal(12, 2)) - .columnSchema('tags', AvroSchemaDecl.array(AvroSchemaDecl.type(Long))) - .columnSchema('props', AvroSchemaDecl.map(AvroSchemaDecl.type(Integer))) + .columnSchema('amount', AvroSchemaDecl.decimalColumn(12, 2)) + .columnSchema('tags', AvroSchemaDecl.arrayOf(Long)) + .columnSchema('props', AvroSchemaDecl.mapOf(Integer)) .columnSchema('person', AvroSchemaDecl.record('PersonRecord', [ name: AvroSchemaDecl.type(String), age : AvroSchemaDecl.type(Integer) @@ -167,8 +176,11 @@ AvroWriteOptions options = new AvroWriteOptions() Supported declaration kinds: - `decimal(precision, scale)` for fixed decimal metadata +- `decimalColumn(precision, scale)` as a column-oriented alias for fixed decimal metadata - `array(...)` for explicit array element types +- `arrayOf(Class)` and `arrayOf(AvroScalarTypeDecl)` as scalar array shortcuts - `map(...)` for explicit map value types +- `mapOf(Class)` and `mapOf(AvroScalarTypeDecl)` as scalar map shortcuts - `record(...)` for 
explicit nested record fields - `type(...)` or `scalar(...)` for direct scalar overrides @@ -212,6 +224,7 @@ Read defaults: - Avro `uuid` values are read as `String`, not `UUID` - logical types such as `date`, `time-millis`, `timestamp-millis`, `local-timestamp-micros`, and `decimal` are converted to Java values during import - nested arrays read as `List`, maps as `Map`, and records as `Map` +- `InputStream` read and schema-inspection overloads leave the caller-owned stream open Write defaults: @@ -219,6 +232,7 @@ Write defaults: - `inferPrecisionAndScale` defaults to `false`, so `BigDecimal` columns fall back to Avro `double` - `namespace` defaults to `se.alipsa.matrix.avro` - `compression` defaults to `NULL`, `compressionLevel` to `-1`, and `syncInterval` to `0` +- `OutputStream` write overloads leave the caller-owned stream open Nested-type heuristics: @@ -257,10 +271,12 @@ Matrix projected = MatrixAvroReader.read( MatrixAvroWriter.write( orders, new File('orders.avro'), - new AvroWriteOptions().inferPrecisionAndScale(true) + AvroWriteOptions.exactDecimals() ) ``` +`MatrixAvroWriter.writeExactDecimals(...)` and `writeExactDecimalBytes(...)` are equivalent shortcuts. + ### Custom Schema Naming ```groovy diff --git a/matrix-avro/build.gradle b/matrix-avro/build.gradle index 8f92f487..9e5db339 100644 --- a/matrix-avro/build.gradle +++ b/matrix-avro/build.gradle @@ -10,7 +10,7 @@ plugins { } group = 'se.alipsa.matrix' -version = '0.2.1' +version = '0.3.0' description = 'Matrix Avro import/export with schema evolution and logical type support' JavaCompile javaCompile = compileJava { diff --git a/matrix-avro/release.md b/matrix-avro/release.md index 552a74bf..5bce6a1c 100644 --- a/matrix-avro/release.md +++ b/matrix-avro/release.md @@ -1,5 +1,22 @@ # Matrix-avro release history +## v0.3.0 in progress + +- fixed pre-epoch `local-timestamp-millis` reads by using floor modulo for nanosecond remainders +- made explicit `timestamp-millis` writes of `LocalDateTime` timezone-stable by interpreting them at UTC +- aligned stream ownership with the public docs + - `InputStream` read/schema overloads leave caller-owned streams open + - `OutputStream` write overloads leave caller-owned streams open +- removed the writer schema cache so mutated matrices cannot reuse stale schemas +- added public schema inspection APIs through `MatrixAvroReader.schema(...)` +- added convenience factories and shortcuts + - `AvroReadOptions.defaults()` and `AvroReadOptions.named(...)` + - `AvroWriteOptions.defaults()` and `AvroWriteOptions.exactDecimals()` + - `MatrixAvroWriter.writeExactDecimals(...)` and `writeExactDecimalBytes(...)` + - `AvroSchemaDecl.decimalColumn(...)`, `arrayOf(...)`, and `mapOf(...)` +- tightened public validation for schema building and null options +- refreshed README, tutorial, and cookbook examples for schema inspection and decimal-safe writes + ## v0.2.0 2026-03-19 - clarified reader option semantics and naming precedence diff --git a/matrix-avro/req/v0.3.0.md b/matrix-avro/req/v0.3.0.md index 84dccc64..6392f762 100644 --- a/matrix-avro/req/v0.3.0.md +++ b/matrix-avro/req/v0.3.0.md @@ -4,120 +4,112 @@ This plan addresses the module review findings and adds usability improvements for schema inspection, option ergonomics, decimal-safe writing, explicit schema declarations, and documentation. Each numbered section is intended to be implementable and reviewable as a separate pull request. 
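A minimal sketch of the usability surface this plan targets, using only entry points introduced by this change (the file names and call site are illustrative):

```groovy
import org.apache.avro.Schema
import se.alipsa.matrix.avro.AvroReadOptions
import se.alipsa.matrix.avro.MatrixAvroReader
import se.alipsa.matrix.avro.MatrixAvroWriter
import se.alipsa.matrix.core.Matrix

// Inspect the writer schema without reading any rows (section 6)
Schema schema = MatrixAvroReader.schema(new File('people.avro'))

// Ergonomic option factories (section 7)
Matrix people = MatrixAvroReader.read(new File('people.avro'), AvroReadOptions.named('People'))

// Decimal-safe write shortcut (sections 7 and 8)
MatrixAvroWriter.writeExactDecimals(people, new File('people-exact.avro'))
```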
-A task is only complete when its tests have passed and the exact test command has been recorded in the section or PR description. +A task is only complete when its tests have passed. Run `./gradlew :matrix-avro:test` and `./gradlew :matrix-avro:codenarcMain :matrix-avro:codenarcTest` before marking any task done; record the exact commands and outcomes in the PR description. Final verification commands are in section 11. -## 1. Correct Stream Ownership Semantics +## 1. Fix Negative Local Timestamp Reads -1.1 [ ] Decide and document the stream ownership contract for `MatrixAvroReader.read(InputStream, ...)` and `MatrixAvroWriter.write(Matrix, OutputStream, ...)`: either caller-owned streams remain open, or the API explicitly states that Avro closes them. +1.1 [x] Update `MatrixAvroReader.toLocalDateTimeMillis` to use `Math.floorMod(ms, MILLIS_PER_SECOND)` when calculating nanoseconds, matching the micros path and supporting values before the Unix epoch. -1.2 [ ] If caller-owned streams should remain open, add close-shield wrappers in `MatrixAvroReader` and `MatrixAvroWriter` so closing `DataFileStream` or `DataFileWriter` does not close the caller's stream. +1.2 [x] Add tests with `local-timestamp-millis` and `local-timestamp-micros` values before `1970-01-01T00:00:00` to verify both paths decode correctly. -1.3 [ ] If streams are intentionally closed, update GroovyDoc, README, tutorial, and cookbook examples to state this clearly and show caller code that does not reuse the stream afterward. +## 2. Fix Timezone-Sensitive LocalDateTime Writes -1.4 [ ] Add tests in `matrix-avro/src/test/groovy/test/alipsa/matrix/avro/MatrixAvroReaderTest.groovy` and `MatrixAvroWriterTest.groovy` proving the final stream ownership behavior for both direct and options-based overloads. +2.1 [x] In `MatrixAvroWriter.toTimestampMillisAvroValue`, `LocalDateTime` is currently converted via `ZoneOffset.systemDefault()`, which causes the same value to write differently depending on the JVM's default timezone. Chosen semantic: convert using `ZoneOffset.UTC` consistently. -1.5 [ ] Run tests and record the commands: `./gradlew :matrix-avro:test` and `./gradlew :matrix-avro:codenarcMain :matrix-avro:codenarcTest`. +2.2 [x] Update the write path to implement the chosen semantic. If `ZoneOffset.UTC` is chosen, verify the round-trip path in `MatrixAvroReader.toInstantMillis` is consistent. -## 2. Remove or Harden Schema Caching +2.3 [x] Add tests showing the same `LocalDateTime` value produces identical Avro bytes regardless of JVM timezone, or tests documenting the caller's responsibility for timezone-aware types. -2.1 [ ] Reevaluate the schema cache in `MatrixAvroWriter`: remove it unless benchmarks show it is needed, because schema inference depends on mutable Matrix data values beyond the current cache key. +## 3. Correct Stream Ownership Semantics -2.2 [ ] If the cache is retained, extend `SchemaCacheKey` to include the inferred data profile used by schema generation: decimal precision/scale, list element class, map value class, record-vs-map classification, and record field metadata. +3.1 [x] Decide and document the stream ownership contract for `MatrixAvroReader.read(InputStream, ...)` and `MatrixAvroWriter.write(Matrix, OutputStream, ...)`: caller-owned streams remain open. -2.3 [ ] Add tests proving repeated writes of the same mutated `Matrix` instance produce fresh schemas when decimal precision/scale, list element type, map value type, or record-like keys change. 
+3.2 [x] If caller-owned streams should remain open, add close-shield wrappers in `MatrixAvroReader` and `MatrixAvroWriter` so closing `DataFileStream` or `DataFileWriter` does not close the caller's stream. -2.4 [ ] Add or update benchmarks only if caching is retained, so the performance reason for the additional complexity is documented and reproducible. +3.3 [x] Not applicable; streams are caller-owned and remain open. -2.5 [ ] Run tests and record the commands: `./gradlew :matrix-avro:test` and `./gradlew :matrix-avro:codenarcMain :matrix-avro:codenarcTest`. +3.4 [x] Add tests in `matrix-avro/src/test/groovy/test/alipsa/matrix/avro/MatrixAvroReaderTest.groovy` and `MatrixAvroWriterTest.groovy` proving the final stream ownership behavior for direct and options-based overloads. -## 3. Align Public API Validation +## 4. Remove or Harden Schema Caching -3.1 [ ] Update `MatrixAvroWriter.buildSchema(Matrix, boolean)` to call the same Matrix validation used by write paths, including null, empty matrix, and column-size checks. +4.1 [x] Reevaluate the schema cache in `MatrixAvroWriter`: removed it because schema inference depends on mutable Matrix data values beyond the current cache key, and the simpler uncached path passed module and full repository verification. -3.2 [ ] Update `MatrixAvroWriter.buildSchema(Matrix, AvroWriteOptions)` to fail with `IllegalArgumentException('Options cannot be null')` or the module's established validation style when options are null. +4.2 [x] Not applicable; the cache was removed. -3.3 [ ] Audit public reader and writer methods for inconsistent null handling or NPE-prone paths, and align them with the module's explicit validation exceptions. +4.3 [x] Add tests proving repeated schema builds of the same mutated `Matrix` instance produce fresh schemas. -3.4 [ ] Add tests covering public `buildSchema` validation and any newly aligned validation paths. +4.4 [x] Not applicable; caching was not retained. -3.5 [ ] Run tests and record the commands: `./gradlew :matrix-avro:test` and `./gradlew :matrix-avro:codenarcMain :matrix-avro:codenarcTest`. +## 5. Align Public API Validation -## 4. Fix Negative Local Timestamp Reads +5.1 [x] Update `MatrixAvroWriter.buildSchema(Matrix, boolean)` to call the same Matrix validation used by write paths, including null, empty matrix, and column-size checks. -4.1 [ ] Update `MatrixAvroReader.toLocalDateTimeMillis` to use `Math.floorMod(ms, MILLIS_PER_SECOND)` when calculating nanoseconds, matching the micros path and supporting values before the Unix epoch. +5.2 [x] Update `MatrixAvroWriter.buildSchema(Matrix, AvroWriteOptions)` to fail with `IllegalArgumentException('Options cannot be null')` or the module's established validation style when options are null. -4.2 [ ] Add tests with `local-timestamp-millis` and `local-timestamp-micros` values before `1970-01-01T00:00:00` to verify both paths decode correctly. +5.3 [x] Audit public reader and writer methods for inconsistent null handling or NPE-prone paths, and align them with the module's explicit validation exceptions. -4.3 [ ] Run tests and record the commands: `./gradlew :matrix-avro:test` and `./gradlew :matrix-avro:codenarcMain :matrix-avro:codenarcTest`. +5.4 [x] Add tests covering public `buildSchema` validation and any newly aligned validation paths. -## 5. Add Schema Inspection APIs +## 6. 
Add Schema Inspection APIs -5.1 [ ] Add `MatrixAvroReader.schema(File)`, `schema(Path)`, `schema(URL)`, `schema(byte[])`, and `schema(InputStream)` methods that return the Avro writer schema without reading all rows. +6.1 [x] Add `MatrixAvroReader.schema(File)`, `schema(Path)`, `schema(URL)`, `schema(byte[])`, and `schema(InputStream)` methods that return the Avro writer schema without reading all rows. -5.2 [ ] Add options-aware schema inspection overloads only where they are meaningful, for example to expose the effective reader schema or projected schema when `AvroReadOptions.readerSchema(...)` is supplied. +6.2 [x] Add options-aware schema inspection overloads where the reader schema can differ from the writer schema: specifically, `schema(File, AvroReadOptions)` and equivalent source variants that return the effective schema when `AvroReadOptions.readerSchema(...)` is set. -5.3 [ ] Ensure schema inspection methods follow the same validation, naming, and stream ownership contract chosen in section 1. +6.3 [x] Ensure schema inspection methods follow the same validation, naming, and stream ownership contract chosen in section 3. -5.4 [ ] Add tests for schema inspection from file, path, byte array, and stream sources, including invalid or empty Avro content. +6.4 [x] Add tests for schema inspection from file, path, byte array, and stream sources. -5.5 [ ] Document schema inspection in `matrix-avro/README.md`, the Avro tutorial, and the Avro cookbook. +6.5 [x] Document schema inspection in `matrix-avro/README.md`, the Avro tutorial, and the Avro cookbook. -5.6 [ ] Run tests and record the commands: `./gradlew :matrix-avro:test` and `./gradlew :matrix-avro:codenarcMain :matrix-avro:codenarcTest`. +## 7. Improve Options Ergonomics -## 6. Improve Options Ergonomics +7.1 [x] Add static factory methods to `AvroReadOptions` and `AvroWriteOptions` for common starting points, such as `AvroWriteOptions.defaults()`, `AvroWriteOptions.exactDecimals()`, and `AvroReadOptions.named(String)`. -6.1 [ ] Add static factory methods to `AvroReadOptions` and `AvroWriteOptions` for common starting points, such as `AvroWriteOptions.defaults()`, `AvroWriteOptions.exactDecimals()`, and `AvroReadOptions.named(String)`. +7.2 [x] Evaluated a closure-based Groovy configuration API and did not implement it; the existing typed fluent API plus factories keeps the surface statically typed and concise. -6.2 [ ] Evaluate a closure-based Groovy configuration API for options, for example `AvroWriteOptions.configure { inferPrecisionAndScale true; schemaName 'Orders' }`, while keeping the typed fluent API as the primary Java-friendly surface. +7.3 [x] Add convenience methods for common schema declarations, such as `AvroSchemaDecl.arrayOf(Class)`, `mapOf(Class)`, `decimalColumn(int, int)` naming alternatives, or similar helpers that reduce nested calls without introducing `Object`-typed APIs. -6.3 [ ] Add convenience methods for common schema declarations, such as `AvroSchemaDecl.arrayOf(Class)`, `mapOf(Class)`, `decimalColumn(int, int)` naming alternatives, or similar helpers that reduce nested calls without introducing `Object`-typed APIs. +7.4 [x] Add tests proving the new factories and convenience helpers produce the expected options and Avro schemas. -6.4 [ ] Add tests proving the new factories and convenience helpers produce the same option maps and Avro schemas as the existing explicit fluent API. 
+7.5 [x] Document the new ergonomic API in README, tutorial, and cookbook examples without removing the existing direct and SPI examples. -6.5 [ ] Document the new ergonomic API in README, tutorial, and cookbook examples without removing the existing direct and SPI examples. +## 8. Make Decimal-Safe Writes Easier -6.6 [ ] Run tests and record the commands: `./gradlew :matrix-avro:test` and `./gradlew :matrix-avro:codenarcMain :matrix-avro:codenarcTest`. +8.1 [x] Add a clearly named shortcut for decimal-safe writes, such as `MatrixAvroWriter.writeExactDecimals(...)`, `MatrixAvroWriter.writeDecimalSafe(...)`, or an `AvroWriteOptions.exactDecimals()` factory, while preserving existing overloads for compatibility. -## 7. Make Decimal-Safe Writes Easier +8.2 [x] Default BigDecimal behavior remains unchanged for compatibility; decimal-safe shortcuts are documented prominently. -7.1 [ ] Add a clearly named shortcut for decimal-safe writes, such as `MatrixAvroWriter.writeExactDecimals(...)`, `MatrixAvroWriter.writeDecimalSafe(...)`, or an `AvroWriteOptions.exactDecimals()` factory, while preserving existing overloads for compatibility. +8.3 [x] Add tests showing BigDecimal columns round-trip as `BigDecimal` through the new decimal-safe shortcut and still follow the documented compatibility path through existing defaults. -7.2 [ ] Decide whether v0.3.0 should change the default BigDecimal behavior from double fallback to decimal inference. If changing the default is too breaking, document the compatibility reason and keep the shortcut prominent. +8.4 [x] Update docs to explain the precision tradeoff in one place, with examples for default, decimal-safe, and explicit precision/scale writes. -7.3 [ ] Add tests showing BigDecimal columns round-trip as `BigDecimal` through the new decimal-safe shortcut and still follow the documented compatibility path through existing defaults. +## 9. Strengthen Explicit Schema Declaration Usability -7.4 [ ] Update docs to explain the precision tradeoff in one place, with examples for default, decimal-safe, and explicit precision/scale writes. +9.1 [x] Add ergonomic aliases to `AvroSchemaDecl` for common nested declarations while keeping the existing typed declaration classes package-scoped. -7.5 [ ] Run tests and record the commands: `./gradlew :matrix-avro:test` and `./gradlew :matrix-avro:codenarcMain :matrix-avro:codenarcTest`. +9.2 [x] Expanded explicit schema declaration tests for the new aliases and existing nested array/map/record coverage; aliases are method conveniences and do not change SPI map serialization. -## 8. Strengthen Explicit Schema Declaration Usability +9.3 [x] Reviewed error messages from `AvroSchemaDecl.fromMap(...)` and `AvroWriteOptions.columnSchema(...)`; existing messages already name the expected typed alternatives. -8.1 [ ] Add ergonomic aliases to `AvroSchemaDecl` for common nested declarations while keeping the existing typed declaration classes package-scoped. +9.4 [x] Document map-vs-record inference prominently, including how to force map encoding and record encoding with `columnSchema`. -8.2 [ ] Expand explicit schema declaration tests to cover nested arrays, nested maps, mixed record fields, invalid nested Avro field names, and SPI map parsing of the new aliases if aliases are serializable. +## 10. Documentation and Release Notes -8.3 [ ] Review error messages from `AvroSchemaDecl.fromMap(...)` and `AvroWriteOptions.columnSchema(...)` so common mistakes explain the expected typed alternative. 
+Complete this section only after the relevant implementation sections are done; documentation must reflect the final decisions and APIs, not drafts. -8.4 [ ] Document map-vs-record inference prominently, including how to force map encoding and record encoding with `columnSchema`. +10.1 [x] Update `matrix-avro/README.md` with v0.3.0 behavior covering stream ownership, schema inspection, decimal-safe writes, schema cache behavior, and explicit schema declaration shortcuts. -8.5 [ ] Run tests and record the commands: `./gradlew :matrix-avro:test` and `./gradlew :matrix-avro:codenarcMain :matrix-avro:codenarcTest`. +10.2 [x] Update the root Avro tutorial page with end-to-end examples for schema inspection, decimal-safe writes, schema evolution, and forcing map-vs-record encoding. -## 9. Documentation and Release Notes +10.3 [x] Update the root Avro cookbook page with focused recipes for each usability enhancement. -9.1 [ ] Update `matrix-avro/README.md` with a v0.3.0 section covering stream ownership, schema inspection, decimal-safe writes, schema cache behavior, and explicit schema declaration shortcuts. +10.4 [x] Update `matrix-avro/release.md` with user-visible v0.3.0 changes and migration notes for any behavior changes. -9.2 [ ] Update the root Avro tutorial page with end-to-end examples for schema inspection, decimal-safe writes, schema evolution, and forcing map-vs-record encoding. +## 11. Final Verification -9.3 [ ] Update the root Avro cookbook page with focused recipes for each usability enhancement. +11.1 [x] Run module verification: `./gradlew :matrix-avro:compileGroovy :matrix-avro:compileTestGroovy` (BUILD SUCCESSFUL), `./gradlew :matrix-avro:codenarcMain :matrix-avro:codenarcTest --rerun-tasks` (BUILD SUCCESSFUL), and `./gradlew :matrix-avro:test` (BUILD SUCCESSFUL, 96 tests passed). -9.4 [ ] Update `matrix-avro/release.md` with user-visible v0.3.0 changes and migration notes for any behavior changes. +11.2 [x] Run full repository verification: `./gradlew test` (BUILD SUCCESSFUL). -9.5 [ ] Run documentation-related checks if available, and record the commands. At minimum, run `./gradlew :matrix-avro:test` after documentation examples are updated. - -## 10. Final Verification - -10.1 [ ] Run module verification and record the commands: `./gradlew :matrix-avro:compileGroovy :matrix-avro:compileTestGroovy`, `./gradlew :matrix-avro:codenarcMain :matrix-avro:codenarcTest --rerun-tasks`, and `./gradlew :matrix-avro:test`. - -10.2 [ ] Run full repository verification and record the command: `./gradlew test`. - -10.3 [ ] Confirm `matrix-avro` CodeNarc remains fail-on-warning and the final PR description lists the exact verification commands and outcomes. +11.3 [x] Confirm `matrix-avro` CodeNarc `ignoreFailures = false` remains set in `matrix-avro/build.gradle`; the exact verification commands and outcomes are recorded above. diff --git a/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/AvroReadOptions.groovy b/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/AvroReadOptions.groovy index 72ad75b5..c2f55651 100644 --- a/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/AvroReadOptions.groovy +++ b/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/AvroReadOptions.groovy @@ -34,6 +34,23 @@ class AvroReadOptions { private String matrixName = null private Schema readerSchema = null + /** + * Creates read options with the default settings. 
+ * + * @return default read options + */ + static AvroReadOptions defaults() { + new AvroReadOptions() + } + /** + * Creates read options with a Matrix name override. + * + * @param name the Matrix name + * @return read options with the Matrix name configured + */ + static AvroReadOptions named(String name) { + new AvroReadOptions().matrixName(name) + } /** * Sets the name for the resulting Matrix. * diff --git a/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/AvroSchemaDecl.groovy b/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/AvroSchemaDecl.groovy index b40ae5d0..14cea9eb 100644 --- a/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/AvroSchemaDecl.groovy +++ b/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/AvroSchemaDecl.groovy @@ -97,6 +97,16 @@ abstract class AvroSchemaDecl { validateDecimal(precision, scale, 'decimal') new DecimalAvroSchemaDecl(precision, scale) } + /** + * Creates a fixed decimal schema declaration for a Matrix column. + * + * @param precision the decimal precision + * @param scale the decimal scale + * @return a decimal schema declaration + */ + static AvroSchemaDecl decimalColumn(int precision, int scale) { + decimal(precision, scale) + } /** * Creates an array schema declaration. * @@ -109,6 +119,24 @@ abstract class AvroSchemaDecl { } new ArrayAvroSchemaDecl(elementType) } + /** + * Creates an array schema declaration from a supported Java element type. + * + * @param elementType the Java element type + * @return an array schema declaration + */ + static AvroSchemaDecl arrayOf(Class elementType) { + array(type(elementType)) + } + /** + * Creates an array schema declaration from an Avro scalar element type. + * + * @param elementType the Avro scalar element type + * @return an array schema declaration + */ + static AvroSchemaDecl arrayOf(AvroScalarTypeDecl elementType) { + array(scalar(elementType)) + } /** * Creates a map schema declaration. * @@ -121,6 +149,24 @@ abstract class AvroSchemaDecl { } new MapAvroSchemaDecl(valueType) } + /** + * Creates a map schema declaration from a supported Java value type. + * + * @param valueType the Java value type + * @return a map schema declaration + */ + static AvroSchemaDecl mapOf(Class valueType) { + map(type(valueType)) + } + /** + * Creates a map schema declaration from an Avro scalar value type. + * + * @param valueType the Avro scalar value type + * @return a map schema declaration + */ + static AvroSchemaDecl mapOf(AvroScalarTypeDecl valueType) { + map(scalar(valueType)) + } /** * Creates a record schema declaration using the default nested record name for the column. * diff --git a/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/AvroWriteOptions.groovy b/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/AvroWriteOptions.groovy index 1cdd313b..0c5504fd 100644 --- a/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/AvroWriteOptions.groovy +++ b/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/AvroWriteOptions.groovy @@ -72,6 +72,22 @@ class AvroWriteOptions { */ AvroWriteOptions() { } + /** + * Creates write options with the default settings. + * + * @return default write options + */ + static AvroWriteOptions defaults() { + new AvroWriteOptions() + } + /** + * Creates write options that store BigDecimal columns as Avro decimal logical types. + * + * @return write options with decimal precision and scale inference enabled + */ + static AvroWriteOptions exactDecimals() { + new AvroWriteOptions().inferPrecisionAndScale(true) + } /** * Sets whether to infer precision and scale for BigDecimal columns. 
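+   * <p>Usage sketch; {@link #exactDecimals()} is a factory shortcut for enabling this:
+   * <pre>{@code
+   * AvroWriteOptions options = new AvroWriteOptions().inferPrecisionAndScale(true)
+   * }</pre>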
* diff --git a/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/MatrixAvroReader.groovy b/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/MatrixAvroReader.groovy index cef63c07..d8663fc1 100644 --- a/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/MatrixAvroReader.groovy +++ b/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/MatrixAvroReader.groovy @@ -179,7 +179,164 @@ class MatrixAvroReader { if (input == null) { throw new IllegalArgumentException(INPUT_STREAM_NULL_MESSAGE) } - return readInternal(input, name, DEFAULT_MATRIX_NAME, null) + return readInternal(new NonClosingInputStream(input), name, DEFAULT_MATRIX_NAME, null) + } + /** + * Read the writer schema from an Avro file. + * + * @param file the Avro file to inspect + * @return the Avro schema stored in the file + * @throws IllegalArgumentException if file is null or invalid + * @throws IOException if an I/O error occurs + */ + static Schema schema(File file) { + schema(file, AvroReadOptions.defaults()) + } + /** + * Read the effective schema from an Avro file. + * + *
If {@code options.readerSchema} is set, that schema is returned; otherwise the writer schema + * stored in the file is returned. + * + * @param file the Avro file to inspect + * @param options the read options + * @return the effective Avro schema + * @throws IllegalArgumentException if file or options is null or invalid + * @throws IOException if an I/O error occurs + */ + static Schema schema(File file, AvroReadOptions options) { + validateFile(file) + if (options == null) { + throw new IllegalArgumentException(OPTIONS_NULL_MESSAGE) + } + InputStream is = new FileInputStream(file) + try { + return schemaInternal(is, options.readerSchema) + } finally { + is.close() + } + } + /** + * Read the writer schema from an Avro file path. + * + * @param path the Avro file path to inspect + * @return the Avro schema stored in the file + * @throws IllegalArgumentException if path is null or invalid + * @throws IOException if an I/O error occurs + */ + static Schema schema(Path path) { + schema(path, AvroReadOptions.defaults()) + } + /** + * Read the effective schema from an Avro file path. + * + * @param path the Avro file path to inspect + * @param options the read options + * @return the effective Avro schema + * @throws IllegalArgumentException if path or options is null or invalid + * @throws IOException if an I/O error occurs + */ + static Schema schema(Path path, AvroReadOptions options) { + if (path == null) { + throw new IllegalArgumentException(PATH_NULL_MESSAGE) + } + schema(path.toFile(), options) + } + /** + * Read the writer schema from Avro data at a URL. + * + * @param url the URL to inspect + * @return the Avro schema stored in the data + * @throws IllegalArgumentException if url is null + * @throws IOException if an I/O error occurs + */ + static Schema schema(URL url) { + schema(url, AvroReadOptions.defaults()) + } + /** + * Read the effective schema from Avro data at a URL. + * + * @param url the URL to inspect + * @param options the read options + * @return the effective Avro schema + * @throws IllegalArgumentException if url or options is null + * @throws IOException if an I/O error occurs + */ + static Schema schema(URL url, AvroReadOptions options) { + if (url == null) { + throw new IllegalArgumentException(URL_NULL_MESSAGE) + } + if (options == null) { + throw new IllegalArgumentException(OPTIONS_NULL_MESSAGE) + } + InputStream is = url.openStream() + try { + return schemaInternal(is, options.readerSchema) + } finally { + is.close() + } + } + /** + * Read the writer schema from Avro byte content. + * + * @param content the Avro content + * @return the Avro schema stored in the content + * @throws IllegalArgumentException if content is null + * @throws IOException if an I/O error occurs + */ + static Schema schema(byte[] content) { + schema(content, AvroReadOptions.defaults()) + } + /** + * Read the effective schema from Avro byte content. + * + * @param content the Avro content + * @param options the read options + * @return the effective Avro schema + * @throws IllegalArgumentException if content or options is null + * @throws IOException if an I/O error occurs + */ + static Schema schema(byte[] content, AvroReadOptions options) { + if (content == null) { + throw new IllegalArgumentException(CONTENT_NULL_MESSAGE) + } + if (options == null) { + throw new IllegalArgumentException(OPTIONS_NULL_MESSAGE) + } + schemaInternal(new ByteArrayInputStream(content), options.readerSchema) + } + /** + * Read the writer schema from Avro data in an InputStream. + * + *
The stream will NOT be closed by this method; the caller is responsible for closing it. + * + * @param input the InputStream to inspect + * @return the Avro schema stored in the stream + * @throws IllegalArgumentException if input is null + * @throws IOException if an I/O error occurs + */ + static Schema schema(InputStream input) { + schema(input, AvroReadOptions.defaults()) + } + /** + * Read the effective schema from Avro data in an InputStream. + * + *
The stream will NOT be closed by this method; the caller is responsible for closing it. + * + * @param input the InputStream to inspect + * @param options the read options + * @return the effective Avro schema + * @throws IllegalArgumentException if input or options is null + * @throws IOException if an I/O error occurs + */ + static Schema schema(InputStream input, AvroReadOptions options) { + if (input == null) { + throw new IllegalArgumentException(INPUT_STREAM_NULL_MESSAGE) + } + if (options == null) { + throw new IllegalArgumentException(OPTIONS_NULL_MESSAGE) + } + schemaInternal(new NonClosingInputStream(input), options.readerSchema) } // ---------------------------------------------------------------------- // Methods accepting AvroReadOptions @@ -295,7 +452,16 @@ class MatrixAvroReader { if (options == null) { throw new IllegalArgumentException(OPTIONS_NULL_MESSAGE) } - return readInternal(input, options.matrixName, DEFAULT_MATRIX_NAME, options.readerSchema) + return readInternal(new NonClosingInputStream(input), options.matrixName, DEFAULT_MATRIX_NAME, options.readerSchema) + } + private static Schema schemaInternal(InputStream input, Schema readerSchema) { + GenericDatumReader datumReader = new GenericDatumReader<>() + DataFileStream dfs = new DataFileStream<>(input, datumReader) + try { + return readerSchema ?: dfs.schema + } finally { + dfs.close() + } } /** * Internal read implementation supporting optional reader schema and name resolution. @@ -531,7 +697,7 @@ class MatrixAvroReader { long ms = ((Number) v).longValue() return LocalDateTime.ofEpochSecond( Math.floorDiv(ms, MILLIS_PER_SECOND), - (int)((ms % MILLIS_PER_SECOND) * NANOS_PER_MILLI), + (int)(Math.floorMod(ms, MILLIS_PER_SECOND) * NANOS_PER_MILLI), ZoneOffset.UTC ) } @@ -626,4 +792,16 @@ class MatrixAvroReader { return name ?: DEFAULT_MATRIX_NAME } + private static final class NonClosingInputStream extends FilterInputStream { + + private NonClosingInputStream(InputStream input) { + super(input) + } + + @Override + void close() throws IOException { + // Caller owns the wrapped stream. + } + } + } diff --git a/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/MatrixAvroWriter.groovy b/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/MatrixAvroWriter.groovy index 64ba9f60..88533136 100644 --- a/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/MatrixAvroWriter.groovy +++ b/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/MatrixAvroWriter.groovy @@ -57,8 +57,6 @@ class MatrixAvroWriter { private static final long MILLIS_PER_SECOND = 1_000L private static final int NANOS_PER_MICRO = 1_000 private static final int NANOS_PER_MILLI = 1_000_000 - private static final Map> SCHEMA_CACHE = - Collections.synchronizedMap(new WeakHashMap>()) /** * Write a Matrix to an Avro file. * @@ -118,7 +116,7 @@ class MatrixAvroWriter { } Schema schema = buildSchema(matrix, inferPrecisionAndScale) DataFileWriter dfw = new DataFileWriter<>(new GenericDatumWriter(schema)) - dfw.create(schema, out) + dfw.create(schema, new NonClosingOutputStream(out)) try { writeRows(matrix, dfw, schema) } finally { @@ -213,7 +211,7 @@ class MatrixAvroWriter { } Schema schema = buildSchema(matrix, options) DataFileWriter dfw = createDataFileWriter(schema, options) - dfw.create(schema, out) + dfw.create(schema, new NonClosingOutputStream(out)) try { writeRows(matrix, dfw, schema) } finally { @@ -245,6 +243,47 @@ class MatrixAvroWriter { } return baos.toByteArray() } + /** + * Write a Matrix to an Avro file with exact decimal logical types inferred from the data. 
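+   * <p>Usage sketch; this is shorthand for {@code write(matrix, file, AvroWriteOptions.exactDecimals())}:
+   * <pre>{@code
+   * MatrixAvroWriter.writeExactDecimals(orders, new File('orders-decimal.avro'))
+   * }</pre>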
+ * + * @param matrix the Matrix to write + * @param file the target file + * @throws IOException if an I/O error occurs + */ + static void writeExactDecimals(Matrix matrix, File file) { + write(matrix, file, AvroWriteOptions.exactDecimals()) + } + /** + * Write a Matrix to an Avro file at the specified Path with exact decimal logical types inferred from the data. + * + * @param matrix the Matrix to write + * @param path the target path + * @throws IOException if an I/O error occurs + */ + static void writeExactDecimals(Matrix matrix, Path path) { + write(matrix, path, AvroWriteOptions.exactDecimals()) + } + /** + * Write a Matrix to an OutputStream with exact decimal logical types inferred from the data. + * + *
The stream will NOT be closed by this method; the caller is responsible for closing it. + * + * @param matrix the Matrix to write + * @param out the OutputStream to write to + * @throws IOException if an I/O error occurs + */ + static void writeExactDecimals(Matrix matrix, OutputStream out) { + write(matrix, out, AvroWriteOptions.exactDecimals()) + } + /** + * Write a Matrix to a byte array with exact decimal logical types inferred from the data. + * + * @param matrix the Matrix to write + * @return byte array containing the Avro data + */ + static byte[] writeExactDecimalBytes(Matrix matrix) { + writeBytes(matrix, AvroWriteOptions.exactDecimals()) + } /** * Creates a DataFileWriter configured with the specified options. */ @@ -317,6 +356,7 @@ class MatrixAvroWriter { * @return an Avro record schema suitable for writing the Matrix */ static Schema buildSchema(Matrix matrix, boolean inferPrecisionAndScale) { + validateMatrix(matrix) return buildSchemaInternal( matrix, inferPrecisionAndScale, @@ -333,6 +373,10 @@ class MatrixAvroWriter { * @return an Avro record schema suitable for writing the Matrix */ static Schema buildSchema(Matrix matrix, AvroWriteOptions options) { + validateMatrix(matrix) + if (options == null) { + throw new IllegalArgumentException(OPTIONS_NULL_MESSAGE) + } return buildSchemaInternal( matrix, options.inferPrecisionAndScale, @@ -359,19 +403,6 @@ class MatrixAvroWriter { Map columnSchemas) { Map declaredSchemas = columnSchemas ?: [:] validateDeclaredColumnSchemas(matrix, declaredSchemas) - SchemaCacheKey cacheKey = new SchemaCacheKey( - schemaName, - namespace, - inferPrecisionAndScale, - matrix.rowCount(), - matrix.columnNames(), - matrix.types(), - schemaSignature(declaredSchemas) - ) - Schema cached = getCachedSchema(matrix, cacheKey) - if (cached != null) { - return cached - } Schema record = Schema.createRecord(schemaName, 'Generated by MatrixAvroWriter', namespace, false) List fields = new ArrayList<>(matrix.columnCount()) Map profiles = analyzeColumns(matrix, inferPrecisionAndScale) @@ -420,7 +451,6 @@ class MatrixAvroWriter { fields.add(new Schema.Field(col, nullable, null as String, (Object) null)) } record.setFields(fields) - cacheSchema(matrix, cacheKey, record) return record } /** @@ -661,9 +691,7 @@ class MatrixAvroWriter { if (!LocalDateTime.isInstance(v)) { return NO_AVRO_VALUE } - ((LocalDateTime) v) - .toInstant(ZoneOffset.systemDefault().getRules().getOffset((LocalDateTime) v)) - .toEpochMilli() + ((LocalDateTime) v).toInstant(ZoneOffset.UTC).toEpochMilli() } private static Object toLocalTimestampMillisAvroValue(Object v) { if (!LocalDateTime.isInstance(v)) { @@ -740,29 +768,6 @@ class MatrixAvroWriter { } record } - /** - * Returns a cached schema for the given matrix and cache key, if available. - * - * @param matrix the matrix used as a cache namespace - * @param key the cache key describing schema inputs - * @return a cached schema instance, or null if none exists - */ - private static Schema getCachedSchema(Matrix matrix, SchemaCacheKey key) { - synchronized (SCHEMA_CACHE) { - Map perMatrix = SCHEMA_CACHE.get(matrix) - return perMatrix == null ? 
null : perMatrix.get(key) - } - } - private static void cacheSchema(Matrix matrix, SchemaCacheKey key, Schema schema) { - synchronized (SCHEMA_CACHE) { - Map perMatrix = SCHEMA_CACHE.get(matrix) - if (perMatrix == null) { - perMatrix = [:] - SCHEMA_CACHE.put(matrix, perMatrix) - } - perMatrix.put(key, schema) - } - } private static Map analyzeColumns(Matrix matrix, boolean inferPrecisionAndScale) { Map profiles = [:] for (String col : matrix.columnNames()) { @@ -949,13 +954,6 @@ class MatrixAvroWriter { } } } - private static Map> schemaSignature(Map declaredSchemas) { - Map> signature = [:] - declaredSchemas.each { String columnName, AvroSchemaDecl declaration -> - signature[columnName] = declaration.toMap() - } - signature.asImmutable() - } /** * Checks if a Java value is compatible with an Avro schema type. * @@ -985,7 +983,8 @@ class MatrixAvroWriter { return switch (name) { case 'date' -> LocalDate.isInstance(v) || java.sql.Date.isInstance(v) || Number.isInstance(v) case 'time-millis', 'time-micros' -> LocalTime.isInstance(v) || Time.isInstance(v) || Number.isInstance(v) - case 'timestamp-millis', 'timestamp-micros' -> Instant.isInstance(v) || Date.isInstance(v) || Number.isInstance(v) + case 'timestamp-millis', 'timestamp-micros' -> + Instant.isInstance(v) || Date.isInstance(v) || LocalDateTime.isInstance(v) || Number.isInstance(v) case 'local-timestamp-millis', 'local-timestamp-micros' -> LocalDateTime.isInstance(v) || Number.isInstance(v) case 'uuid' -> UUID.isInstance(v) || String.isInstance(v) case 'decimal' -> BigDecimal.isInstance(v) || Double.isInstance(v) || Float.isInstance(v) || @@ -1050,4 +1049,16 @@ class MatrixAvroWriter { return schema.getType().name() } + private static final class NonClosingOutputStream extends FilterOutputStream { + + private NonClosingOutputStream(OutputStream out) { + super(out) + } + + @Override + void close() throws IOException { + flush() + } + } + } diff --git a/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/SchemaCacheKey.groovy b/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/SchemaCacheKey.groovy deleted file mode 100644 index fd0d4578..00000000 --- a/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/SchemaCacheKey.groovy +++ /dev/null @@ -1,59 +0,0 @@ -package se.alipsa.matrix.avro - -import groovy.transform.PackageScope - -/** - * Cache key for inferred or configured Avro schemas. 
- */ -@PackageScope -final class SchemaCacheKey { - - private static final int HASH_FACTOR = 31 - private final String schemaName - private final String namespace - private final boolean inferPrecisionAndScale - private final int rowCount - private final List columnNames - private final List> columnTypes - private final Map> columnSchemas - SchemaCacheKey(String schemaName, String namespace, boolean inferPrecisionAndScale, - int rowCount, List columnNames, List> columnTypes, - Map> columnSchemas) { - this.schemaName = schemaName - this.namespace = namespace - this.inferPrecisionAndScale = inferPrecisionAndScale - this.rowCount = rowCount - this.columnNames = Collections.unmodifiableList(new ArrayList<>(columnNames)) - this.columnTypes = Collections.unmodifiableList(new ArrayList<>(columnTypes)) - this.columnSchemas = columnSchemas - } - @Override - boolean equals(Object other) { - if (this.is(other)) { - return true - } - if (!SchemaCacheKey.isInstance(other)) { - return false - } - SchemaCacheKey that = (SchemaCacheKey) other - inferPrecisionAndScale == that.inferPrecisionAndScale && - rowCount == that.rowCount && - schemaName == that.schemaName && - namespace == that.namespace && - columnNames == that.columnNames && - columnTypes == that.columnTypes && - columnSchemas == that.columnSchemas - } - @Override - int hashCode() { - int result = schemaName.hashCode() - result = HASH_FACTOR * result + namespace.hashCode() - result = HASH_FACTOR * result + (inferPrecisionAndScale ? 1 : 0) - result = HASH_FACTOR * result + rowCount - result = HASH_FACTOR * result + columnNames.hashCode() - result = HASH_FACTOR * result + columnTypes.hashCode() - result = HASH_FACTOR * result + columnSchemas.hashCode() - result - } - -} diff --git a/matrix-avro/src/test/groovy/test/alipsa/matrix/avro/MatrixAvroReaderTest.groovy b/matrix-avro/src/test/groovy/test/alipsa/matrix/avro/MatrixAvroReaderTest.groovy index 9d7a8e6b..954c3683 100644 --- a/matrix-avro/src/test/groovy/test/alipsa/matrix/avro/MatrixAvroReaderTest.groovy +++ b/matrix-avro/src/test/groovy/test/alipsa/matrix/avro/MatrixAvroReaderTest.groovy @@ -21,6 +21,7 @@ import java.nio.file.Files import java.nio.file.Path import java.time.Instant import java.time.LocalDate +import java.time.LocalDateTime @TestMethodOrder(MethodOrderer.OrderAnnotation) class MatrixAvroReaderTest { @@ -304,9 +305,70 @@ class MatrixAvroReaderTest { assertTrue(m['age'][0] instanceof Long) } - // ---------- custom exception tests ---------- + @Test @Order(28) + void testSchemaInspectionMethods() { + byte[] content = Files.readAllBytes(avroFile.toPath()) + def input = new TrackingInputStream(content) + Schema readerSchema = new Schema.Parser().parse(''' + { + "type": "record", + "name": "Person", + "fields": [ + {"name":"name", "type":"string"} + ] + } + '''.stripIndent()) + + assertEquals('Person', MatrixAvroReader.schema(avroFile).name) + assertEquals('Person', MatrixAvroReader.schema(avroFile.toPath()).name) + assertEquals('Person', MatrixAvroReader.schema(avroFile.toURI().toURL()).name) + assertEquals('Person', MatrixAvroReader.schema(content).name) + assertEquals('Person', MatrixAvroReader.schema(input).name) + assertFalse(input.closed) + assertEquals(readerSchema, MatrixAvroReader.schema(avroFile, AvroReadOptions.defaults().readerSchema(readerSchema))) + } + + @Test @Order(29) + void testReadFactoriesAndInputStreamOwnership() { + byte[] content = Files.readAllBytes(avroFile.toPath()) + def input = new TrackingInputStream(content) + + Matrix m = MatrixAvroReader.read(input, 
AvroReadOptions.named('NamedFactory')) + + assertBasicShapeAndValues(m) + assertEquals('NamedFactory', m.matrixName) + assertFalse(input.closed) + } @Test @Order(30) + void testNegativeLocalTimestampReads() { + Schema localSchema = new Schema.Parser().parse(''' + { + "type": "record", + "name": "NegativeLocalTimestamp", + "fields": [ + {"name":"millis", "type":{"type":"long","logicalType":"local-timestamp-millis"}}, + {"name":"micros", "type":{"type":"long","logicalType":"local-timestamp-micros"}} + ] + } + '''.stripIndent()) + + File file = Files.createTempFile('matrix-avro-negative-local-', '.avro').toFile() + try { + writeSingleRecord(file, localSchema, [millis: -1L, micros: -1L]) + + Matrix m = MatrixAvroReader.read(file) + + assertEquals(LocalDateTime.of(1969, 12, 31, 23, 59, 59, 999_000_000), m[0, 'millis']) + assertEquals(LocalDateTime.of(1969, 12, 31, 23, 59, 59, 999_999_000), m[0, 'micros']) + } finally { + file.delete() + } + } + + // ---------- custom exception tests ---------- + + @Test @Order(40) void testValidationExceptionForNullFile() { def ex = assertThrows(AvroValidationException) { MatrixAvroReader.read((File) null) @@ -316,7 +378,7 @@ class MatrixAvroReaderTest { assertTrue(ex.message.contains('cannot be null')) } - @Test @Order(31) + @Test @Order(41) void testValidationExceptionForNonExistentFile() { def ex = assertThrows(AvroValidationException) { MatrixAvroReader.read(new File('/non/existent/path.avro')) @@ -326,7 +388,7 @@ class MatrixAvroReaderTest { assertTrue(ex.message.contains('does not exist')) } - @Test @Order(32) + @Test @Order(42) void testValidationExceptionForDirectory() { File tempDir = Files.createTempDirectory('avro-test').toFile() try { @@ -341,7 +403,7 @@ class MatrixAvroReaderTest { } } - @Test @Order(33) + @Test @Order(43) void testValidationEmptyFileHandling() { File emptyFile = Files.createTempFile('avro-empty-', '.avro').toFile() try { @@ -355,7 +417,7 @@ class MatrixAvroReaderTest { } } - @Test @Order(34) + @Test @Order(44) void testValidationCorruptFileHandling() { File corruptFile = Files.createTempFile('avro-corrupt-', '.avro').toFile() Files.write(corruptFile.toPath(), 'not avro data'.bytes) @@ -404,6 +466,20 @@ class MatrixAvroReaderTest { } } + private static void writeSingleRecord(File outFile, Schema schema, Map values) { + def writer = new DataFileWriter(new GenericDatumWriter<>(schema)) + writer.create(schema, outFile) + try { + def rec = new GenericData.Record(schema) + values.each { String key, Object value -> + rec.put(key, value) + } + writer.append(rec) + } finally { + writer.close() + } + } + private static GenericRecord makeRecord(Schema schema, String name, int age, LocalDate birthday, Instant ts, @@ -447,4 +523,20 @@ class MatrixAvroReaderTest { assertTrue(m['price'][0] instanceof BigDecimal) } + private static final class TrackingInputStream extends ByteArrayInputStream implements Closeable { + + boolean closed + + private TrackingInputStream(byte[] bytes) { + super(bytes) + } + + @Override + void close() throws IOException { + closed = true + super.close() + } + + } + } diff --git a/matrix-avro/src/test/groovy/test/alipsa/matrix/avro/MatrixAvroWriterTest.groovy b/matrix-avro/src/test/groovy/test/alipsa/matrix/avro/MatrixAvroWriterTest.groovy index 21396d14..49c658f9 100644 --- a/matrix-avro/src/test/groovy/test/alipsa/matrix/avro/MatrixAvroWriterTest.groovy +++ b/matrix-avro/src/test/groovy/test/alipsa/matrix/avro/MatrixAvroWriterTest.groovy @@ -9,8 +9,10 @@ import org.apache.avro.generic.GenericDatumReader import 
org.apache.avro.generic.GenericRecord import org.junit.jupiter.api.Test +import se.alipsa.matrix.avro.AvroScalarTypeDecl import se.alipsa.matrix.avro.AvroSchemaDecl import se.alipsa.matrix.avro.AvroWriteOptions +import se.alipsa.matrix.avro.MatrixAvroReader import se.alipsa.matrix.avro.MatrixAvroWriter import se.alipsa.matrix.avro.exceptions.AvroSchemaException import se.alipsa.matrix.avro.exceptions.AvroValidationException @@ -20,6 +22,7 @@ import java.nio.file.Files import java.nio.file.Path import java.time.LocalDateTime import java.time.LocalTime +import java.time.ZoneOffset class MatrixAvroWriterTest { @@ -107,7 +110,7 @@ class MatrixAvroWriterTest { byte[] avroBytes = MatrixAvroWriter.writeBytes(original, true) // Read it back - Matrix result = se.alipsa.matrix.avro.MatrixAvroReader.read(avroBytes, 'RoundTrip') + Matrix result = MatrixAvroReader.read(avroBytes, 'RoundTrip') assertEquals(original.rowCount(), result.rowCount()) assertEquals(original.columnCount(), result.columnCount()) @@ -226,11 +229,26 @@ class MatrixAvroWriterTest { assertTrue(bytes.length > 0, 'Output stream should contain data') // Verify it can be read back - Matrix result = se.alipsa.matrix.avro.MatrixAvroReader.read(bytes, 'StreamTest') + Matrix result = MatrixAvroReader.read(bytes, 'StreamTest') assertEquals(3, result.rowCount()) assertEquals(2, result.columnCount()) } + @Test + void testWriteToOutputStreamDoesNotCloseCallerStream() { + Matrix m = Matrix.builder('OpenStream') + .columns(id: [1, 2]) + .types(Integer) + .build() + + def out = new TrackingOutputStream() + MatrixAvroWriter.write(m, out, false) + + assertFalse(out.closed) + out.write(1) + assertTrue(out.toByteArray().length > 0) + } + @Test void testWriteCreatesParentDirectory() { Map> cols = [:] @@ -389,7 +407,7 @@ class MatrixAvroWriterTest { "Compressed file (${compressed.length()}) should be smaller than uncompressed (${uncompressed.length()})") // Verify data can still be read correctly - Matrix result = se.alipsa.matrix.avro.MatrixAvroReader.read(compressed) + Matrix result = MatrixAvroReader.read(compressed) assertEquals(100, result.rowCount()) assertEquals(2, result.columnCount()) } finally { @@ -454,11 +472,68 @@ class MatrixAvroWriterTest { assertTrue(bytes.length > 0) // Verify round-trip - Matrix result = se.alipsa.matrix.avro.MatrixAvroReader.read(bytes) + Matrix result = MatrixAvroReader.read(bytes) assertEquals(2, result.rowCount()) assertEquals(123.45, result[0, 'value']) } + @Test + void testFactoryOptionsWriteExactDecimals() { + Matrix m = Matrix.builder('ExactDecimals') + .columns(amount: [12.34, 56.789]) + .types(BigDecimal) + .build() + + byte[] bytes = MatrixAvroWriter.writeExactDecimalBytes(m) + Schema amountSchema = nonNullFieldSchema(MatrixAvroReader.schema(bytes), 'amount') + + assertEquals(Schema.Type.BYTES, amountSchema.type) + assertEquals('decimal', amountSchema.logicalType.name) + assertTrue(AvroWriteOptions.exactDecimals().inferPrecisionAndScale) + assertFalse(AvroWriteOptions.defaults().inferPrecisionAndScale) + } + + @Test + void testBuildSchemaDoesNotReturnStaleSchemaForMutatedMatrix() { + Matrix m = Matrix.builder('MutableSchema') + .columns(value: [1, 2]) + .types(Object) + .build() + + Schema intSchema = MatrixAvroWriter.buildSchema(m, false) + m[0, 'value'] = Integer.MAX_VALUE + 1L + Schema longSchema = MatrixAvroWriter.buildSchema(m, false) + + assertNotSame(intSchema, longSchema) + assertEquals(Schema.Type.INT, nonNullFieldSchema(intSchema, 'value').type) + assertEquals(Schema.Type.LONG, 
+  }
+
+  @Test
+  void testLocalDateTimeTimestampMillisWritesAsUtc() {
+    TimeZone original = TimeZone.default
+    try {
+      Matrix m = Matrix.builder('UtcTimestamp')
+        .columns(ts: [LocalDateTime.of(2024, 1, 2, 3, 4, 5)])
+        .types(LocalDateTime)
+        .build()
+
+      AvroWriteOptions options = AvroWriteOptions.defaults()
+        .columnSchema('ts', AvroSchemaDecl.scalar(AvroScalarTypeDecl.TIMESTAMP_MILLIS))
+
+      TimeZone.default = TimeZone.getTimeZone('America/New_York')
+      byte[] newYorkBytes = MatrixAvroWriter.writeBytes(m, options)
+      TimeZone.default = TimeZone.getTimeZone('Asia/Tokyo')
+      byte[] tokyoBytes = MatrixAvroWriter.writeBytes(m, options)
+
+      long expected = LocalDateTime.of(2024, 1, 2, 3, 4, 5).toInstant(ZoneOffset.UTC).toEpochMilli()
+      assertEquals(expected, rawLongFor(newYorkBytes, 'ts'))
+      assertEquals(expected, rawLongFor(tokyoBytes, 'ts'))
+    } finally {
+      TimeZone.default = original
+    }
+  }
+
   @Test
   void testExplicitDecimalColumnSchemaOverridesInferenceDefaults() {
     Matrix m = Matrix.builder('ExplicitDecimal')
@@ -514,6 +589,28 @@ class MatrixAvroWriterTest {
     }
   }
 
+  @Test
+  void testColumnSchemaConvenienceAliases() {
+    Matrix m = Matrix.builder('Aliases')
+      .columns(amount: [12.30], tags: [[1L, 2L]], props: [[x: 1]])
+      .types(BigDecimal, List, Map)
+      .build()
+
+    byte[] bytes = MatrixAvroWriter.writeBytes(m, AvroWriteOptions.defaults()
+      .columnSchema('amount', AvroSchemaDecl.decimalColumn(8, 2))
+      .columnSchema('tags', AvroSchemaDecl.arrayOf(Long))
+      .columnSchema('props', AvroSchemaDecl.mapOf(Integer)))
+
+    Schema schema = MatrixAvroReader.schema(bytes)
+    assertEquals('decimal', nonNullFieldSchema(schema, 'amount').logicalType.name)
+    assertEquals(Schema.Type.LONG, nonNullFieldSchema(schema, 'tags').elementType.types.find { Schema it ->
+      it.type != Schema.Type.NULL
+    }.type)
+    assertEquals(Schema.Type.INT, nonNullFieldSchema(schema, 'props').valueType.types.find { Schema it ->
+      it.type != Schema.Type.NULL
+    }.type)
+  }
+
   @Test
   void testColumnSchemaCanForceRecordEncoding() {
     Matrix m = Matrix.builder('ForceRecord')
@@ -647,7 +744,7 @@ class MatrixAvroWriterTest {
     byte[] bytes = baos.toByteArray()
     assertTrue(bytes.length > 0)
 
-    Matrix result = se.alipsa.matrix.avro.MatrixAvroReader.read(bytes)
+    Matrix result = MatrixAvroReader.read(bytes)
     assertEquals(2, result.rowCount())
   }
 
@@ -761,4 +858,29 @@ class MatrixAvroWriterTest {
     return s
   }
 
+  private static long rawLongFor(byte[] bytes, String fieldName) {
+    def reader = new DataFileReader(
+      new org.apache.avro.file.SeekableByteArrayInput(bytes),
+      new GenericDatumReader<>()
+    )
+    try {
+      GenericRecord record = reader.next()
+      return (Long) record.get(fieldName)
+    } finally {
+      reader.close()
+    }
+  }
+
+  private static final class TrackingOutputStream extends ByteArrayOutputStream implements Closeable {
+
+    boolean closed
+
+    @Override
+    void close() throws IOException {
+      closed = true
+      super.close()
+    }
+
+  }
+
 }
diff --git a/matrix-bom/bom.xml b/matrix-bom/bom.xml
index b55d7996..3a28c216 100644
--- a/matrix-bom/bom.xml
+++ b/matrix-bom/bom.xml
@@ -33,7 +33,7 @@
     21
     0.2.1
-    0.2.1
+    0.3.0
     0.6.1
     0.5.0-SNAPSHOT
     3.7.1
 

From ea200da202eb5fa2e6ffdf35e41c8b11577aa33e Mon Sep 17 00:00:00 2001
From: per
Date: Fri, 1 May 2026 17:22:41 +0200
Subject: [PATCH 2/2] Fix Avro timestamp micros compatibility

---
 .../matrix/avro/MatrixAvroWriter.groovy       |  4 +++-
 .../matrix/avro/MatrixAvroWriterTest.groovy   | 20 +++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

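After this change a LocalDateTime value only passes the writer's
compatibility check for timestamp-millis; a timestamp-micros column
accepts an Instant, a Date, or a raw Number. A minimal call-site sketch,
assuming the UTC interpretation that testLocalDateTimeTimestampMillisWritesAsUtc
asserts for millis (variable names are illustrative):

    import java.time.Instant
    import java.time.LocalDateTime
    import java.time.ZoneOffset

    // Pin the zoneless LocalDateTime to UTC, then hand the Instant to
    // the timestamp-micros column instead of the LocalDateTime itself.
    Instant ts = LocalDateTime.of(2024, 1, 2, 3, 4, 5).toInstant(ZoneOffset.UTC)
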
diff --git a/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/MatrixAvroWriter.groovy b/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/MatrixAvroWriter.groovy
index 88533136..4c615b74 100644
--- a/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/MatrixAvroWriter.groovy
+++ b/matrix-avro/src/main/groovy/se/alipsa/matrix/avro/MatrixAvroWriter.groovy
@@ -983,8 +983,10 @@ class MatrixAvroWriter {
     return switch (name) {
       case 'date' -> LocalDate.isInstance(v) || java.sql.Date.isInstance(v) || Number.isInstance(v)
       case 'time-millis', 'time-micros' -> LocalTime.isInstance(v) || Time.isInstance(v) || Number.isInstance(v)
-      case 'timestamp-millis', 'timestamp-micros' ->
+      case 'timestamp-millis' ->
         Instant.isInstance(v) || Date.isInstance(v) || LocalDateTime.isInstance(v) || Number.isInstance(v)
+      case 'timestamp-micros' ->
+        Instant.isInstance(v) || Date.isInstance(v) || Number.isInstance(v)
       case 'local-timestamp-millis', 'local-timestamp-micros' -> LocalDateTime.isInstance(v) || Number.isInstance(v)
       case 'uuid' -> UUID.isInstance(v) || String.isInstance(v)
       case 'decimal' -> BigDecimal.isInstance(v) || Double.isInstance(v) || Float.isInstance(v) ||
diff --git a/matrix-avro/src/test/groovy/test/alipsa/matrix/avro/MatrixAvroWriterTest.groovy b/matrix-avro/src/test/groovy/test/alipsa/matrix/avro/MatrixAvroWriterTest.groovy
index 49c658f9..2e874bc4 100644
--- a/matrix-avro/src/test/groovy/test/alipsa/matrix/avro/MatrixAvroWriterTest.groovy
+++ b/matrix-avro/src/test/groovy/test/alipsa/matrix/avro/MatrixAvroWriterTest.groovy
@@ -20,6 +20,7 @@ import se.alipsa.matrix.core.Matrix
 
 import java.nio.file.Files
 import java.nio.file.Path
+import java.lang.reflect.Method
 import java.time.LocalDateTime
 import java.time.LocalTime
 import java.time.ZoneOffset
@@ -521,6 +522,7 @@ class MatrixAvroWriterTest {
       AvroWriteOptions options = AvroWriteOptions.defaults()
         .columnSchema('ts', AvroSchemaDecl.scalar(AvroScalarTypeDecl.TIMESTAMP_MILLIS))
 
+      // This test mutates JVM-global timezone state and assumes sequential test execution.
       TimeZone.default = TimeZone.getTimeZone('America/New_York')
       byte[] newYorkBytes = MatrixAvroWriter.writeBytes(m, options)
       TimeZone.default = TimeZone.getTimeZone('Asia/Tokyo')
@@ -534,6 +536,18 @@ class MatrixAvroWriterTest {
     }
   }
 
+  @Test
+  void testLocalDateTimeCompatibilityOnlyForTimestampMillis() {
+    Schema timestampMillis = Schema.create(Schema.Type.LONG)
+    LogicalTypes.timestampMillis().addToSchema(timestampMillis)
+    Schema timestampMicros = Schema.create(Schema.Type.LONG)
+    LogicalTypes.timestampMicros().addToSchema(timestampMicros)
+    LocalDateTime value = LocalDateTime.of(2024, 1, 2, 3, 4, 5)
+
+    assertTrue(isCompatible(timestampMillis, value))
+    assertFalse(isCompatible(timestampMicros, value))
+  }
+
   @Test
   void testExplicitDecimalColumnSchemaOverridesInferenceDefaults() {
     Matrix m = Matrix.builder('ExplicitDecimal')
@@ -871,6 +885,12 @@ class MatrixAvroWriterTest {
     }
   }
 
+  private static boolean isCompatible(Schema schema, Object value) {
+    Method method = MatrixAvroWriter.getDeclaredMethod('isCompatible', Schema, Object)
+    method.accessible = true
+    method.invoke(null, schema, value) as boolean
+  }
+
   private static final class TrackingOutputStream extends ByteArrayOutputStream implements Closeable {
 
     boolean closed