From eac51eb652b963b268ef42da65fa22c2c73714f4 Mon Sep 17 00:00:00 2001 From: Marty Kulma <18468315+martykulma@users.noreply.github.com> Date: Mon, 18 May 2026 14:16:05 -0400 Subject: [PATCH 1/9] mysql-cdc: test for invalid date via TEXT COLUMN --- test/mysql-cdc/text-columns-timestamp.td | 97 ++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 test/mysql-cdc/text-columns-timestamp.td diff --git a/test/mysql-cdc/text-columns-timestamp.td b/test/mysql-cdc/text-columns-timestamp.td new file mode 100644 index 0000000000000..64ea60f15533b --- /dev/null +++ b/test/mysql-cdc/text-columns-timestamp.td @@ -0,0 +1,97 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# +# Regression test for MySQL TIMESTAMP columns ingested via TEXT COLUMNS, +# where the same upstream column contains a mix of valid timestamps and +# invalid zero-dates. TEXT COLUMNS is the documented workaround for +# columns that may hold values (e.g. '0000-00-00 00:00:00') that cannot +# be represented as a Materialize TIMESTAMP. +# +# The bug: MySQL TIMESTAMP values arrive as several mysql_common::Value +# variants depending on the protocol path (Value::Date for binlog events, +# Value::Bytes("") and Value::Int for some query/binlog +# combinations). The native TIMESTAMP decoder handles all three, but the +# TEXT COLUMNS branch in src/mysql-util/src/decoding.rs only handled +# Value::Date, so any valid TIMESTAMP routed through the Bytes/Int path +# failed with: +# +# error decoding value for '...' column '...': received unexpected +# value for timestamp type: Bytes("17436613..") +# +# and forced the source to be dropped and recreated. 
+ +> CREATE SECRET mysqlpass AS '${arg.mysql-root-password}' + +> CREATE CONNECTION mysqc TO MYSQL ( + HOST mysql, + USER root, + PASSWORD SECRET mysqlpass + ) + +$ mysql-connect name=mysql url=mysql://root@mysql password=${arg.mysql-root-password} + +# sql_mode = '' is required so MySQL accepts the zero-date that motivates +# the use of TEXT COLUMNS in the first place. +$ mysql-execute name=mysql +DROP DATABASE IF EXISTS public; +CREATE DATABASE public; +USE public; +SET SESSION sql_mode = ''; +CREATE TABLE products (id INT PRIMARY KEY, created_at TIMESTAMP NULL, updated_at TIMESTAMP(6) NULL); +START TRANSACTION; +INSERT INTO products VALUES (1, '2024-04-03 10:15:13', '2024-04-03 10:15:13.123456'), (2, '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000'), (3, NULL, NULL); +COMMIT; + +> BEGIN +> CREATE SOURCE da + FROM MYSQL CONNECTION mysqc; +> CREATE TABLE products FROM SOURCE da (REFERENCE public.products) + WITH (TEXT COLUMNS = (created_at, updated_at)); +> COMMIT + +# Post-snapshot rows exercise the replication / binlog decode path, +# which uses a different mysql_common::Value variant than the snapshot. +$ mysql-execute name=mysql +USE public; +SET SESSION sql_mode = ''; +START TRANSACTION; +INSERT INTO products VALUES (4, '2025-04-03 09:01:53', '2025-04-03 09:01:53.987654'), (5, '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000'), (6, NULL, NULL); +COMMIT; + +> SELECT id, created_at FROM products ORDER BY id; +1 "2024-04-03 10:15:13" +2 "0000-00-00 00:00:00" +3 +4 "2025-04-03 09:01:53" +5 "0000-00-00 00:00:00" +6 + +> SELECT id, updated_at FROM products ORDER BY id; +1 "2024-04-03 10:15:13.123456" +2 "0000-00-00 00:00:00.000000" +3 +4 "2025-04-03 09:01:53.987654" +5 "0000-00-00 00:00:00.000000" +6 + +# Verify the column types were rewritten to text by TEXT COLUMNS. 
+> SELECT pg_typeof(created_at), pg_typeof(updated_at) FROM products LIMIT 1; +text text + +# The decode error stalls the source even after the queries above appear to +# succeed (the snapshot rows are already in dataflow and remain queryable). +# Asserting on the source status is what makes this a hard regression check. +# We poll until either the source is fully healthy or surfaces the stall; +# this avoids a race where the binlog error has not yet been recorded. +> SELECT name, status, error IS NULL FROM mz_internal.mz_source_statuses WHERE name IN ('da', 'products') ORDER BY name; +da running true +products running true + +> DROP SOURCE da CASCADE; From a52ea9b7e3b329562ed7449f7216b0206b29a985 Mon Sep 17 00:00:00 2001 From: Marty Kulma <18468315+martykulma@users.noreply.github.com> Date: Mon, 18 May 2026 14:55:28 -0400 Subject: [PATCH 2/9] mysql: fix handling of timestamps as TEXT COLUMN --- src/mysql-util/src/decoding.rs | 232 +++++++++++++++++++++++++++++---- 1 file changed, 207 insertions(+), 25 deletions(-) diff --git a/src/mysql-util/src/decoding.rs b/src/mysql-util/src/decoding.rs index 2c73bac879737..8431404bd3271 100644 --- a/src/mysql-util/src/decoding.rs +++ b/src/mysql-util/src/decoding.rs @@ -325,34 +325,71 @@ fn pack_val_as_datum( } } Some(MySqlColumnMeta::Timestamp(precision)) => { - // Some MySQL dates are invalid in chrono/NaiveDate (e.g. 
0000-00-00), so - // we need to handle them directly as strings - if let Value::Date(y, m, d, h, mm, s, ms) = value { - if *precision > 0 { - let precision: usize = (*precision).try_into()?; - packer.push(Datum::String(&format!( - "{:04}-{:02}-{:02} {:02}:{:02}:{:02}.{:0precision$}", - y, - m, - d, - h, - mm, - s, - ms, - precision = precision - ))); - } else { - packer.push(Datum::String(&format!( - "{:04}-{:02}-{:02} {:02}:{:02}:{:02}", - y, m, d, h, mm, s - ))); + // TIMESTAMP arrives as three mysql_common::Value variants + // (refs: mysql_common v0.35.5): + // Value::Date — binary query response + binlog DATETIME[2] + // (value/mod.rs:443-445, binlog/value.rs:109-161) + // Value::Int — legacy binlog MYSQL_TYPE_TIMESTAMP, pre-5.6, + // 4-byte unix epoch (binlog/value.rs:87-90) + // Value::Bytes — binlog MYSQL_TYPE_TIMESTAMP2, 5.6+, + // "<sec>" or "<sec>.<usec>" (binlog/value.rs:145-154) + let str_timestamp = match value { + Value::Date(y, m, d, h, mm, s, ms) => { + if *precision > 0 { + let precision: usize = (*precision).try_into()?; + format!( + "{:04}-{:02}-{:02} {:02}:{:02}:{:02}.{:0precision$}", + y, + m, + d, + h, + mm, + s, + ms, + precision = precision + ) + } else { + format!( + "{:04}-{:02}-{:02} {:02}:{:02}:{:02}", + y, m, d, h, mm, s + ) + } } - } else { - Err(anyhow::anyhow!( + // Pre-5.6 unix epoch, no fractional seconds. + Value::Int(val) => chrono::DateTime::from_timestamp(val, 0) + .ok_or_else(|| { + anyhow::anyhow!("received invalid timestamp value: {}", val) + })? + .naive_utc() + .format("%Y-%m-%d %H:%M:%S") + .to_string(), + // 5.6+ epoch string; parse + reformat so all variants emit the + // same canonical YYYY-MM-DD HH:MM:SS[.ffff] text. 
+ Value::Bytes(data) => { + let s = std::str::from_utf8(&data).map_err(|_| { + anyhow::anyhow!("received invalid timestamp value: {:?}", data) + })?; + let dt = if s.contains('.') { + chrono::NaiveDateTime::parse_from_str(s, "%s%.6f") + } else { + chrono::NaiveDateTime::parse_from_str(s, "%s") + } + .map_err(|_| { + anyhow::anyhow!("received invalid timestamp value: {:?}", s) + })?; + if *precision > 0 { + let p: usize = (*precision).try_into()?; + dt.format(&format!("%Y-%m-%d %H:%M:%S.%{p}f")).to_string() + } else { + dt.format("%Y-%m-%d %H:%M:%S").to_string() + } + } + _ => Err(anyhow::anyhow!( "received unexpected value for timestamp type: {:?}", value - ))?; - } + ))?, + }; + packer.push(Datum::String(&str_timestamp)); } Some(MySqlColumnMeta::Bit(_)) => unreachable!("parsed as a u64"), None => { @@ -481,3 +518,148 @@ fn check_char_length( } Ok(()) } + +#[cfg(test)] +mod tests { + //! Unit tests for the TEXT-COLUMNS decoding of MySQL TIMESTAMP values. + //! + //! These cover the regression where a MySQL TIMESTAMP column declared as + //! a TEXT COLUMN fails to decode when the wire value arrives as + //! `Value::Bytes("<unix-epoch-string>")` or `Value::Int(<unix-epoch-seconds>)` instead + //! of `Value::Date(..)`. The integration test in + //! `test/mysql-cdc/text-columns-timestamp.td` exercises this through + //! a real MySQL container but is non-deterministic: which `Value` + //! variant `mysql-async` produces depends on connection-state timing. + //! These unit tests pin each variant down directly. + //! + //! The wire-variant matrix exercised below is derived from mysql_common + //! v0.35.5: + //! + //! * Value::Int(epoch) — binlog MYSQL_TYPE_TIMESTAMP (pre-5.6): + //! https://github.com/blackbeam/rust_mysql_common/blob/v0.35.5/src/binlog/value.rs#L87-L90 + //! * Value::Bytes("<sec>"/"<sec>.<usec>") — binlog MYSQL_TYPE_TIMESTAMP2 (5.6+): + //! https://github.com/blackbeam/rust_mysql_common/blob/v0.35.5/src/binlog/value.rs#L145-L154 + //! * Value::Date(...) — binary query response + binlog DATETIME[2]: + //! 
https://github.com/blackbeam/rust_mysql_common/blob/v0.35.5/src/value/mod.rs#L443-L445 + //! https://github.com/blackbeam/rust_mysql_common/blob/v0.35.5/src/binlog/value.rs#L109-L161 + //! + //! MySQL semantics referenced by the zero-date and fractional-precision + //! cases: + //! + //! * Zero-date allowed when sql_mode disables NO_ZERO_DATE: + //! https://dev.mysql.com/doc/refman/8.0/en/sql-mode.html#sqlmode_no_zero_date + //! * TIMESTAMP(p) / DATETIME(p) fractional seconds: + //! https://dev.mysql.com/doc/refman/8.0/en/fractional-seconds.html + use super::*; + use mz_repr::{SqlColumnType, SqlScalarType}; + + fn timestamp_text_col(precision: u32) -> MySqlColumnDesc { + MySqlColumnDesc { + name: "created_at".to_string(), + column_type: Some(SqlColumnType { + scalar_type: SqlScalarType::String, + nullable: true, + }), + meta: Some(MySqlColumnMeta::Timestamp(precision)), + } + } + + fn pack_one(value: Value, col: &MySqlColumnDesc) -> Result { + let mut row = Row::default(); + pack_val_as_datum(value, col, &mut row.packer())?; + Ok(row.unpack_first().unwrap_str().to_string()) + } + + #[mz_ore::test] + fn timestamp_value_date_no_precision() { + let col = timestamp_text_col(0); + let s = pack_one(Value::Date(2024, 4, 3, 10, 15, 13, 0), &col).unwrap(); + assert_eq!(s, "2024-04-03 10:15:13"); + } + + #[mz_ore::test] + fn timestamp_value_date_with_precision() { + let col = timestamp_text_col(6); + let s = pack_one(Value::Date(2024, 4, 3, 10, 15, 13, 123456), &col).unwrap(); + assert_eq!(s, "2024-04-03 10:15:13.123456"); + } + + #[mz_ore::test] + fn timestamp_value_date_zero_date() { + // The whole reason TEXT COLUMNS exists for TIMESTAMP: a + // zero-date arriving as Value::Date(0,..) should round-trip as + // the literal MySQL zero-timestamp string. 
+ let col = timestamp_text_col(0); + let s = pack_one(Value::Date(0, 0, 0, 0, 0, 0, 0), &col).unwrap(); + assert_eq!(s, "0000-00-00 00:00:00"); + } + + /// Regression: Value::Int (pre-5.6 legacy temporal format, unix + /// epoch seconds) was previously rejected with + /// `received unexpected value for timestamp type: Int(..)`. + #[mz_ore::test] + fn timestamp_value_int_epoch() { + let col = timestamp_text_col(0); + // 1743661234 == 2025-04-03 06:20:34 UTC + let s = pack_one(Value::Int(1_743_661_234), &col).unwrap(); + assert_eq!(s, "2025-04-03 06:20:34"); + } + + #[mz_ore::test] + fn timestamp_value_int_epoch_zero() { + // Unix epoch 0; legacy format has no fractional seconds. + let col = timestamp_text_col(0); + let s = pack_one(Value::Int(0), &col).unwrap(); + assert_eq!(s, "1970-01-01 00:00:00"); + } + + /// Regression: Value::Bytes carrying a unix-epoch string is the + /// wire variant that triggered the production failure + /// received unexpected value for timestamp type: Bytes("17436613..") + #[mz_ore::test] + fn timestamp_value_bytes_epoch() { + let col = timestamp_text_col(0); + let s = pack_one(Value::Bytes(b"1743661234".to_vec()), &col).unwrap(); + assert_eq!(s, "2025-04-03 06:20:34"); + } + + /// Regression: the zero-date can also surface as Value::Bytes("0") + /// from the binlog replication path; this was the variant the + /// local integration test triggered most often. + #[mz_ore::test] + fn timestamp_value_bytes_zero() { + let col = timestamp_text_col(0); + let s = pack_one(Value::Bytes(b"0".to_vec()), &col).unwrap(); + // Treat literal "0" as the unix epoch, matching the non-TEXT + // path's behavior at `SqlScalarType::Timestamp` above. + assert_eq!(s, "1970-01-01 00:00:00"); + } + + /// Bytes that aren't valid UTF-8 should produce a meaningful error, + /// not a panic. + #[mz_ore::test] + fn timestamp_value_bytes_invalid_utf8_errors() { + let col = timestamp_text_col(0); + // 0xC3 0x28 is an invalid 2-byte UTF-8 sequence. 
+ let err = pack_one(Value::Bytes(vec![0xC3, 0x28]), &col).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("invalid timestamp value"), + "unexpected error message: {msg}" + ); + } + + /// Variants that have no defined mapping for a TIMESTAMP column + /// must still produce the existing structured decode error so the + /// source health surface can flag them. + #[mz_ore::test] + fn timestamp_value_unsupported_variant_errors() { + let col = timestamp_text_col(0); + let err = pack_one(Value::Float(1.0), &col).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("unexpected value for timestamp"), + "unexpected error message: {msg}" + ); + } +} From 85f4ec0e2bb46034286b159fd167ce26c3f5aadc Mon Sep 17 00:00:00 2001 From: Marty Kulma <18468315+martykulma@users.noreply.github.com> Date: Mon, 18 May 2026 17:05:02 -0400 Subject: [PATCH 3/9] Some additional corner cases --- src/mysql-util/src/decoding.rs | 40 ++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/src/mysql-util/src/decoding.rs b/src/mysql-util/src/decoding.rs index 8431404bd3271..adecbc925d400 100644 --- a/src/mysql-util/src/decoding.rs +++ b/src/mysql-util/src/decoding.rs @@ -613,6 +613,19 @@ mod tests { assert_eq!(s, "1970-01-01 00:00:00"); } + /// Out-of-range epochs must error rather than silently producing + /// a zero-timestamp — they aren't the MySQL zero-date marker, just + /// garbage chrono can't represent. 
+ #[mz_ore::test] + fn timestamp_value_int_out_of_range_errors() { + let col = timestamp_text_col(0); + let err = pack_one(Value::Int(i64::MAX), &col).unwrap_err(); + assert!( + err.to_string().contains("invalid timestamp value"), + "unexpected error message: {err}" + ); + } + /// Regression: Value::Bytes carrying a unix-epoch string is the /// wire variant that triggered the production failure /// received unexpected value for timestamp type: Bytes("17436613..") @@ -635,6 +648,17 @@ mod tests { assert_eq!(s, "1970-01-01 00:00:00"); } + /// Fractional form of the TIMESTAMP2 binlog encoding — + /// "<sec>.<usec>" wrapped in Value::Bytes (binlog/value.rs:151-153). + /// Hits the `s.contains('.')` branch and the precision-aware + /// reformat. + #[mz_ore::test] + fn timestamp_value_bytes_epoch_fractional() { + let col = timestamp_text_col(6); + let s = pack_one(Value::Bytes(b"1743661234.123456".to_vec()), &col).unwrap(); + assert_eq!(s, "2025-04-03 06:20:34.123456"); + } + /// Bytes that aren't valid UTF-8 should produce a meaningful error, /// not a panic. #[mz_ore::test] @@ -649,6 +673,22 @@ mod tests { ); } + /// Bytes that are valid UTF-8 but not parseable as a unix epoch + /// should produce the same structured error as invalid UTF-8 — + /// covers the chrono parse failure path that + /// `timestamp_value_bytes_invalid_utf8_errors` doesn't reach. + #[mz_ore::test] + fn timestamp_value_bytes_unparseable_errors() { + let col = timestamp_text_col(0); + for payload in [&b""[..], &b"not-an-epoch"[..], &b"2024-04-03 10:15:13"[..]] { + let err = pack_one(Value::Bytes(payload.to_vec()), &col).unwrap_err(); + assert!( + err.to_string().contains("invalid timestamp value"), + "payload {payload:?}: unexpected error message: {err}" + ); + } + } + /// Variants that have no defined mapping for a TIMESTAMP column /// must still produce the existing structured decode error so the /// source health surface can flag them. 
From 4ee8063d3cf98e5f95dbc1e80d376fac689766cc Mon Sep 17 00:00:00 2001 From: Marty Kulma <18468315+martykulma@users.noreply.github.com> Date: Tue, 19 May 2026 10:00:01 -0400 Subject: [PATCH 4/9] Ensure snapshot and binlog decoding of timestamps agree --- src/mysql-util/src/decoding.rs | 116 +++++++++++++++++------ test/mysql-cdc/text-columns-timestamp.td | 77 ++++++++++----- 2 files changed, 144 insertions(+), 49 deletions(-) diff --git a/src/mysql-util/src/decoding.rs b/src/mysql-util/src/decoding.rs index adecbc925d400..fda34e54f838c 100644 --- a/src/mysql-util/src/decoding.rs +++ b/src/mysql-util/src/decoding.rs @@ -24,6 +24,27 @@ use mz_repr::{Datum, Row, RowPacker, SqlScalarType}; use crate::desc::MySqlColumnMeta; use crate::{MySqlColumnDesc, MySqlError, MySqlTableDesc}; +/// Canonical text for the MySQL zero-date sentinel ('0000-00-00 00:00:00'). +/// In binlog MYSQL_TYPE_TIMESTAMP/MYSQL_TYPE_TIMESTAMP2 encodings, sec=0 *cannot* represent unix +/// epoch 0. The TIMESTAMP type's supported range starts at '1970-01-01 00:00:01' UTC +/// (<https://dev.mysql.com/doc/refman/8.0/en/datetime.html>), so any sec=0 is +/// unambiguously this sentinel. +const MYSQL_ZERO_TIMESTAMP: &str = "0000-00-00 00:00:00"; + +/// Format the zero-date sentinel for a column with the given fractional +/// precision (matches the Date arm's `{:0precision$}` behavior). 
+fn mysql_zero_timestamp(precision: u32) -> String { + if precision > 0 { + format!( + "{}.{}", + MYSQL_ZERO_TIMESTAMP, + "0".repeat(usize::cast_from(precision)) + ) + } else { + MYSQL_ZERO_TIMESTAMP.to_string() + } +} + pub fn pack_mysql_row( row_container: &mut Row, row: MySqlRow, @@ -325,9 +346,15 @@ fn pack_val_as_datum( } } Some(MySqlColumnMeta::Timestamp(precision)) => { - // TIMESTAMP arrives as three mysql_common::Value variants - // (refs: mysql_common v0.35.5): - // Value::Date — binary query response + binlog DATETIME[2] + // Materialize treats DATETIME and TIMESTAMP as MySqlColumnMeta::Timestamp, + // but they have slightly different semantics as far as the range of dates + // they can represent. + // (see https://dev.mysql.com/doc/refman/8.0/en/date-and-time-types.html). + // + // Three mysql_common::Value variants exist, which are mapped to + // [`MySqlColumnMeta::Timestamp`] + // (see https://github.com/blackbeam/rust_mysql_common/blob/2e6f6696de03c91b9fd95a87356d081285290704/src/binlog/value.rs): + // Value::Date — MZ snapshot & binlog MYSQL_TYPE_DATETIME/MYSQL_TYPE_DATETIME2 // (value/mod.rs:443-445, binlog/value.rs:109-161) // Value::Int — legacy binlog MYSQL_TYPE_TIMESTAMP, pre-5.6, // 4-byte unix epoch (binlog/value.rs:87-90) @@ -356,6 +383,8 @@ fn pack_val_as_datum( } } // Pre-5.6 unix epoch, no fractional seconds. + // val == 0 is the zero-date sentinel, not epoch 0. 
+ Value::Int(0) => mysql_zero_timestamp(*precision), Value::Int(val) => chrono::DateTime::from_timestamp(val, 0) .ok_or_else(|| { anyhow::anyhow!("received invalid timestamp value: {}", val) @@ -369,19 +398,25 @@ fn pack_val_as_datum( let s = std::str::from_utf8(&data).map_err(|_| { anyhow::anyhow!("received invalid timestamp value: {:?}", data) })?; - let dt = if s.contains('.') { - chrono::NaiveDateTime::parse_from_str(s, "%s%.6f") - } else { - chrono::NaiveDateTime::parse_from_str(s, "%s") - } - .map_err(|_| { - anyhow::anyhow!("received invalid timestamp value: {:?}", s) - })?; - if *precision > 0 { - let p: usize = (*precision).try_into()?; - dt.format(&format!("%Y-%m-%d %H:%M:%S.%{p}f")).to_string() + // sec=0 (with or without fractional component) is the + // zero-date sentinel. + if s.split('.').next() == Some("0") { + mysql_zero_timestamp(*precision) } else { - dt.format("%Y-%m-%d %H:%M:%S").to_string() + let dt = if s.contains('.') { + chrono::NaiveDateTime::parse_from_str(s, "%s%.6f") + } else { + chrono::NaiveDateTime::parse_from_str(s, "%s") + } + .map_err(|_| { + anyhow::anyhow!("received invalid timestamp value: {:?}", s) + })?; + if *precision > 0 { + let p: usize = (*precision).try_into()?; + dt.format(&format!("%Y-%m-%d %H:%M:%S.%{p}f")).to_string() + } else { + dt.format("%Y-%m-%d %H:%M:%S").to_string() + } } } _ => Err(anyhow::anyhow!( @@ -433,8 +468,8 @@ fn pack_val_as_datum( // Timestamps are encoded as different mysql_common::Value types depending on // whether they are from a binlog event or a query, and depending on which // mysql timestamp version is used. 
We handle those cases here - // https://github.com/blackbeam/rust_mysql_common/blob/v0.31.0/src/binlog/value.rs#L87-L155 - // https://github.com/blackbeam/rust_mysql_common/blob/v0.31.0/src/value/mod.rs#L332 + // https://github.com/blackbeam/rust_mysql_common/blob/v0.35.5/src/binlog/value.rs#L87-L155 + // https://github.com/blackbeam/rust_mysql_common/blob/v0.35.5/src/value/mod.rs#L332 let chrono_timestamp = match value { Value::Date(..) => from_value_opt::(value)?, // old temporal format from before MySQL 5.6; didn't support fractional seconds @@ -605,12 +640,24 @@ mod tests { assert_eq!(s, "2025-04-03 06:20:34"); } + /// sec=0 in the legacy TIMESTAMP encoding is the zero-date sentinel, + /// not unix epoch 0 — TIMESTAMP's range starts at '1970-01-01 00:00:01' + /// UTC so epoch 0 isn't a representable column value. #[mz_ore::test] - fn timestamp_value_int_epoch_zero() { - // Unix epoch 0; legacy format has no fractional seconds. + fn timestamp_value_int_zero_is_sentinel() { let col = timestamp_text_col(0); let s = pack_one(Value::Int(0), &col).unwrap(); - assert_eq!(s, "1970-01-01 00:00:00"); + assert_eq!(s, "0000-00-00 00:00:00"); + } + + /// Zero-date pads to the column's fractional precision so that the + /// snapshot (Date arm) and binlog (Int/Bytes arms) produce identical + /// text for the same upstream row. + #[mz_ore::test] + fn timestamp_value_int_zero_with_precision() { + let col = timestamp_text_col(6); + let s = pack_one(Value::Int(0), &col).unwrap(); + assert_eq!(s, "0000-00-00 00:00:00.000000"); } /// Out-of-range epochs must error rather than silently producing @@ -636,16 +683,31 @@ mod tests { assert_eq!(s, "2025-04-03 06:20:34"); } - /// Regression: the zero-date can also surface as Value::Bytes("0") - /// from the binlog replication path; this was the variant the - /// local integration test triggered most often. + /// sec=0 in the TIMESTAMP2 encoding is the zero-date sentinel; same + /// reasoning as `timestamp_value_int_zero_is_sentinel`. 
#[mz_ore::test] - fn timestamp_value_bytes_zero() { + fn timestamp_value_bytes_zero_is_sentinel() { let col = timestamp_text_col(0); let s = pack_one(Value::Bytes(b"0".to_vec()), &col).unwrap(); - // Treat literal "0" as the unix epoch, matching the non-TEXT - // path's behavior at `SqlScalarType::Timestamp` above. - assert_eq!(s, "1970-01-01 00:00:00"); + assert_eq!(s, "0000-00-00 00:00:00"); + } + + #[mz_ore::test] + fn timestamp_value_bytes_zero_with_precision() { + let col = timestamp_text_col(6); + let s = pack_one(Value::Bytes(b"0".to_vec()), &col).unwrap(); + assert_eq!(s, "0000-00-00 00:00:00.000000"); + } + + /// Defensively handle a hypothetical "0.NNNNNN" form (TIMESTAMP2 + /// would only emit this if the stored microsecond component were + /// non-zero, which the upstream type doesn't actually allow for + /// sec=0, but the sentinel check should still fire). + #[mz_ore::test] + fn timestamp_value_bytes_zero_with_fractional_is_sentinel() { + let col = timestamp_text_col(6); + let s = pack_one(Value::Bytes(b"0.000000".to_vec()), &col).unwrap(); + assert_eq!(s, "0000-00-00 00:00:00.000000"); } /// Fractional form of the TIMESTAMP2 binlog encoding — diff --git a/test/mysql-cdc/text-columns-timestamp.td b/test/mysql-cdc/text-columns-timestamp.td index 64ea60f15533b..5b1a222e54119 100644 --- a/test/mysql-cdc/text-columns-timestamp.td +++ b/test/mysql-cdc/text-columns-timestamp.td @@ -8,24 +8,19 @@ # by the Apache License, Version 2.0. # -# Regression test for MySQL TIMESTAMP columns ingested via TEXT COLUMNS, -# where the same upstream column contains a mix of valid timestamps and -# invalid zero-dates. TEXT COLUMNS is the documented workaround for -# columns that may hold values (e.g. '0000-00-00 00:00:00') that cannot -# be represented as a Materialize TIMESTAMP. +# Regression test for MySQL TIMESTAMP and DATETIME columns ingested via +# TEXT COLUMNS, with a mix of valid values, zero-dates, and out-of-range +# values. 
TEXT COLUMNS is the documented workaround for values that +# cannot be represented as a Materialize TIMESTAMP (notably +# '0000-00-00 00:00:00' and DATETIME values outside the TIMESTAMP range). # -# The bug: MySQL TIMESTAMP values arrive as several mysql_common::Value -# variants depending on the protocol path (Value::Date for binlog events, -# Value::Bytes("") and Value::Int for some query/binlog -# combinations). The native TIMESTAMP decoder handles all three, but the -# TEXT COLUMNS branch in src/mysql-util/src/decoding.rs only handled -# Value::Date, so any valid TIMESTAMP routed through the Bytes/Int path -# failed with: +# Per https://dev.mysql.com/doc/refman/8.0/en/datetime.html: +# * DATETIME range: '1000-01-01 00:00:00.000000' to '9999-12-31 23:59:59.499999' +# * TIMESTAMP range: '1970-01-01 00:00:01.000000' to '2038-01-19 03:14:07.499999' # -# error decoding value for '...' column '...': received unexpected -# value for timestamp type: Bytes("17436613..") -# -# and forced the source to be dropped and recreated. +# The DATETIME rows here exercise values that are valid DATETIMEs but +# outside the TIMESTAMP range (year 1001, year 9999), so they can only +# be represented as text. 
> CREATE SECRET mysqlpass AS '${arg.mysql-root-password}' @@ -44,16 +39,21 @@ DROP DATABASE IF EXISTS public; CREATE DATABASE public; USE public; SET SESSION sql_mode = ''; -CREATE TABLE products (id INT PRIMARY KEY, created_at TIMESTAMP NULL, updated_at TIMESTAMP(6) NULL); +CREATE TABLE products (id INT PRIMARY KEY, created_at TIMESTAMP NULL, updated_at TIMESTAMP(6) NULL, archived_at DATETIME NULL, born_at DATETIME(6) NULL); START TRANSACTION; -INSERT INTO products VALUES (1, '2024-04-03 10:15:13', '2024-04-03 10:15:13.123456'), (2, '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000'), (3, NULL, NULL); +INSERT INTO products VALUES (1, '2024-04-03 10:15:13', '2024-04-03 10:15:13.123456', '2024-04-03 10:15:13', '2024-04-03 10:15:13.123456'), (2, '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000', '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000'), (3, NULL, NULL, NULL, NULL); +# Rows 7-8: DATETIME values that are outside the TIMESTAMP range +# (year 1001 is before TIMESTAMP's 1970 start; year 9999 is well beyond +# its 2038 end). TIMESTAMP columns must be NULL — these values can't +# be stored there at all. 
+INSERT INTO products VALUES (7, NULL, NULL, '1001-01-01 00:00:00', '1001-01-01 00:00:00.000001'), (8, NULL, NULL, '9999-12-31 23:59:59', '9999-12-31 23:59:59.999999'); COMMIT; > BEGIN > CREATE SOURCE da FROM MYSQL CONNECTION mysqc; > CREATE TABLE products FROM SOURCE da (REFERENCE public.products) - WITH (TEXT COLUMNS = (created_at, updated_at)); + WITH (TEXT COLUMNS = (created_at, updated_at, archived_at, born_at)); > COMMIT # Post-snapshot rows exercise the replication / binlog decode path, @@ -62,7 +62,8 @@ $ mysql-execute name=mysql USE public; SET SESSION sql_mode = ''; START TRANSACTION; -INSERT INTO products VALUES (4, '2025-04-03 09:01:53', '2025-04-03 09:01:53.987654'), (5, '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000'), (6, NULL, NULL); +INSERT INTO products VALUES (4, '2025-04-03 09:01:53', '2025-04-03 09:01:53.987654', '2025-04-03 09:01:53', '2025-04-03 09:01:53.987654'), (5, '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000', '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000'), (6, NULL, NULL, NULL, NULL); +INSERT INTO products VALUES (9, NULL, NULL, '1001-01-01 00:00:00', '1001-01-01 00:00:00.000001'), (10, NULL, NULL, '9999-12-31 23:59:59', '9999-12-31 23:59:59.999999'); COMMIT; > SELECT id, created_at FROM products ORDER BY id; @@ -72,6 +73,10 @@ COMMIT; 4 "2025-04-03 09:01:53" 5 "0000-00-00 00:00:00" 6 +7 +8 +9 +10 > SELECT id, updated_at FROM products ORDER BY id; 1 "2024-04-03 10:15:13.123456" @@ -80,10 +85,38 @@ COMMIT; 4 "2025-04-03 09:01:53.987654" 5 "0000-00-00 00:00:00.000000" 6 +7 +8 +9 +10 + +> SELECT id, archived_at FROM products ORDER BY id; +1 "2024-04-03 10:15:13" +2 "0000-00-00 00:00:00" +3 +4 "2025-04-03 09:01:53" +5 "0000-00-00 00:00:00" +6 +7 "1001-01-01 00:00:00" +8 "9999-12-31 23:59:59" +9 "1001-01-01 00:00:00" +10 "9999-12-31 23:59:59" + +> SELECT id, born_at FROM products ORDER BY id; +1 "2024-04-03 10:15:13.123456" +2 "0000-00-00 00:00:00.000000" +3 +4 "2025-04-03 09:01:53.987654" +5 "0000-00-00 00:00:00.000000" +6 +7 
"1001-01-01 00:00:00.000001" +8 "9999-12-31 23:59:59.999999" +9 "1001-01-01 00:00:00.000001" +10 "9999-12-31 23:59:59.999999" # Verify the column types were rewritten to text by TEXT COLUMNS. -> SELECT pg_typeof(created_at), pg_typeof(updated_at) FROM products LIMIT 1; -text text +> SELECT pg_typeof(created_at), pg_typeof(updated_at), pg_typeof(archived_at), pg_typeof(born_at) FROM products LIMIT 1; +text text text text # The decode error stalls the source even after the queries above appear to # succeed (the snapshot rows are already in dataflow and remain queryable). From 406d5f18f2bfc013d3b34e5049f2c772f65bbf03 Mon Sep 17 00:00:00 2001 From: Marty Kulma <18468315+martykulma@users.noreply.github.com> Date: Tue, 19 May 2026 14:02:41 -0400 Subject: [PATCH 5/9] Remove duplicate unit tests --- src/mysql-util/src/decoding.rs | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/src/mysql-util/src/decoding.rs b/src/mysql-util/src/decoding.rs index fda34e54f838c..96b7b9cebb447 100644 --- a/src/mysql-util/src/decoding.rs +++ b/src/mysql-util/src/decoding.rs @@ -650,16 +650,6 @@ mod tests { assert_eq!(s, "0000-00-00 00:00:00"); } - /// Zero-date pads to the column's fractional precision so that the - /// snapshot (Date arm) and binlog (Int/Bytes arms) produce identical - /// text for the same upstream row. - #[mz_ore::test] - fn timestamp_value_int_zero_with_precision() { - let col = timestamp_text_col(6); - let s = pack_one(Value::Int(0), &col).unwrap(); - assert_eq!(s, "0000-00-00 00:00:00.000000"); - } - /// Out-of-range epochs must error rather than silently producing /// a zero-timestamp — they aren't the MySQL zero-date marker, just /// garbage chrono can't represent. 
@@ -692,17 +682,10 @@ mod tests { assert_eq!(s, "0000-00-00 00:00:00"); } - #[mz_ore::test] - fn timestamp_value_bytes_zero_with_precision() { - let col = timestamp_text_col(6); - let s = pack_one(Value::Bytes(b"0".to_vec()), &col).unwrap(); - assert_eq!(s, "0000-00-00 00:00:00.000000"); - } - - /// Defensively handle a hypothetical "0.NNNNNN" form (TIMESTAMP2 - /// would only emit this if the stored microsecond component were - /// non-zero, which the upstream type doesn't actually allow for - /// sec=0, but the sentinel check should still fire). + /// Sentinel detection survives a fractional component ("0.NNNNNN"), + /// and the helper pads the output to the column's precision so that + /// snapshot and binlog paths produce identical text for the same + /// upstream row. #[mz_ore::test] fn timestamp_value_bytes_zero_with_fractional_is_sentinel() { let col = timestamp_text_col(6); From 1b908270d1d2aac800c4dd1ef1ffac74b8de16b8 Mon Sep 17 00:00:00 2001 From: Marty Kulma <18468315+martykulma@users.noreply.github.com> Date: Tue, 19 May 2026 21:59:49 -0400 Subject: [PATCH 6/9] Address PR comments --- src/mysql-util/src/decoding.rs | 6 ++++-- test/mysql-cdc/text-columns-timestamp.td | 8 +++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/mysql-util/src/decoding.rs b/src/mysql-util/src/decoding.rs index 96b7b9cebb447..fd1fdff749509 100644 --- a/src/mysql-util/src/decoding.rs +++ b/src/mysql-util/src/decoding.rs @@ -622,8 +622,8 @@ mod tests { #[mz_ore::test] fn timestamp_value_date_zero_date() { // The whole reason TEXT COLUMNS exists for TIMESTAMP: a - // zero-date arriving as Value::Date(0,..) should round-trip as - // the literal MySQL zero-timestamp string. + // zero-date arriving as Value::Date(0,..) should decode to the same + // "zero" timestamp value MySQL would display. 
let col = timestamp_text_col(0); let s = pack_one(Value::Date(0, 0, 0, 0, 0, 0, 0), &col).unwrap(); assert_eq!(s, "0000-00-00 00:00:00"); @@ -725,6 +725,8 @@ mod tests { #[mz_ore::test] fn timestamp_value_bytes_unparseable_errors() { let col = timestamp_text_col(0); + // "2024-04-03 10:15:13" is not valid because Value::Bytes must contain seconds since + // epoch, i.e. Value::Bytes("SECS"/"SECS.NNNNNN") for payload in [&b""[..], &b"not-an-epoch"[..], &b"2024-04-03 10:15:13"[..]] { let err = pack_one(Value::Bytes(payload.to_vec()), &col).unwrap_err(); assert!( diff --git a/test/mysql-cdc/text-columns-timestamp.td b/test/mysql-cdc/text-columns-timestamp.td index 5b1a222e54119..4023151d7895b 100644 --- a/test/mysql-cdc/text-columns-timestamp.td +++ b/test/mysql-cdc/text-columns-timestamp.td @@ -118,11 +118,9 @@ COMMIT; > SELECT pg_typeof(created_at), pg_typeof(updated_at), pg_typeof(archived_at), pg_typeof(born_at) FROM products LIMIT 1; text text text text -# The decode error stalls the source even after the queries above appear to -# succeed (the snapshot rows are already in dataflow and remain queryable). -# Asserting on the source status is what makes this a hard regression check. -# We poll until either the source is fully healthy or surfaces the stall; -# this avoids a race where the binlog error has not yet been recorded. +# None of the data above should have caused the source to go into a stalled state. This may be +# slight overkill as a check. We've verified that all the rows are readable in previous +# queries above. 
> SELECT name, status, error IS NULL FROM mz_internal.mz_source_statuses WHERE name IN ('da', 'products') ORDER BY name; da running true products running true From 4deeebd536b0bffd4a996debce2ccc0d4f3de28d Mon Sep 17 00:00:00 2001 From: Marty Kulma <18468315+martykulma@users.noreply.github.com> Date: Tue, 19 May 2026 22:34:12 -0400 Subject: [PATCH 7/9] split out and extend DATE using TEXT COLUMN tests --- test/mysql-cdc/30-text-columns.td | 27 ++--- test/mysql-cdc/text-columns-date.td | 150 ++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+), 19 deletions(-) create mode 100644 test/mysql-cdc/text-columns-date.td diff --git a/test/mysql-cdc/30-text-columns.td b/test/mysql-cdc/30-text-columns.td index f2cc2bb91afa7..33cff4e38e6e6 100644 --- a/test/mysql-cdc/30-text-columns.td +++ b/test/mysql-cdc/30-text-columns.td @@ -26,20 +26,21 @@ $ mysql-connect name=mysql url=mysql://root@mysql password=${arg.mysql-root-pass # as a TEXT COLUMN # NOTE: We need to use `sql_mode = ''` to have MySQL allow the 0000-00-00 dates which it # correctly disallows by default in newer versions, but used to allow in previous ones. +# DATE-type TEXT COLUMNS coverage lives in text-columns-date.td. 
$ mysql-execute name=mysql DROP DATABASE IF EXISTS public; CREATE DATABASE public; USE public; -CREATE TABLE t1 (f1 JSON, f2 ENUM('small', 'medium', 'large'), f3 YEAR, f4 DATE, f5 DATE, f6 DATE, f7 DATETIME, f8 DATETIME, f9 DATETIME(4)); +CREATE TABLE t1 (f1 JSON, f2 ENUM('small', 'medium', 'large'), f3 YEAR, f4 DATETIME, f5 DATETIME, f6 DATETIME(4)); SET SESSION sql_mode = ''; -INSERT INTO t1 VALUES (CAST('{"bar": "baz", "balance": 7.77, "active": false, "nest": {"birds": ["seagull", "robin"]}}' AS JSON), 'large', 2024, '0000-00-00', '2024-00-01', '2024-01-00', '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000', '0000-00-00 00:00:00.000000'); +INSERT INTO t1 VALUES (CAST('{"bar": "baz", "balance": 7.77, "active": false, "nest": {"birds": ["seagull", "robin"]}}' AS JSON), 'large', 2024, '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000', '0000-00-00 00:00:00.000000'); > BEGIN > CREATE SOURCE da FROM MYSQL CONNECTION mysqc; > CREATE TABLE t1 FROM SOURCE da (REFERENCE public.t1) - WITH (TEXT COLUMNS = (f1, f2, f3, f4, f5, f6, f7, f8, f9)); + WITH (TEXT COLUMNS = (f1, f2, f3, f4, f5, f6)); > COMMIT # Insert the same data post-snapshot @@ -66,36 +67,24 @@ INSERT INTO t1 SELECT * FROM t1; "{\"bar\":\"baz\",\"nest\":{\"birds\":[\"seagull\",\"robin\"]},\"active\":false,\"balance\":7.77}" > SELECT f4 FROM t1; -0000-00-00 -0000-00-00 - -> SELECT f5 FROM t1; -2024-00-01 -2024-00-01 - -> SELECT f6 FROM t1; -2024-01-00 -2024-01-00 - -> SELECT f7 FROM t1; "0000-00-00 00:00:00" "0000-00-00 00:00:00" -> SELECT f8 FROM t1; +> SELECT f5 FROM t1; "0000-00-00 00:00:00" "0000-00-00 00:00:00" -> SELECT f9 FROM t1; +> SELECT f6 FROM t1; "0000-00-00 00:00:00.0000" "0000-00-00 00:00:00.0000" $ set-regex match="DETAILS = '[a-f0-9]+'" replacement=
>[version<2600700] SHOW CREATE TABLE t1; -materialize.public.t1 "CREATE TABLE materialize.public.t1 (f1 pg_catalog.text, f2 pg_catalog.text, f3 pg_catalog.text, f4 pg_catalog.text, f5 pg_catalog.text, f6 pg_catalog.text, f7 pg_catalog.text, f8 pg_catalog.text, f9 pg_catalog.text) FROM SOURCE materialize.public.da (REFERENCE = public.t1) WITH (TEXT COLUMNS = (f1, f2, f3, f4, f5, f6, f7, f8, f9),
);" +materialize.public.t1 "CREATE TABLE materialize.public.t1 (f1 pg_catalog.text, f2 pg_catalog.text, f3 pg_catalog.text, f4 pg_catalog.text, f5 pg_catalog.text, f6 pg_catalog.text) FROM SOURCE materialize.public.da (REFERENCE = public.t1) WITH (TEXT COLUMNS = (f1, f2, f3, f4, f5, f6),
);" >[version>=2600700] SHOW CREATE TABLE t1; -materialize.public.t1 "CREATE TABLE\n materialize.public.t1\n (\n f1 pg_catalog.text,\n f2 pg_catalog.text,\n f3 pg_catalog.text,\n f4 pg_catalog.text,\n f5 pg_catalog.text,\n f6 pg_catalog.text,\n f7 pg_catalog.text,\n f8 pg_catalog.text,\n f9 pg_catalog.text\n )\nFROM SOURCE materialize.public.da (REFERENCE = public.t1)\nWITH (\n TEXT COLUMNS = (f1, f2, f3, f4, f5, f6, f7, f8, f9),\n
\n);" +materialize.public.t1 "CREATE TABLE\n materialize.public.t1\n (\n f1 pg_catalog.text,\n f2 pg_catalog.text,\n f3 pg_catalog.text,\n f4 pg_catalog.text,\n f5 pg_catalog.text,\n f6 pg_catalog.text\n )\nFROM SOURCE materialize.public.da (REFERENCE = public.t1)\nWITH (\n TEXT COLUMNS = (f1, f2, f3, f4, f5, f6),\n
\n);" > DROP SOURCE da CASCADE; diff --git a/test/mysql-cdc/text-columns-date.td b/test/mysql-cdc/text-columns-date.td new file mode 100644 index 0000000000000..4bf5a8eed392c --- /dev/null +++ b/test/mysql-cdc/text-columns-date.td @@ -0,0 +1,150 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# +# Regression test for MySQL DATE columns ingested via TEXT COLUMNS, +# with a mix of valid values and the zero-date sentinel. TEXT COLUMNS +# is the documented workaround for values that cannot be represented +# as a Materialize DATE (notably '0000-00-00'). +# +# Per https://dev.mysql.com/doc/refman/8.0/en/datetime.html: +# * DATE range: '1000-01-01' to '9999-12-31' +# +# Both the snapshot path and the binlog replication path decode DATE +# as Value::Date(y, m, d, 0, 0, 0, 0), but we exercise both paths to +# guard against future divergence (the TIMESTAMP/DATETIME path has +# multiple wire variants and previously regressed). + +> CREATE SECRET mysqlpass AS '${arg.mysql-root-password}' + +> CREATE CONNECTION mysqc TO MYSQL ( + HOST mysql, + USER root, + PASSWORD SECRET mysqlpass + ) + +$ mysql-connect name=mysql url=mysql://root@mysql password=${arg.mysql-root-password} + +# sql_mode = '' is required so MySQL accepts the zero-date that motivates +# the use of TEXT COLUMNS in the first place. +$ mysql-execute name=mysql +DROP DATABASE IF EXISTS public; +CREATE DATABASE public; +USE public; +SET SESSION sql_mode = ''; +CREATE TABLE events (id INT PRIMARY KEY, event_date DATE NULL); +START TRANSACTION; +INSERT INTO events VALUES (1, '2024-04-03'), (2, '0000-00-00'), (3, NULL); +# Boundary rows: min and max valid DATE values. 
+INSERT INTO events VALUES (4, '1000-01-01'), (5, '9999-12-31'); +# Partial-zero forms: zero month and zero day. MySQL accepts these +# under sql_mode='' and they, like the full zero-date, have no +# NaiveDate representation — so they only ingest via TEXT COLUMNS. +INSERT INTO events VALUES (11, '2024-00-01'), (12, '2024-01-00'); +COMMIT; + +> BEGIN +> CREATE SOURCE da + FROM MYSQL CONNECTION mysqc; +> CREATE TABLE events FROM SOURCE da (REFERENCE public.events) + WITH (TEXT COLUMNS = (event_date)); +> COMMIT + +# Post-snapshot rows exercise the replication / binlog decode path. +$ mysql-execute name=mysql +USE public; +SET SESSION sql_mode = ''; +START TRANSACTION; +INSERT INTO events VALUES (6, '2025-04-03'), (7, '0000-00-00'), (8, NULL); +INSERT INTO events VALUES (9, '1000-01-01'), (10, '9999-12-31'); +INSERT INTO events VALUES (13, '2024-00-01'), (14, '2024-01-00'); +COMMIT; + +> SELECT id, event_date FROM events ORDER BY id; +1 "2024-04-03" +2 "0000-00-00" +3 +4 "1000-01-01" +5 "9999-12-31" +6 "2025-04-03" +7 "0000-00-00" +8 +9 "1000-01-01" +10 "9999-12-31" +11 "2024-00-01" +12 "2024-01-00" +13 "2024-00-01" +14 "2024-01-00" + +# Verify the column type was rewritten to text by TEXT COLUMNS. +> SELECT pg_typeof(event_date) FROM events LIMIT 1; +text + +# None of the data above should have caused the source to go into a stalled state. +> SELECT name, status, error IS NULL FROM mz_internal.mz_source_statuses WHERE name IN ('da', 'events') ORDER BY name; +da running true +events running true + +> DROP SOURCE da CASCADE; + +# +# Negative path: a DATE column that is NOT declared as a TEXT COLUMN +# cannot ingest the '0000-00-00' zero-date. Materialize's DATE type +# (backed by chrono::NaiveDate) has no representation for it, so the +# source must surface a decode error. This guards the documented +# workaround: declare such columns in TEXT COLUMNS. 
+# + +$ mysql-execute name=mysql +DROP DATABASE IF EXISTS public; +CREATE DATABASE public; +USE public; +SET SESSION sql_mode = ''; +CREATE TABLE reports (id INT PRIMARY KEY, created_at DATE NULL); +INSERT INTO reports VALUES (1, '2024-04-03'), (2, '0000-00-00'); + +> BEGIN +> CREATE SOURCE da + FROM MYSQL CONNECTION mysqc; +> CREATE TABLE reports FROM SOURCE da (REFERENCE public.reports); +> COMMIT + +# Snapshot path: zero-date present at source-creation time. +! SELECT * FROM reports; +contains: error decoding value + +> DROP SOURCE da CASCADE; + +# Binlog path: source starts clean, zero-date arrives via replication. +$ mysql-execute name=mysql +DROP DATABASE IF EXISTS public; +CREATE DATABASE public; +USE public; +SET SESSION sql_mode = ''; +CREATE TABLE reports (id INT PRIMARY KEY, created_at DATE NULL); +INSERT INTO reports VALUES (1, '2024-04-03'); + +> BEGIN +> CREATE SOURCE da + FROM MYSQL CONNECTION mysqc; +> CREATE TABLE reports FROM SOURCE da (REFERENCE public.reports); +> COMMIT + +> SELECT * FROM reports; +1 "2024-04-03" + +$ mysql-execute name=mysql +USE public; +SET SESSION sql_mode = ''; +INSERT INTO reports VALUES (2, '0000-00-00'); + +! 
SELECT * FROM reports; +contains: error decoding value + +> DROP SOURCE da CASCADE; From 712ddbb248259d286bd654b73eeb375a02ac1855 Mon Sep 17 00:00:00 2001 From: Marty Kulma <18468315+martykulma@users.noreply.github.com> Date: Tue, 19 May 2026 22:35:36 -0400 Subject: [PATCH 8/9] consolidate DATETIME/TIMESTAMP with TEXT COLUMNS tests --- test/mysql-cdc/30-text-columns.td | 30 ++++++------------------ test/mysql-cdc/text-columns-timestamp.td | 29 ++++++++++++++++------- 2 files changed, 28 insertions(+), 31 deletions(-) diff --git a/test/mysql-cdc/30-text-columns.td b/test/mysql-cdc/30-text-columns.td index 33cff4e38e6e6..0b8e438fed6d7 100644 --- a/test/mysql-cdc/30-text-columns.td +++ b/test/mysql-cdc/30-text-columns.td @@ -23,30 +23,26 @@ $ mysql-connect name=mysql url=mysql://root@mysql password=${arg.mysql-root-password} # Insert data into MySQL that can't be decoded using native types and must be decoded -# as a TEXT COLUMN -# NOTE: We need to use `sql_mode = ''` to have MySQL allow the 0000-00-00 dates which it -# correctly disallows by default in newer versions, but used to allow in previous ones. -# DATE-type TEXT COLUMNS coverage lives in text-columns-date.td. +# as a TEXT COLUMN. DATE-type coverage lives in text-columns-date.td; +# TIMESTAMP/DATETIME coverage lives in text-columns-timestamp.td. 
$ mysql-execute name=mysql DROP DATABASE IF EXISTS public; CREATE DATABASE public; USE public; -CREATE TABLE t1 (f1 JSON, f2 ENUM('small', 'medium', 'large'), f3 YEAR, f4 DATETIME, f5 DATETIME, f6 DATETIME(4)); -SET SESSION sql_mode = ''; -INSERT INTO t1 VALUES (CAST('{"bar": "baz", "balance": 7.77, "active": false, "nest": {"birds": ["seagull", "robin"]}}' AS JSON), 'large', 2024, '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000', '0000-00-00 00:00:00.000000'); +CREATE TABLE t1 (f1 JSON, f2 ENUM('small', 'medium', 'large'), f3 YEAR); +INSERT INTO t1 VALUES (CAST('{"bar": "baz", "balance": 7.77, "active": false, "nest": {"birds": ["seagull", "robin"]}}' AS JSON), 'large', 2024); > BEGIN > CREATE SOURCE da FROM MYSQL CONNECTION mysqc; > CREATE TABLE t1 FROM SOURCE da (REFERENCE public.t1) - WITH (TEXT COLUMNS = (f1, f2, f3, f4, f5, f6)); + WITH (TEXT COLUMNS = (f1, f2, f3)); > COMMIT # Insert the same data post-snapshot $ mysql-execute name=mysql USE public; -SET SESSION sql_mode = ''; INSERT INTO t1 SELECT * FROM t1; > SELECT f1::jsonb->>'balance' FROM t1; @@ -66,25 +62,13 @@ INSERT INTO t1 SELECT * FROM t1; "{\"bar\":\"baz\",\"nest\":{\"birds\":[\"seagull\",\"robin\"]},\"active\":false,\"balance\":7.77}" "{\"bar\":\"baz\",\"nest\":{\"birds\":[\"seagull\",\"robin\"]},\"active\":false,\"balance\":7.77}" -> SELECT f4 FROM t1; -"0000-00-00 00:00:00" -"0000-00-00 00:00:00" - -> SELECT f5 FROM t1; -"0000-00-00 00:00:00" -"0000-00-00 00:00:00" - -> SELECT f6 FROM t1; -"0000-00-00 00:00:00.0000" -"0000-00-00 00:00:00.0000" - $ set-regex match="DETAILS = '[a-f0-9]+'" replacement=
>[version<2600700] SHOW CREATE TABLE t1; -materialize.public.t1 "CREATE TABLE materialize.public.t1 (f1 pg_catalog.text, f2 pg_catalog.text, f3 pg_catalog.text, f4 pg_catalog.text, f5 pg_catalog.text, f6 pg_catalog.text) FROM SOURCE materialize.public.da (REFERENCE = public.t1) WITH (TEXT COLUMNS = (f1, f2, f3, f4, f5, f6),
);" +materialize.public.t1 "CREATE TABLE materialize.public.t1 (f1 pg_catalog.text, f2 pg_catalog.text, f3 pg_catalog.text) FROM SOURCE materialize.public.da (REFERENCE = public.t1) WITH (TEXT COLUMNS = (f1, f2, f3),
);" >[version>=2600700] SHOW CREATE TABLE t1; -materialize.public.t1 "CREATE TABLE\n materialize.public.t1\n (\n f1 pg_catalog.text,\n f2 pg_catalog.text,\n f3 pg_catalog.text,\n f4 pg_catalog.text,\n f5 pg_catalog.text,\n f6 pg_catalog.text\n )\nFROM SOURCE materialize.public.da (REFERENCE = public.t1)\nWITH (\n TEXT COLUMNS = (f1, f2, f3, f4, f5, f6),\n
\n);" +materialize.public.t1 "CREATE TABLE materialize.public.t1 (f1 pg_catalog.text, f2 pg_catalog.text, f3 pg_catalog.text)\nFROM SOURCE materialize.public.da (REFERENCE = public.t1)\nWITH (\n TEXT COLUMNS = (f1, f2, f3),\n
\n);" > DROP SOURCE da CASCADE; diff --git a/test/mysql-cdc/text-columns-timestamp.td b/test/mysql-cdc/text-columns-timestamp.td index 4023151d7895b..c97d9ab9d3a79 100644 --- a/test/mysql-cdc/text-columns-timestamp.td +++ b/test/mysql-cdc/text-columns-timestamp.td @@ -39,21 +39,21 @@ DROP DATABASE IF EXISTS public; CREATE DATABASE public; USE public; SET SESSION sql_mode = ''; -CREATE TABLE products (id INT PRIMARY KEY, created_at TIMESTAMP NULL, updated_at TIMESTAMP(6) NULL, archived_at DATETIME NULL, born_at DATETIME(6) NULL); +CREATE TABLE products (id INT PRIMARY KEY, created_at TIMESTAMP NULL, updated_at TIMESTAMP(6) NULL, archived_at DATETIME NULL, born_at DATETIME(6) NULL, mid_at DATETIME(4) NULL); START TRANSACTION; -INSERT INTO products VALUES (1, '2024-04-03 10:15:13', '2024-04-03 10:15:13.123456', '2024-04-03 10:15:13', '2024-04-03 10:15:13.123456'), (2, '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000', '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000'), (3, NULL, NULL, NULL, NULL); +INSERT INTO products VALUES (1, '2024-04-03 10:15:13', '2024-04-03 10:15:13.123456', '2024-04-03 10:15:13', '2024-04-03 10:15:13.123456', '2024-04-03 10:15:13.1234'), (2, '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000', '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000', '0000-00-00 00:00:00.0000'), (3, NULL, NULL, NULL, NULL, NULL); # Rows 7-8: DATETIME values that are outside the TIMESTAMP range # (year 1001 is before TIMESTAMP's 1970 start; year 9999 is well beyond # its 2038 end). TIMESTAMP columns must be NULL — these values can't # be stored there at all. 
-INSERT INTO products VALUES (7, NULL, NULL, '1001-01-01 00:00:00', '1001-01-01 00:00:00.000001'), (8, NULL, NULL, '9999-12-31 23:59:59', '9999-12-31 23:59:59.999999'); +INSERT INTO products VALUES (7, NULL, NULL, '1001-01-01 00:00:00', '1001-01-01 00:00:00.000001', '1001-01-01 00:00:00.0001'), (8, NULL, NULL, '9999-12-31 23:59:59', '9999-12-31 23:59:59.999999', '9999-12-31 23:59:59.9999'); COMMIT; > BEGIN > CREATE SOURCE da FROM MYSQL CONNECTION mysqc; > CREATE TABLE products FROM SOURCE da (REFERENCE public.products) - WITH (TEXT COLUMNS = (created_at, updated_at, archived_at, born_at)); + WITH (TEXT COLUMNS = (created_at, updated_at, archived_at, born_at, mid_at)); > COMMIT # Post-snapshot rows exercise the replication / binlog decode path, @@ -62,8 +62,8 @@ $ mysql-execute name=mysql USE public; SET SESSION sql_mode = ''; START TRANSACTION; -INSERT INTO products VALUES (4, '2025-04-03 09:01:53', '2025-04-03 09:01:53.987654', '2025-04-03 09:01:53', '2025-04-03 09:01:53.987654'), (5, '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000', '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000'), (6, NULL, NULL, NULL, NULL); -INSERT INTO products VALUES (9, NULL, NULL, '1001-01-01 00:00:00', '1001-01-01 00:00:00.000001'), (10, NULL, NULL, '9999-12-31 23:59:59', '9999-12-31 23:59:59.999999'); +INSERT INTO products VALUES (4, '2025-04-03 09:01:53', '2025-04-03 09:01:53.987654', '2025-04-03 09:01:53', '2025-04-03 09:01:53.987654', '2025-04-03 09:01:53.9876'), (5, '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000', '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000', '0000-00-00 00:00:00.0000'), (6, NULL, NULL, NULL, NULL, NULL); +INSERT INTO products VALUES (9, NULL, NULL, '1001-01-01 00:00:00', '1001-01-01 00:00:00.000001', '1001-01-01 00:00:00.0001'), (10, NULL, NULL, '9999-12-31 23:59:59', '9999-12-31 23:59:59.999999', '9999-12-31 23:59:59.9999'); COMMIT; > SELECT id, created_at FROM products ORDER BY id; @@ -114,9 +114,22 @@ COMMIT; 9 "1001-01-01 00:00:00.000001" 10 
"9999-12-31 23:59:59.999999" +# DATETIME(4) — mid-precision case to exercise non-{0,6} fractional widths. +> SELECT id, mid_at FROM products ORDER BY id; +1 "2024-04-03 10:15:13.1234" +2 "0000-00-00 00:00:00.0000" +3 +4 "2025-04-03 09:01:53.9876" +5 "0000-00-00 00:00:00.0000" +6 +7 "1001-01-01 00:00:00.0001" +8 "9999-12-31 23:59:59.9999" +9 "1001-01-01 00:00:00.0001" +10 "9999-12-31 23:59:59.9999" + # Verify the column types were rewritten to text by TEXT COLUMNS. -> SELECT pg_typeof(created_at), pg_typeof(updated_at), pg_typeof(archived_at), pg_typeof(born_at) FROM products LIMIT 1; -text text text text +> SELECT pg_typeof(created_at), pg_typeof(updated_at), pg_typeof(archived_at), pg_typeof(born_at), pg_typeof(mid_at) FROM products LIMIT 1; +text text text text text # None of the data above should have caused the source to go into a stalled state. This may be # slight overkill as a check. We've verified that all the rows are readable in previous From 95e8ec81e85d3300da95be7e4416d4492efb73d5 Mon Sep 17 00:00:00 2001 From: Marty Kulma <18468315+martykulma@users.noreply.github.com> Date: Tue, 19 May 2026 22:36:43 -0400 Subject: [PATCH 9/9] Fix microsecond truncation bug for DATETIME/TIMESTAMP with precision 1-5 --- src/mysql-util/src/decoding.rs | 74 ++++++++++++++++++++++------------ 1 file changed, 49 insertions(+), 25 deletions(-) diff --git a/src/mysql-util/src/decoding.rs b/src/mysql-util/src/decoding.rs index fd1fdff749509..53b78be10b65b 100644 --- a/src/mysql-util/src/decoding.rs +++ b/src/mysql-util/src/decoding.rs @@ -31,6 +31,11 @@ use crate::{MySqlColumnDesc, MySqlError, MySqlTableDesc}; /// unambiguously this sentinel. const MYSQL_ZERO_TIMESTAMP: &str = "0000-00-00 00:00:00"; +/// Maximum fractional-seconds precision MySQL accepts for DATETIME(p) and +/// TIMESTAMP(p) — values are stored in microseconds, so 6 digits is the +/// upper bound (see "Fractional Seconds in Time Values" in the MySQL Reference Manual). 
+const MYSQL_MAX_FRACTIONAL_PRECISION: u32 = 6; + /// Format the zero-date sentinel for a column with the given fractional /// precision (matches the Date arm's `{:0precision$}` behavior). fn mysql_zero_timestamp(precision: u32) -> String { @@ -45,6 +50,32 @@ fn mysql_zero_timestamp(precision: u32) -> String { } } +/// Format MySQL DATETIME/TIMESTAMP components as `YYYY-MM-DD HH:MM:SS[.ffff]`. +/// `micros` is the raw microseconds (0..1_000_000); only the leading +/// `precision` digits are kept, matching MySQL's DATETIME(p)/TIMESTAMP(p) +/// display. +fn format_mysql_timestamp( + y: u16, + m: u8, + d: u8, + hr: u8, + min: u8, + sec: u8, + micros: u32, + precision: u32, +) -> String { + if precision == 0 { + return format!("{y:04}-{m:02}-{d:02} {hr:02}:{min:02}:{sec:02}"); + } + // Clamp defensively: MySQL itself rejects precision > 6, but upstream + // metadata is untrusted and a larger value would make `pow()` below + // overflow its u32 exponent. + let p = precision.min(MYSQL_MAX_FRACTIONAL_PRECISION); + let scaled = micros / 10u32.pow(MYSQL_MAX_FRACTIONAL_PRECISION - p); + let width = usize::cast_from(p); + format!("{y:04}-{m:02}-{d:02} {hr:02}:{min:02}:{sec:02}.{scaled:0width$}") +} + pub fn pack_mysql_row( row_container: &mut Row, row: MySqlRow, @@ -362,25 +393,7 @@ fn pack_val_as_datum( // "" or "." (binlog/value.rs:145-154) let str_timestamp = match value { Value::Date(y, m, d, h, mm, s, ms) => { - if *precision > 0 { - let precision: usize = (*precision).try_into()?; - format!( - "{:04}-{:02}-{:02} {:02}:{:02}:{:02}.{:0precision$}", - y, - m, - d, - h, - mm, - s, - ms, - precision = precision - ) - } else { - format!( - "{:04}-{:02}-{:02} {:02}:{:02}:{:02}", - y, m, d, h, mm, s - ) - } + format_mysql_timestamp(y, m, d, h, mm, s, ms, *precision) } // Pre-5.6 unix epoch, no fractional seconds. // val == 0 is the zero-date sentinel, not epoch 0. 
@@ -411,12 +424,23 @@ fn pack_val_as_datum( .map_err(|_| { anyhow::anyhow!("received invalid timestamp value: {:?}", s) })?; - if *precision > 0 { - let p: usize = (*precision).try_into()?; - dt.format(&format!("%Y-%m-%d %H:%M:%S.%{p}f")).to_string() - } else { - dt.format("%Y-%m-%d %H:%M:%S").to_string() - } + use chrono::{Datelike, Timelike}; + let y = u16::try_from(dt.year()).map_err(|_| { + anyhow::anyhow!( + "timestamp year out of range: {}", + dt.year() + ) + })?; + format_mysql_timestamp( + y, + u8::try_from(dt.month())?, + u8::try_from(dt.day())?, + u8::try_from(dt.hour())?, + u8::try_from(dt.minute())?, + u8::try_from(dt.second())?, + dt.nanosecond() / 1000, + *precision, + ) } } _ => Err(anyhow::anyhow!(