diff --git a/src/mysql-util/src/decoding.rs b/src/mysql-util/src/decoding.rs index 2c73bac879737..53b78be10b65b 100644 --- a/src/mysql-util/src/decoding.rs +++ b/src/mysql-util/src/decoding.rs @@ -24,6 +24,58 @@ use mz_repr::{Datum, Row, RowPacker, SqlScalarType}; use crate::desc::MySqlColumnMeta; use crate::{MySqlColumnDesc, MySqlError, MySqlTableDesc}; +/// Canonical text for the MySQL zero-date sentinel ('0000-00-00 00:00:00'). +/// In binlog MYSQL_TYPE_TIMESTAMP/MYSQL_TYPE_TIMESTAMP2 encodings, sec=0 *cannot* represent unix +/// epoch 0. The TIMESTAMP type's supported range starts at '1970-01-01 00:00:01' UTC +/// (https://dev.mysql.com/doc/refman/8.0/en/datetime.html), so any sec=0 is +/// unambiguously this sentinel. +const MYSQL_ZERO_TIMESTAMP: &str = "0000-00-00 00:00:00"; + +/// Maximum fractional-seconds precision MySQL accepts for DATETIME(p) and +/// TIMESTAMP(p) — values are stored in microseconds, so 6 digits is the +/// upper bound (https://dev.mysql.com/doc/refman/8.0/en/fractional-seconds.html). +const MYSQL_MAX_FRACTIONAL_PRECISION: u32 = 6; + +/// Format the zero-date sentinel for a column with the given fractional +/// precision by appending `precision` zero digits (not clamped to MYSQL_MAX_FRACTIONAL_PRECISION). +fn mysql_zero_timestamp(precision: u32) -> String { + if precision > 0 { + format!( + "{}.{}", + MYSQL_ZERO_TIMESTAMP, + "0".repeat(usize::cast_from(precision)) + ) + } else { + MYSQL_ZERO_TIMESTAMP.to_string() + } +} + +/// Format MySQL DATETIME/TIMESTAMP components as `YYYY-MM-DD HH:MM:SS[.ffff]`. +/// `micros` is the raw microseconds (0..1_000_000); only the leading +/// `precision` digits are kept, matching MySQL's DATETIME(p)/TIMESTAMP(p) +/// display. Digits are truncated (integer division), not rounded, and `precision` is capped at 6. +fn format_mysql_timestamp( + y: u16, + m: u8, + d: u8, + hr: u8, + min: u8, + sec: u8, + micros: u32, + precision: u32, +) -> String { + if precision == 0 { + return format!("{y:04}-{m:02}-{d:02} {hr:02}:{min:02}:{sec:02}"); + } + // Clamp defensively: MySQL itself rejects precision > 6, but upstream + // metadata is untrusted and a larger value would underflow the u32 + // subtraction that computes the `pow()` exponent below.
+ let p = precision.min(MYSQL_MAX_FRACTIONAL_PRECISION); + let scaled = micros / 10u32.pow(MYSQL_MAX_FRACTIONAL_PRECISION - p); + let width = usize::cast_from(p); + format!("{y:04}-{m:02}-{d:02} {hr:02}:{min:02}:{sec:02}.{scaled:0width$}") +} + pub fn pack_mysql_row( row_container: &mut Row, row: MySqlRow, @@ -325,34 +377,78 @@ fn pack_val_as_datum( } } Some(MySqlColumnMeta::Timestamp(precision)) => { - // Some MySQL dates are invalid in chrono/NaiveDate (e.g. 0000-00-00), so - // we need to handle them directly as strings - if let Value::Date(y, m, d, h, mm, s, ms) = value { - if *precision > 0 { - let precision: usize = (*precision).try_into()?; - packer.push(Datum::String(&format!( - "{:04}-{:02}-{:02} {:02}:{:02}:{:02}.{:0precision$}", - y, - m, - d, - h, - mm, - s, - ms, - precision = precision - ))); - } else { - packer.push(Datum::String(&format!( - "{:04}-{:02}-{:02} {:02}:{:02}:{:02}", - y, m, d, h, mm, s - ))); + // Materialize treats DATETIME and TIMESTAMP as MySqlColumnMeta::Timestamp, + // but they have slightly different semantics as far as the range of dates + // they can represent. + // (see https://dev.mysql.com/doc/refman/8.0/en/date-and-time-types.html). + // + // Three mysql_common::Value variants exist, which are mapped to + // [`MySqlColumnMeta::Timestamp`] + // (see https://github.com/blackbeam/rust_mysql_common/blob/2e6f6696de03c91b9fd95a87356d081285290704/src/binlog/value.rs): + // Value::Date — MZ snapshot & binlog MYSQL_TYPE_DATETIME/MYSQL_TYPE_DATETIME2 + // (value/mod.rs:443-445, binlog/value.rs:109-161) + // Value::Int — legacy binlog MYSQL_TYPE_TIMESTAMP, pre-5.6, + // 4-byte unix epoch (binlog/value.rs:87-90) + // Value::Bytes — binlog MYSQL_TYPE_TIMESTAMP2, 5.6+, + // decimal text "epoch" or "epoch.usec" (binlog/value.rs:145-154) + let str_timestamp = match value { + Value::Date(y, m, d, h, mm, s, ms) => { + format_mysql_timestamp(y, m, d, h, mm, s, ms, *precision) } - } else { - Err(anyhow::anyhow!( + // Pre-5.6 unix epoch, no fractional seconds.
+ // val == 0 is the zero-date sentinel, not epoch 0. + Value::Int(0) => mysql_zero_timestamp(*precision), + Value::Int(val) => chrono::DateTime::from_timestamp(val, 0) + .ok_or_else(|| { + anyhow::anyhow!("received invalid timestamp value: {}", val) + })? + .naive_utc() + .format("%Y-%m-%d %H:%M:%S") + .to_string(), + // 5.6+ epoch string; parse + reformat so all variants emit the + // same canonical YYYY-MM-DD HH:MM:SS[.ffff] text. + Value::Bytes(data) => { + let s = std::str::from_utf8(&data).map_err(|_| { + anyhow::anyhow!("received invalid timestamp value: {:?}", data) + })?; + // sec=0 (with or without fractional component) is the + // zero-date sentinel. + if s.split('.').next() == Some("0") { + mysql_zero_timestamp(*precision) + } else { + let dt = if s.contains('.') { + chrono::NaiveDateTime::parse_from_str(s, "%s%.6f") + } else { + chrono::NaiveDateTime::parse_from_str(s, "%s") + } + .map_err(|_| { + anyhow::anyhow!("received invalid timestamp value: {:?}", s) + })?; + use chrono::{Datelike, Timelike}; + let y = u16::try_from(dt.year()).map_err(|_| { + anyhow::anyhow!( + "timestamp year out of range: {}", + dt.year() + ) + })?; + format_mysql_timestamp( + y, + u8::try_from(dt.month())?, + u8::try_from(dt.day())?, + u8::try_from(dt.hour())?, + u8::try_from(dt.minute())?, + u8::try_from(dt.second())?, + dt.nanosecond() / 1000, + *precision, + ) + } + } + _ => Err(anyhow::anyhow!( "received unexpected value for timestamp type: {:?}", value - ))?; - } + ))?, + }; + packer.push(Datum::String(&str_timestamp)); } Some(MySqlColumnMeta::Bit(_)) => unreachable!("parsed as a u64"), None => { @@ -396,8 +492,8 @@ fn pack_val_as_datum( // Timestamps are encoded as different mysql_common::Value types depending on // whether they are from a binlog event or a query, and depending on which // mysql timestamp version is used. 
We handle those cases here - // https://github.com/blackbeam/rust_mysql_common/blob/v0.31.0/src/binlog/value.rs#L87-L155 - // https://github.com/blackbeam/rust_mysql_common/blob/v0.31.0/src/value/mod.rs#L332 + // https://github.com/blackbeam/rust_mysql_common/blob/v0.35.5/src/binlog/value.rs#L87-L155 + // https://github.com/blackbeam/rust_mysql_common/blob/v0.35.5/src/value/mod.rs#L332 let chrono_timestamp = match value { Value::Date(..) => from_value_opt::(value)?, // old temporal format from before MySQL 5.6; didn't support fractional seconds @@ -481,3 +577,200 @@ fn check_char_length( } Ok(()) } + +#[cfg(test)] +mod tests { + //! Unit tests for the TEXT-COLUMNS decoding of MySQL TIMESTAMP values. + //! + //! These cover the regression where a MySQL TIMESTAMP column declared as + //! a TEXT COLUMN fails to decode when the wire value arrives as + //! `Value::Bytes("epoch")` or `Value::Int(epoch)` instead + //! of `Value::Date(..)`. The integration test in + //! `test/mysql-cdc/text-columns-timestamp.td` exercises this through + //! a real MySQL container but is non-deterministic: which `Value` + //! variant `mysql-async` produces depends on connection-state timing. + //! These unit tests pin each variant down directly. + //! + //! The wire-variant matrix exercised below is derived from mysql_common + //! v0.35.5: + //! + //! * Value::Int(epoch) — binlog MYSQL_TYPE_TIMESTAMP (pre-5.6): + //! https://github.com/blackbeam/rust_mysql_common/blob/v0.35.5/src/binlog/value.rs#L87-L90 + //! * Value::Bytes("epoch"/"epoch.usec") — binlog MYSQL_TYPE_TIMESTAMP2 (5.6+): + //! https://github.com/blackbeam/rust_mysql_common/blob/v0.35.5/src/binlog/value.rs#L145-L154 + //! * Value::Date(...) — binary query response + binlog DATETIME[2]: + //! https://github.com/blackbeam/rust_mysql_common/blob/v0.35.5/src/value/mod.rs#L443-L445 + //! https://github.com/blackbeam/rust_mysql_common/blob/v0.35.5/src/binlog/value.rs#L109-L161 + //! + //!
MySQL semantics referenced by the zero-date and fractional-precision + //! cases: + //! + //! * Zero-date allowed when sql_mode disables NO_ZERO_DATE: + //! https://dev.mysql.com/doc/refman/8.0/en/sql-mode.html#sqlmode_no_zero_date + //! * TIMESTAMP(p) / DATETIME(p) fractional seconds: + //! https://dev.mysql.com/doc/refman/8.0/en/fractional-seconds.html + use super::*; + use mz_repr::{SqlColumnType, SqlScalarType}; + + fn timestamp_text_col(precision: u32) -> MySqlColumnDesc { + MySqlColumnDesc { + name: "created_at".to_string(), + column_type: Some(SqlColumnType { + scalar_type: SqlScalarType::String, + nullable: true, + }), + meta: Some(MySqlColumnMeta::Timestamp(precision)), + } + } + + fn pack_one(value: Value, col: &MySqlColumnDesc) -> Result { + let mut row = Row::default(); + pack_val_as_datum(value, col, &mut row.packer())?; + Ok(row.unpack_first().unwrap_str().to_string()) + } + + #[mz_ore::test] + fn timestamp_value_date_no_precision() { + let col = timestamp_text_col(0); + let s = pack_one(Value::Date(2024, 4, 3, 10, 15, 13, 0), &col).unwrap(); + assert_eq!(s, "2024-04-03 10:15:13"); + } + + #[mz_ore::test] + fn timestamp_value_date_with_precision() { + let col = timestamp_text_col(6); + let s = pack_one(Value::Date(2024, 4, 3, 10, 15, 13, 123456), &col).unwrap(); + assert_eq!(s, "2024-04-03 10:15:13.123456"); + } + + #[mz_ore::test] + fn timestamp_value_date_zero_date() { + // The whole reason TEXT COLUMNS exists for TIMESTAMP: a + // zero-date arriving as Value::Date(0,..) should decode to the same + // "zero" timestamp value MySQL would display. + let col = timestamp_text_col(0); + let s = pack_one(Value::Date(0, 0, 0, 0, 0, 0, 0), &col).unwrap(); + assert_eq!(s, "0000-00-00 00:00:00"); + } + + /// Regression: Value::Int (pre-5.6 legacy temporal format, unix + /// epoch seconds) was previously rejected with + /// `received unexpected value for timestamp type: Int(..)`.
+ #[mz_ore::test] + fn timestamp_value_int_epoch() { + let col = timestamp_text_col(0); + // 1743661234 == 2025-04-03 06:20:34 UTC + let s = pack_one(Value::Int(1_743_661_234), &col).unwrap(); + assert_eq!(s, "2025-04-03 06:20:34"); + } + + /// sec=0 in the legacy TIMESTAMP encoding is the zero-date sentinel, + /// not unix epoch 0 — TIMESTAMP's range starts at '1970-01-01 00:00:01' + /// UTC so epoch 0 isn't a representable column value. + #[mz_ore::test] + fn timestamp_value_int_zero_is_sentinel() { + let col = timestamp_text_col(0); + let s = pack_one(Value::Int(0), &col).unwrap(); + assert_eq!(s, "0000-00-00 00:00:00"); + } + + /// Out-of-range epochs must error rather than silently producing + /// a zero-timestamp — they aren't the MySQL zero-date marker, just + /// garbage chrono can't represent. + #[mz_ore::test] + fn timestamp_value_int_out_of_range_errors() { + let col = timestamp_text_col(0); + let err = pack_one(Value::Int(i64::MAX), &col).unwrap_err(); + assert!( + err.to_string().contains("invalid timestamp value"), + "unexpected error message: {err}" + ); + } + + /// Regression: Value::Bytes carrying a unix-epoch string is the + /// wire variant that triggered the production failure + /// received unexpected value for timestamp type: Bytes("17436613..") + #[mz_ore::test] + fn timestamp_value_bytes_epoch() { + let col = timestamp_text_col(0); + let s = pack_one(Value::Bytes(b"1743661234".to_vec()), &col).unwrap(); + assert_eq!(s, "2025-04-03 06:20:34"); + } + + /// sec=0 in the TIMESTAMP2 encoding is the zero-date sentinel; same + /// reasoning as `timestamp_value_int_zero_is_sentinel`.
+ #[mz_ore::test] + fn timestamp_value_bytes_zero_is_sentinel() { + let col = timestamp_text_col(0); + let s = pack_one(Value::Bytes(b"0".to_vec()), &col).unwrap(); + assert_eq!(s, "0000-00-00 00:00:00"); + } + + /// Sentinel detection survives a fractional component ("0.NNNNNN"), + /// and the helper pads the output to the column's precision so that + /// snapshot and binlog paths produce identical text for the same + /// upstream row. + #[mz_ore::test] + fn timestamp_value_bytes_zero_with_fractional_is_sentinel() { + let col = timestamp_text_col(6); + let s = pack_one(Value::Bytes(b"0.000000".to_vec()), &col).unwrap(); + assert_eq!(s, "0000-00-00 00:00:00.000000"); + } + + /// Fractional form of the TIMESTAMP2 binlog encoding — + /// "epoch.usec" text wrapped in Value::Bytes (binlog/value.rs:151-153). + /// Hits the `s.contains('.')` branch and the precision-aware + /// reformat. + #[mz_ore::test] + fn timestamp_value_bytes_epoch_fractional() { + let col = timestamp_text_col(6); + let s = pack_one(Value::Bytes(b"1743661234.123456".to_vec()), &col).unwrap(); + assert_eq!(s, "2025-04-03 06:20:34.123456"); + } + + /// Bytes that aren't valid UTF-8 should produce a meaningful error, + /// not a panic. + #[mz_ore::test] + fn timestamp_value_bytes_invalid_utf8_errors() { + let col = timestamp_text_col(0); + // 0xC3 0x28 is an invalid 2-byte UTF-8 sequence. + let err = pack_one(Value::Bytes(vec![0xC3, 0x28]), &col).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("invalid timestamp value"), + "unexpected error message: {msg}" + ); + } + + /// Bytes that are valid UTF-8 but not parseable as a unix epoch + /// should produce the same structured error as invalid UTF-8 — + /// covers the chrono parse failure path that + /// `timestamp_value_bytes_invalid_utf8_errors` doesn't reach.
+ #[mz_ore::test] + fn timestamp_value_bytes_unparseable_errors() { + let col = timestamp_text_col(0); + // "2024-04-03 10:15:13" is not valid because Value::Bytes must contain seconds since + // epoch, Value::Bytes("epoch"/"epoch.usec") + for payload in [&b""[..], &b"not-an-epoch"[..], &b"2024-04-03 10:15:13"[..]] { + let err = pack_one(Value::Bytes(payload.to_vec()), &col).unwrap_err(); + assert!( + err.to_string().contains("invalid timestamp value"), + "payload {payload:?}: unexpected error message: {err}" + ); + } + } + + /// Variants that have no defined mapping for a TIMESTAMP column + /// must still produce the existing structured decode error so the + /// source health surface can flag them. + #[mz_ore::test] + fn timestamp_value_unsupported_variant_errors() { + let col = timestamp_text_col(0); + let err = pack_one(Value::Float(1.0), &col).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("unexpected value for timestamp"), + "unexpected error message: {msg}" + ); + } +} diff --git a/test/mysql-cdc/30-text-columns.td b/test/mysql-cdc/30-text-columns.td index f2cc2bb91afa7..0b8e438fed6d7 100644 --- a/test/mysql-cdc/30-text-columns.td +++ b/test/mysql-cdc/30-text-columns.td @@ -23,29 +23,26 @@ $ mysql-connect name=mysql url=mysql://root@mysql password=${arg.mysql-root-password} # Insert data into MySQL that can't be decoded using native types and must be decoded -# as a TEXT COLUMN -# NOTE: We need to use `sql_mode = ''` to have MySQL allow the 0000-00-00 dates which it -# correctly disallows by default in newer versions, but used to allow in previous ones.
$ mysql-execute name=mysql DROP DATABASE IF EXISTS public; CREATE DATABASE public; USE public; -CREATE TABLE t1 (f1 JSON, f2 ENUM('small', 'medium', 'large'), f3 YEAR, f4 DATE, f5 DATE, f6 DATE, f7 DATETIME, f8 DATETIME, f9 DATETIME(4)); -SET SESSION sql_mode = ''; -INSERT INTO t1 VALUES (CAST('{"bar": "baz", "balance": 7.77, "active": false, "nest": {"birds": ["seagull", "robin"]}}' AS JSON), 'large', 2024, '0000-00-00', '2024-00-01', '2024-01-00', '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000', '0000-00-00 00:00:00.000000'); +CREATE TABLE t1 (f1 JSON, f2 ENUM('small', 'medium', 'large'), f3 YEAR); +INSERT INTO t1 VALUES (CAST('{"bar": "baz", "balance": 7.77, "active": false, "nest": {"birds": ["seagull", "robin"]}}' AS JSON), 'large', 2024); > BEGIN > CREATE SOURCE da FROM MYSQL CONNECTION mysqc; > CREATE TABLE t1 FROM SOURCE da (REFERENCE public.t1) - WITH (TEXT COLUMNS = (f1, f2, f3, f4, f5, f6, f7, f8, f9)); + WITH (TEXT COLUMNS = (f1, f2, f3)); > COMMIT # Insert the same data post-snapshot $ mysql-execute name=mysql USE public; -SET SESSION sql_mode = ''; INSERT INTO t1 SELECT * FROM t1; > SELECT f1::jsonb->>'balance' FROM t1; @@ -65,37 +62,13 @@ INSERT INTO t1 SELECT * FROM t1; "{\"bar\":\"baz\",\"nest\":{\"birds\":[\"seagull\",\"robin\"]},\"active\":false,\"balance\":7.77}" "{\"bar\":\"baz\",\"nest\":{\"birds\":[\"seagull\",\"robin\"]},\"active\":false,\"balance\":7.77}" -> SELECT f4 FROM t1; -0000-00-00 -0000-00-00 - -> SELECT f5 FROM t1; -2024-00-01 -2024-00-01 - -> SELECT f6 FROM t1; -2024-01-00 -2024-01-00 - -> SELECT f7 FROM t1; -"0000-00-00 00:00:00" -"0000-00-00 00:00:00" - -> SELECT f8 FROM t1; -"0000-00-00 00:00:00" -"0000-00-00 00:00:00" - -> SELECT f9 FROM t1; -"0000-00-00 00:00:00.0000" -"0000-00-00 00:00:00.0000" - $ set-regex match="DETAILS = '[a-f0-9]+'" replacement=
>[version<2600700] SHOW CREATE TABLE t1; -materialize.public.t1 "CREATE TABLE materialize.public.t1 (f1 pg_catalog.text, f2 pg_catalog.text, f3 pg_catalog.text, f4 pg_catalog.text, f5 pg_catalog.text, f6 pg_catalog.text, f7 pg_catalog.text, f8 pg_catalog.text, f9 pg_catalog.text) FROM SOURCE materialize.public.da (REFERENCE = public.t1) WITH (TEXT COLUMNS = (f1, f2, f3, f4, f5, f6, f7, f8, f9),
);" +materialize.public.t1 "CREATE TABLE materialize.public.t1 (f1 pg_catalog.text, f2 pg_catalog.text, f3 pg_catalog.text) FROM SOURCE materialize.public.da (REFERENCE = public.t1) WITH (TEXT COLUMNS = (f1, f2, f3),
);" >[version>=2600700] SHOW CREATE TABLE t1; -materialize.public.t1 "CREATE TABLE\n materialize.public.t1\n (\n f1 pg_catalog.text,\n f2 pg_catalog.text,\n f3 pg_catalog.text,\n f4 pg_catalog.text,\n f5 pg_catalog.text,\n f6 pg_catalog.text,\n f7 pg_catalog.text,\n f8 pg_catalog.text,\n f9 pg_catalog.text\n )\nFROM SOURCE materialize.public.da (REFERENCE = public.t1)\nWITH (\n TEXT COLUMNS = (f1, f2, f3, f4, f5, f6, f7, f8, f9),\n
\n);" +materialize.public.t1 "CREATE TABLE materialize.public.t1 (f1 pg_catalog.text, f2 pg_catalog.text, f3 pg_catalog.text)\nFROM SOURCE materialize.public.da (REFERENCE = public.t1)\nWITH (\n TEXT COLUMNS = (f1, f2, f3),\n
\n);" > DROP SOURCE da CASCADE; diff --git a/test/mysql-cdc/text-columns-date.td b/test/mysql-cdc/text-columns-date.td new file mode 100644 index 0000000000000..4bf5a8eed392c --- /dev/null +++ b/test/mysql-cdc/text-columns-date.td @@ -0,0 +1,150 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +# +# Regression test for MySQL DATE columns ingested via TEXT COLUMNS, +# with a mix of valid values and the zero-date sentinel. TEXT COLUMNS +# is the documented workaround for values that cannot be represented +# as a Materialize DATE (notably '0000-00-00'). +# +# Per https://dev.mysql.com/doc/refman/8.0/en/datetime.html: +# * DATE range: '1000-01-01' to '9999-12-31' +# +# Both the snapshot path and the binlog replication path decode DATE +# as Value::Date(y, m, d, 0, 0, 0, 0), but we exercise both paths to +# guard against future divergence (the TIMESTAMP/DATETIME path has +# multiple wire variants and previously regressed). + +> CREATE SECRET mysqlpass AS '${arg.mysql-root-password}' + +> CREATE CONNECTION mysqc TO MYSQL ( + HOST mysql, + USER root, + PASSWORD SECRET mysqlpass + ) + +$ mysql-connect name=mysql url=mysql://root@mysql password=${arg.mysql-root-password} + +# sql_mode = '' is required so MySQL accepts the zero-date that motivates +# the use of TEXT COLUMNS in the first place. +$ mysql-execute name=mysql +DROP DATABASE IF EXISTS public; +CREATE DATABASE public; +USE public; +SET SESSION sql_mode = ''; +CREATE TABLE events (id INT PRIMARY KEY, event_date DATE NULL); +START TRANSACTION; +INSERT INTO events VALUES (1, '2024-04-03'), (2, '0000-00-00'), (3, NULL); +# Boundary rows: min and max valid DATE values. 
+INSERT INTO events VALUES (4, '1000-01-01'), (5, '9999-12-31'); +# Partial-zero forms: zero month and zero day. MySQL accepts these +# under sql_mode='' and they, like the full zero-date, have no +# NaiveDate representation — so they only ingest via TEXT COLUMNS. +INSERT INTO events VALUES (11, '2024-00-01'), (12, '2024-01-00'); +COMMIT; + +> BEGIN +> CREATE SOURCE da + FROM MYSQL CONNECTION mysqc; +> CREATE TABLE events FROM SOURCE da (REFERENCE public.events) + WITH (TEXT COLUMNS = (event_date)); +> COMMIT + +# Post-snapshot rows exercise the replication / binlog decode path. +$ mysql-execute name=mysql +USE public; +SET SESSION sql_mode = ''; +START TRANSACTION; +INSERT INTO events VALUES (6, '2025-04-03'), (7, '0000-00-00'), (8, NULL); +INSERT INTO events VALUES (9, '1000-01-01'), (10, '9999-12-31'); +INSERT INTO events VALUES (13, '2024-00-01'), (14, '2024-01-00'); +COMMIT; + +> SELECT id, event_date FROM events ORDER BY id; +1 "2024-04-03" +2 "0000-00-00" +3 +4 "1000-01-01" +5 "9999-12-31" +6 "2025-04-03" +7 "0000-00-00" +8 +9 "1000-01-01" +10 "9999-12-31" +11 "2024-00-01" +12 "2024-01-00" +13 "2024-00-01" +14 "2024-01-00" + +# Verify the column type was rewritten to text by TEXT COLUMNS. +> SELECT pg_typeof(event_date) FROM events LIMIT 1; +text + +# None of the data above should have caused the source to go into a stalled state. +> SELECT name, status, error IS NULL FROM mz_internal.mz_source_statuses WHERE name IN ('da', 'events') ORDER BY name; +da running true +events running true + +> DROP SOURCE da CASCADE; + +# +# Negative path: a DATE column that is NOT declared as a TEXT COLUMN +# cannot ingest the '0000-00-00' zero-date. Materialize's DATE type +# (backed by chrono::NaiveDate) has no representation for it, so the +# source must surface a decode error. This guards the documented +# workaround: declare such columns in TEXT COLUMNS. 
+# + +$ mysql-execute name=mysql +DROP DATABASE IF EXISTS public; +CREATE DATABASE public; +USE public; +SET SESSION sql_mode = ''; +CREATE TABLE reports (id INT PRIMARY KEY, created_at DATE NULL); +INSERT INTO reports VALUES (1, '2024-04-03'), (2, '0000-00-00'); + +> BEGIN +> CREATE SOURCE da + FROM MYSQL CONNECTION mysqc; +> CREATE TABLE reports FROM SOURCE da (REFERENCE public.reports); +> COMMIT + +# Snapshot path: zero-date present at source-creation time. +! SELECT * FROM reports; +contains: error decoding value + +> DROP SOURCE da CASCADE; + +# Binlog path: source starts clean, zero-date arrives via replication. +$ mysql-execute name=mysql +DROP DATABASE IF EXISTS public; +CREATE DATABASE public; +USE public; +SET SESSION sql_mode = ''; +CREATE TABLE reports (id INT PRIMARY KEY, created_at DATE NULL); +INSERT INTO reports VALUES (1, '2024-04-03'); + +> BEGIN +> CREATE SOURCE da + FROM MYSQL CONNECTION mysqc; +> CREATE TABLE reports FROM SOURCE da (REFERENCE public.reports); +> COMMIT + +> SELECT * FROM reports; +1 "2024-04-03" + +$ mysql-execute name=mysql +USE public; +SET SESSION sql_mode = ''; +INSERT INTO reports VALUES (2, '0000-00-00'); + +! SELECT * FROM reports; +contains: error decoding value + +> DROP SOURCE da CASCADE; diff --git a/test/mysql-cdc/text-columns-timestamp.td b/test/mysql-cdc/text-columns-timestamp.td new file mode 100644 index 0000000000000..c97d9ab9d3a79 --- /dev/null +++ b/test/mysql-cdc/text-columns-timestamp.td @@ -0,0 +1,141 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. 
+ +# +# Regression test for MySQL TIMESTAMP and DATETIME columns ingested via +# TEXT COLUMNS, with a mix of valid values, zero-dates, and out-of-range +# values. TEXT COLUMNS is the documented workaround for values that +# cannot be represented as a Materialize TIMESTAMP (notably +# '0000-00-00 00:00:00' and DATETIME values outside the TIMESTAMP range). +# +# Per https://dev.mysql.com/doc/refman/8.0/en/datetime.html: +# * DATETIME range: '1000-01-01 00:00:00.000000' to '9999-12-31 23:59:59.499999' +# * TIMESTAMP range: '1970-01-01 00:00:01.000000' to '2038-01-19 03:14:07.499999' +# +# The DATETIME rows here exercise values that are valid DATETIMEs but +# outside the TIMESTAMP range (year 1001, year 9999), so they can only +# be represented as text. + +> CREATE SECRET mysqlpass AS '${arg.mysql-root-password}' + +> CREATE CONNECTION mysqc TO MYSQL ( + HOST mysql, + USER root, + PASSWORD SECRET mysqlpass + ) + +$ mysql-connect name=mysql url=mysql://root@mysql password=${arg.mysql-root-password} + +# sql_mode = '' is required so MySQL accepts the zero-date that motivates +# the use of TEXT COLUMNS in the first place. +$ mysql-execute name=mysql +DROP DATABASE IF EXISTS public; +CREATE DATABASE public; +USE public; +SET SESSION sql_mode = ''; +CREATE TABLE products (id INT PRIMARY KEY, created_at TIMESTAMP NULL, updated_at TIMESTAMP(6) NULL, archived_at DATETIME NULL, born_at DATETIME(6) NULL, mid_at DATETIME(4) NULL); +START TRANSACTION; +INSERT INTO products VALUES (1, '2024-04-03 10:15:13', '2024-04-03 10:15:13.123456', '2024-04-03 10:15:13', '2024-04-03 10:15:13.123456', '2024-04-03 10:15:13.1234'), (2, '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000', '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000', '0000-00-00 00:00:00.0000'), (3, NULL, NULL, NULL, NULL, NULL); +# Rows 7-8: DATETIME values that are outside the TIMESTAMP range +# (year 1001 is before TIMESTAMP's 1970 start; year 9999 is well beyond +# its 2038 end). 
TIMESTAMP columns must be NULL — these values can't +# be stored there at all. +INSERT INTO products VALUES (7, NULL, NULL, '1001-01-01 00:00:00', '1001-01-01 00:00:00.000001', '1001-01-01 00:00:00.0001'), (8, NULL, NULL, '9999-12-31 23:59:59', '9999-12-31 23:59:59.999999', '9999-12-31 23:59:59.9999'); +COMMIT; + +> BEGIN +> CREATE SOURCE da + FROM MYSQL CONNECTION mysqc; +> CREATE TABLE products FROM SOURCE da (REFERENCE public.products) + WITH (TEXT COLUMNS = (created_at, updated_at, archived_at, born_at, mid_at)); +> COMMIT + +# Post-snapshot rows exercise the replication / binlog decode path, +# which uses a different mysql_common::Value variant than the snapshot. +$ mysql-execute name=mysql +USE public; +SET SESSION sql_mode = ''; +START TRANSACTION; +INSERT INTO products VALUES (4, '2025-04-03 09:01:53', '2025-04-03 09:01:53.987654', '2025-04-03 09:01:53', '2025-04-03 09:01:53.987654', '2025-04-03 09:01:53.9876'), (5, '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000', '0000-00-00 00:00:00', '0000-00-00 00:00:00.000000', '0000-00-00 00:00:00.0000'), (6, NULL, NULL, NULL, NULL, NULL); +INSERT INTO products VALUES (9, NULL, NULL, '1001-01-01 00:00:00', '1001-01-01 00:00:00.000001', '1001-01-01 00:00:00.0001'), (10, NULL, NULL, '9999-12-31 23:59:59', '9999-12-31 23:59:59.999999', '9999-12-31 23:59:59.9999'); +COMMIT; + +> SELECT id, created_at FROM products ORDER BY id; +1 "2024-04-03 10:15:13" +2 "0000-00-00 00:00:00" +3 +4 "2025-04-03 09:01:53" +5 "0000-00-00 00:00:00" +6 +7 +8 +9 +10 + +> SELECT id, updated_at FROM products ORDER BY id; +1 "2024-04-03 10:15:13.123456" +2 "0000-00-00 00:00:00.000000" +3 +4 "2025-04-03 09:01:53.987654" +5 "0000-00-00 00:00:00.000000" +6 +7 +8 +9 +10 + +> SELECT id, archived_at FROM products ORDER BY id; +1 "2024-04-03 10:15:13" +2 "0000-00-00 00:00:00" +3 +4 "2025-04-03 09:01:53" +5 "0000-00-00 00:00:00" +6 +7 "1001-01-01 00:00:00" +8 "9999-12-31 23:59:59" +9 "1001-01-01 00:00:00" +10 "9999-12-31 23:59:59" + +> SELECT id, 
born_at FROM products ORDER BY id; +1 "2024-04-03 10:15:13.123456" +2 "0000-00-00 00:00:00.000000" +3 +4 "2025-04-03 09:01:53.987654" +5 "0000-00-00 00:00:00.000000" +6 +7 "1001-01-01 00:00:00.000001" +8 "9999-12-31 23:59:59.999999" +9 "1001-01-01 00:00:00.000001" +10 "9999-12-31 23:59:59.999999" + +# DATETIME(4) — mid-precision case to exercise non-{0,6} fractional widths. +> SELECT id, mid_at FROM products ORDER BY id; +1 "2024-04-03 10:15:13.1234" +2 "0000-00-00 00:00:00.0000" +3 +4 "2025-04-03 09:01:53.9876" +5 "0000-00-00 00:00:00.0000" +6 +7 "1001-01-01 00:00:00.0001" +8 "9999-12-31 23:59:59.9999" +9 "1001-01-01 00:00:00.0001" +10 "9999-12-31 23:59:59.9999" + +# Verify the column types were rewritten to text by TEXT COLUMNS. +> SELECT pg_typeof(created_at), pg_typeof(updated_at), pg_typeof(archived_at), pg_typeof(born_at), pg_typeof(mid_at) FROM products LIMIT 1; +text text text text text + +# None of the data above should have caused the source to go into a stalled state. This may be +# slight overkill as a check. We've verified that all the rows are readable in previous +# queries above. +> SELECT name, status, error IS NULL FROM mz_internal.mz_source_statuses WHERE name IN ('da', 'products') ORDER BY name; +da running true +products running true + +> DROP SOURCE da CASCADE;