Add multi-encoding support with zero-copy optimization

g-cqd · claude · g-cqd · commit 81d71e5cd728 · 2025-12-28T19:00:37.000+01:00
- Implement encoding-aware string conversion in CSVRowView - Add CSVUtilities for encoding classification and BOM detection - Support ASCII-compatible encodings (ISO-8859-1, Windows-1252, etc.) with zero-copy parsing - only string conversion uses the encoding - Auto-transcode non-ASCII encodings (UTF-16, UTF-32) to UTF-8 before parsing - Add BOM detection for UTF-8, UTF-16 LE/BE, UTF-32 LE/BE - Pass effective encoding through CSVRowDecoder to CSVKeyedDecodingContainer - Add comprehensive encoding tests (Latin-1, Windows-1252, UTF-16, BOM handling) - Update Configuration.encoding documentation with supported encodings Performance characteristics: - UTF-8 (default): Zero-copy parsing, fast path - ASCII-compatible: Zero-copy parsing, encoding used only for string conversion - UTF-16/UTF-32: Single transcode to UTF-8, then zero-copy parsing 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+#### Multi-Encoding Support
+- Full support for ASCII-compatible encodings (ISO-8859-1, Windows-1252, macOS Roman)
+- Automatic transcoding for non-ASCII-compatible encodings (UTF-16, UTF-16LE/BE, UTF-32, UTF-32LE/BE)
+- BOM (Byte Order Mark) detection for UTF-8, UTF-16, and UTF-32
+- Zero-copy parsing preserved for ASCII-compatible encodings
+- `CSVUtilities.isASCIICompatible(_:)` for encoding classification
+- `CSVUtilities.transcodeToUTF8(_:from:)` for encoding conversion
+- `CSVUtilities.detectBOM(in:)` for multi-format BOM detection
+
 #### Streaming & Memory Efficiency
 - `CSVDecoder.decode(_:from: URL)` - Stream decode from files with O(1) memory
 - `CSVDecoder.decode(_:from: Data)` - Stream decode from Data
diff --git a/Sources/CSVCoder/Core/CSVDecoder.swift b/Sources/CSVCoder/Core/CSVDecoder.swift
@@ -97,9 +97,19 @@ public final class CSVDecoder: Sendable {
 
         /// The encoding to use when reading data. Default is UTF-8.
         ///
-        /// - Note: The current implementation optimizes for UTF-8 with zero-copy parsing.
-        ///   Non-UTF-8 data should be converted to String using the appropriate encoding
-        ///   before passing to `decode(from:)`. This property is reserved for future use.
+        /// CSVCoder supports two categories of encodings:
+        ///
+        /// **ASCII-Compatible Encodings** (zero-copy parsing):
+        /// - UTF-8, ASCII, ISO-8859-1 (Latin-1), Windows-1252, macOS Roman
+        /// - These encodings use the same byte values for ASCII characters, allowing
+        ///   the parser to operate directly on raw bytes for maximum performance.
+        ///
+        /// **Non-ASCII-Compatible Encodings** (automatic transcoding):
+        /// - UTF-16, UTF-16LE, UTF-16BE, UTF-32, UTF-32LE, UTF-32BE
+        /// - These encodings are automatically transcoded to UTF-8 before parsing.
+        ///   This adds overhead but ensures correct handling of all Unicode data.
+        ///
+        /// BOM (Byte Order Mark) detection is automatic for all supported encodings.
         public var encoding: String.Encoding
 
         /// Whether to trim whitespace from field values. Default is true.
@@ -345,10 +355,33 @@ public final class CSVDecoder: Sendable {
         from data: Data,
         columnOrder: [String]?
     ) throws -> [T] {
-        return try data.withUnsafeBytes { buffer in
+        // Check if encoding requires transcoding
+        let encoding = configuration.encoding
+        let isASCIICompatible = CSVUtilities.isASCIICompatible(encoding)
+
+        // For non-ASCII encodings (UTF-16, UTF-32), transcode to UTF-8 first
+        let effectiveData: Data
+        let effectiveEncoding: String.Encoding
+
+        if !isASCIICompatible {
+            guard let transcoded = CSVUtilities.transcodeToUTF8(data, from: encoding) else {
+                throw CSVDecodingError.parsingError(
+                    "Failed to transcode data from \(encoding) to UTF-8",
+                    line: nil,
+                    column: nil
+                )
+            }
+            effectiveData = transcoded
+            effectiveEncoding = .utf8  // After transcoding, we're working with UTF-8
+        } else {
+            effectiveData = data
+            effectiveEncoding = encoding
+        }
+
+        return try effectiveData.withUnsafeBytes { buffer in
             guard let baseAddress = buffer.baseAddress else { return [] }
 
-            // Handle UTF-8 BOM
+            // Handle BOM (UTF-8 BOM for transcoded data, original BOM was handled during transcode)
             let rawBytes = UnsafeBufferPointer(
                 start: baseAddress.assumingMemoryBound(to: UInt8.self),
                 count: buffer.count
@@ -386,12 +419,12 @@ public final class CSVDecoder: Sendable {
 
             guard !rows.isEmpty else { return [] }
 
-            // Extract raw headers from first row
+            // Extract raw headers from first row using the effective encoding
             let firstRow = rows[0]
             var rawHeaders: [String] = []
             rawHeaders.reserveCapacity(firstRow.count)
             for i in 0..<firstRow.count {
-                if let s = firstRow.string(at: i) {
+                if let s = firstRow.string(at: i, encoding: effectiveEncoding) {
                     // Apply trimWhitespace to headers for consistency with parallel decoding
                     rawHeaders.append(configuration.trimWhitespace ? s.trimmingCharacters(in: .whitespaces) : s)
                 } else {
@@ -426,7 +459,8 @@ public final class CSVDecoder: Sendable {
                     headerMap: headerMap,
                     configuration: configuration,
                     codingPath: [],
-                    rowIndex: i + 1
+                    rowIndex: i + 1,
+                    encoding: effectiveEncoding
                 )
                 results.append(try T(from: decoder))
             }
diff --git a/Sources/CSVCoder/Decoder/CSVRowDecoder.swift b/Sources/CSVCoder/Decoder/CSVRowDecoder.swift
@@ -13,29 +13,33 @@ struct CSVRowDecoder: Decoder {
         case dictionary([String: String])
         case view(CSVRowView, headerMap: [String: Int])
     }
-    
+
     let source: RowSource
     let configuration: CSVDecoder.Configuration
     let codingPath: [CodingKey]
     let rowIndex: Int?
+    /// The effective encoding to use for string conversion (may differ from configuration.encoding after transcoding).
+    let encoding: String.Encoding
     var userInfo: [CodingUserInfoKey: Any] { [:] }
 
     init(row: [String: String], configuration: CSVDecoder.Configuration, codingPath: [CodingKey], rowIndex: Int? = nil) {
         self.source = .dictionary(row)
         self.configuration = configuration
         self.codingPath = codingPath
         self.rowIndex = rowIndex
+        self.encoding = .utf8  // Dictionary source uses pre-decoded strings
     }
-    
-    init(view: CSVRowView, headerMap: [String: Int], configuration: CSVDecoder.Configuration, codingPath: [CodingKey], rowIndex: Int? = nil) {
+
+    init(view: CSVRowView, headerMap: [String: Int], configuration: CSVDecoder.Configuration, codingPath: [CodingKey], rowIndex: Int? = nil, encoding: String.Encoding = .utf8) {
         self.source = .view(view, headerMap: headerMap)
         self.configuration = configuration
         self.codingPath = codingPath
         self.rowIndex = rowIndex
+        self.encoding = encoding
     }
 
     func container<Key: CodingKey>(keyedBy type: Key.Type) throws -> KeyedDecodingContainer<Key> {
-        KeyedDecodingContainer(CSVKeyedDecodingContainer(source: source, configuration: configuration, codingPath: codingPath, rowIndex: rowIndex))
+        KeyedDecodingContainer(CSVKeyedDecodingContainer(source: source, configuration: configuration, codingPath: codingPath, rowIndex: rowIndex, encoding: encoding))
     }
 
     func unkeyedContainer() throws -> UnkeyedDecodingContainer {
@@ -54,13 +58,16 @@ struct CSVKeyedDecodingContainer<Key: CodingKey>: KeyedDecodingContainerProtocol
     let codingPath: [CodingKey]
     let rowIndex: Int?
     let keyPrefix: String?
+    /// The effective encoding to use for string conversion from CSVRowView.
+    let encoding: String.Encoding
 
-    init(source: CSVRowDecoder.RowSource, configuration: CSVDecoder.Configuration, codingPath: [CodingKey], rowIndex: Int?, keyPrefix: String? = nil) {
+    init(source: CSVRowDecoder.RowSource, configuration: CSVDecoder.Configuration, codingPath: [CodingKey], rowIndex: Int?, keyPrefix: String? = nil, encoding: String.Encoding = .utf8) {
         self.source = source
         self.configuration = configuration
         self.codingPath = codingPath
         self.rowIndex = rowIndex
         self.keyPrefix = keyPrefix
+        self.encoding = encoding
     }
 
     var allKeys: [Key] {
@@ -116,8 +123,8 @@ struct CSVKeyedDecodingContainer<Key: CodingKey>: KeyedDecodingContainerProtocol
                     location: makeLocation(for: key, includeAvailableKeys: true)
                 )
             }
-            // Decode string on demand
-            guard let value = view.string(at: index) else {
+            // Decode string on demand using the effective encoding
+            guard let value = view.string(at: index, encoding: encoding) else {
                 throw CSVDecodingError.keyNotFound(
                     key.stringValue,
                     location: makeLocation(for: key, includeAvailableKeys: true)
diff --git a/Sources/CSVCoder/Parsing/CSVParser.swift b/Sources/CSVCoder/Parsing/CSVParser.swift
@@ -93,35 +93,55 @@ public struct CSVRowView {
     /// - Returns: The decoded string value, or `nil` if the index is out of bounds.
     /// - Complexity: O(1) for unquoted fields; O(n) for quoted fields with escaped quotes.
     public func string(at index: Int) -> String? {
+        string(at: index, encoding: .utf8)
+    }
+
+    /// Decodes and returns the string value for the field at the given index using the specified encoding.
+    ///
+    /// Handles RFC 4180 quote unescaping automatically:
+    /// - Quoted fields have outer quotes stripped
+    /// - Escaped quotes (`""`) are converted to single quotes (`"`)
+    ///
+    /// - Parameters:
+    ///   - index: The zero-based field index.
+    ///   - encoding: The string encoding to use for conversion. For best performance, use `.utf8`.
+    /// - Returns: The decoded string value, or `nil` if the index is out of bounds or conversion fails.
+    /// - Complexity: O(1) for unquoted UTF-8 fields; O(n) for quoted fields with escaped quotes or non-UTF-8 encodings.
+    public func string(at index: Int, encoding: String.Encoding) -> String? {
         guard index < fieldStarts.count else { return nil }
-        
+
         let start = fieldStarts[index]
         let length = fieldLengths[index]
         let isQuoted = fieldQuoted[index]
         let hasEscapedQuote = fieldHasEscapedQuote[index]
-        
+
         guard let base = buffer.baseAddress else { return nil }
-        
-        if isQuoted {
-            // Must unescape: replace "" with "
-            
-            // Optimization: if no internal escaped quotes, just strip outer quotes
-            // Note: The parser logic returns contentStart and contentLength (excluding outer quotes)
-            // So we can just create the string directly if no internal escapes!
-            if !hasEscapedQuote {
+
+        // Fast path for UTF-8 (most common case)
+        if encoding == .utf8 {
+            if isQuoted {
+                if !hasEscapedQuote {
+                    let ptr = base.advanced(by: start)
+                    return String(decoding: UnsafeBufferPointer(start: ptr, count: length), as: UTF8.self)
+                }
+                let fieldBytes = UnsafeBufferPointer(start: base.advanced(by: start), count: length)
+                let s = String(decoding: fieldBytes, as: UTF8.self)
+                return s.replacingOccurrences(of: "\"\"", with: "\"")
+            } else {
                 let ptr = base.advanced(by: start)
                 return String(decoding: UnsafeBufferPointer(start: ptr, count: length), as: UTF8.self)
             }
-            
-            // Slow path: contains escaped quotes "" -> "
-            let fieldBytes = UnsafeBufferPointer(start: base.advanced(by: start), count: length)
-            let s = String(decoding: fieldBytes, as: UTF8.self)
-            return s.replacingOccurrences(of: "\"\"", with: "\"")
-        } else {
-            // Zero-copy string creation if possible (Swift 5.x strings are fast to create from UTF8)
-            let ptr = base.advanced(by: start)
-            return String(decoding: UnsafeBufferPointer(start: ptr, count: length), as: UTF8.self)
         }
+
+        // Non-UTF-8 encoding path (ASCII-compatible encodings like ISO-8859-1, Windows-1252)
+        let ptr = base.advanced(by: start)
+        let data = Data(bytes: ptr, count: length)
+        guard let result = String(data: data, encoding: encoding) else { return nil }
+
+        if isQuoted && hasEscapedQuote {
+            return result.replacingOccurrences(of: "\"\"", with: "\"")
+        }
+        return result
     }
 }
 
diff --git a/Sources/CSVCoder/Utilities/CSVUtilities.swift b/Sources/CSVCoder/Utilities/CSVUtilities.swift
@@ -8,13 +8,25 @@
 
 import Foundation
 
-// MARK: - BOM Handling
+// MARK: - BOM Handling & Encoding Utilities
 
 /// Shared utilities for CSV operations.
 enum CSVUtilities {
     /// UTF-8 BOM bytes (EF BB BF).
     static let utf8BOM: (UInt8, UInt8, UInt8) = (0xEF, 0xBB, 0xBF)
 
+    /// UTF-16 LE BOM bytes (FF FE).
+    static let utf16LEBOM: (UInt8, UInt8) = (0xFF, 0xFE)
+
+    /// UTF-16 BE BOM bytes (FE FF).
+    static let utf16BEBOM: (UInt8, UInt8) = (0xFE, 0xFF)
+
+    /// UTF-32 LE BOM bytes (FF FE 00 00).
+    static let utf32LEBOM: (UInt8, UInt8, UInt8, UInt8) = (0xFF, 0xFE, 0x00, 0x00)
+
+    /// UTF-32 BE BOM bytes (00 00 FE FF).
+    static let utf32BEBOM: (UInt8, UInt8, UInt8, UInt8) = (0x00, 0x00, 0xFE, 0xFF)
+
     /// Returns the byte offset to skip UTF-8 BOM if present.
     /// - Parameter bytes: The buffer to check.
     /// - Returns: 3 if BOM is present, 0 otherwise.
@@ -44,6 +56,89 @@ enum CSVUtilities {
         }
         return 3
     }
+
+    /// Detects a Unicode byte order mark (BOM) at the start of `data`.
+    ///
+    /// UTF-32 is checked before UTF-16 because the UTF-32 LE BOM (FF FE 00 00) begins with the
+    /// UTF-16 LE BOM bytes (FF FE).
+    ///
+    /// - Parameter data: The data to inspect. May be a slice whose `startIndex` is not 0.
+    /// - Returns: The encoding implied by the BOM (`nil` if none was found) and the number of
+    ///   leading BOM bytes to skip (0 if none was found).
+    static func detectBOM(in data: Data) -> (encoding: String.Encoding?, offset: Int) {
+        // Copy the leading bytes into a 0-based array: a `Data` slice keeps its parent's
+        // indices, so subscripting it with literal offsets such as `data[0]` can trap.
+        let head = [UInt8](data.prefix(4))
+        guard head.count >= 2 else { return (nil, 0) }
+
+        // 4-byte BOMs (UTF-32).
+        if head.count >= 4 {
+            if (head[0], head[1], head[2], head[3]) == utf32LEBOM { return (.utf32LittleEndian, 4) }
+            if (head[0], head[1], head[2], head[3]) == utf32BEBOM { return (.utf32BigEndian, 4) }
+        }
+
+        // 3-byte BOM (UTF-8).
+        if head.count >= 3, (head[0], head[1], head[2]) == utf8BOM {
+            return (.utf8, 3)
+        }
+
+        // 2-byte BOMs (UTF-16).
+        if (head[0], head[1]) == utf16LEBOM {
+            return (.utf16LittleEndian, 2)
+        }
+        if (head[0], head[1]) == utf16BEBOM {
+            return (.utf16BigEndian, 2)
+        }
+
+        return (nil, 0)
+    }
+
+    /// Reports whether `encoding` represents the ASCII range with the same single bytes as ASCII.
+    ///
+    /// For such encodings the CSV structural characters (comma, quote, CR, LF) keep their
+    /// ASCII byte values, so raw bytes can be scanned for structure and the encoding is only
+    /// needed when a field is converted to a `String`.
+    ///
+    /// - Parameter encoding: The encoding to classify.
+    /// - Returns: `true` only for the encodings on the allow-list in the body; `false` otherwise.
+    @inline(__always)
+    static func isASCIICompatible(_ encoding: String.Encoding) -> Bool {
+        // Allow-list of encodings treated as ASCII supersets.
+        let asciiSupersets: [String.Encoding] = [
+            .utf8, .ascii,
+            .isoLatin1, .isoLatin2,
+            .windowsCP1250, .windowsCP1251, .windowsCP1252,
+            .windowsCP1253, .windowsCP1254,
+            .macOSRoman, .nextstep,
+        ]
+
+        // Anything not on the allow-list is reported as incompatible. That covers the
+        // multi-byte Unicode forms (.utf16, .utf16BigEndian, .utf16LittleEndian, .utf32,
+        // .utf32BigEndian, .utf32LittleEndian, .unicode) and, to stay on the safe side,
+        // every encoding this function does not know about.
+        return asciiSupersets.contains(encoding)
+    }
+
+    /// Converts `data` to UTF-8 by decoding it to a `String` and re-encoding that string.
+    ///
+    /// A BOM found by `detectBOM(in:)` is stripped before decoding, and the encoding it
+    /// implies is used in place of `encoding`.
+    ///
+    /// - Parameters:
+    ///   - data: The source bytes.
+    ///   - encoding: The source encoding, used when no BOM is detected.
+    /// - Returns: The UTF-8 bytes, or `nil` if the conversion fails.
+    static func transcodeToUTF8(_ data: Data, from encoding: String.Encoding) -> Data? {
+        let bom = detectBOM(in: data)
+        let sourceEncoding = bom.encoding ?? encoding
+
+        // Drop the BOM bytes (if any) and copy into a fresh `Data` before decoding.
+        let payload = Data(bom.offset > 0 ? data.dropFirst(bom.offset) : data)
+        // Decode, then re-encode as UTF-8; a failure at either step yields nil.
+        return String(data: payload, encoding: sourceEncoding).flatMap { decoded in
+            decoded.data(using: .utf8)
+        }
+    }
 }
 
 // MARK: - Field Escaping
diff --git a/Tests/CSVCoderTests/CSVDecoderBasicTests.swift b/Tests/CSVCoderTests/CSVDecoderBasicTests.swift