Skip to content

Commit 81d71e5

Browse files
g-cqd and claude committed
Add multi-encoding support with zero-copy optimization
- Implement encoding-aware string conversion in CSVRowView
- Add CSVUtilities for encoding classification and BOM detection
- Support ASCII-compatible encodings (ISO-8859-1, Windows-1252, etc.) with zero-copy parsing - only string conversion uses the encoding
- Auto-transcode non-ASCII encodings (UTF-16, UTF-32) to UTF-8 before parsing
- Add BOM detection for UTF-8, UTF-16 LE/BE, UTF-32 LE/BE
- Pass effective encoding through CSVRowDecoder to CSVKeyedDecodingContainer
- Add comprehensive encoding tests (Latin-1, Windows-1252, UTF-16, BOM handling)
- Update Configuration.encoding documentation with supported encodings

Performance characteristics:
- UTF-8 (default): Zero-copy parsing, fast path
- ASCII-compatible: Zero-copy parsing, encoding used only for string conversion
- UTF-16/UTF-32: Single transcode to UTF-8, then zero-copy parsing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent bf7d898 commit 81d71e5

6 files changed

Lines changed: 319 additions & 35 deletions

File tree

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
99

1010
### Added
1111

12+
#### Multi-Encoding Support
13+
- Full support for ASCII-compatible encodings (ISO-8859-1, Windows-1252, macOS Roman)
14+
- Automatic transcoding for non-ASCII-compatible encodings (UTF-16, UTF-16LE/BE, UTF-32, UTF-32LE/BE)
15+
- BOM (Byte Order Mark) detection for UTF-8, UTF-16, and UTF-32
16+
- Zero-copy parsing preserved for ASCII-compatible encodings
17+
- `CSVUtilities.isASCIICompatible(_:)` for encoding classification
18+
- `CSVUtilities.transcodeToUTF8(_:from:)` for encoding conversion
19+
- `CSVUtilities.detectBOM(in:)` for multi-format BOM detection
20+
1221
#### Streaming & Memory Efficiency
1322
- `CSVDecoder.decode(_:from: URL)` - Stream decode from files with O(1) memory
1423
- `CSVDecoder.decode(_:from: Data)` - Stream decode from Data

Sources/CSVCoder/Core/CSVDecoder.swift

Lines changed: 42 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,19 @@ public final class CSVDecoder: Sendable {
9797

9898
/// The encoding to use when reading data. Default is UTF-8.
9999
///
100-
/// - Note: The current implementation optimizes for UTF-8 with zero-copy parsing.
101-
/// Non-UTF-8 data should be converted to String using the appropriate encoding
102-
/// before passing to `decode(from:)`. This property is reserved for future use.
100+
/// CSVCoder supports two categories of encodings:
101+
///
102+
/// **ASCII-Compatible Encodings** (zero-copy parsing):
103+
/// - UTF-8, ASCII, ISO-8859-1 (Latin-1), Windows-1252, macOS Roman
104+
/// - These encodings use the same byte values for ASCII characters, allowing
105+
/// the parser to operate directly on raw bytes for maximum performance.
106+
///
107+
/// **Non-ASCII-Compatible Encodings** (automatic transcoding):
108+
/// - UTF-16, UTF-16LE, UTF-16BE, UTF-32, UTF-32LE, UTF-32BE
109+
/// - These encodings are automatically transcoded to UTF-8 before parsing.
110+
/// This adds overhead but ensures correct handling of all Unicode data.
111+
///
112+
/// A leading UTF-8 BOM (Byte Order Mark) is skipped automatically; for UTF-16 and UTF-32
/// input, the BOM is detected and stripped while transcoding to UTF-8.
103113
public var encoding: String.Encoding
104114

105115
/// Whether to trim whitespace from field values. Default is true.
@@ -345,10 +355,33 @@ public final class CSVDecoder: Sendable {
345355
from data: Data,
346356
columnOrder: [String]?
347357
) throws -> [T] {
348-
return try data.withUnsafeBytes { buffer in
358+
// Check if encoding requires transcoding
359+
let encoding = configuration.encoding
360+
let isASCIICompatible = CSVUtilities.isASCIICompatible(encoding)
361+
362+
// For non-ASCII encodings (UTF-16, UTF-32), transcode to UTF-8 first
363+
let effectiveData: Data
364+
let effectiveEncoding: String.Encoding
365+
366+
if !isASCIICompatible {
367+
guard let transcoded = CSVUtilities.transcodeToUTF8(data, from: encoding) else {
368+
throw CSVDecodingError.parsingError(
369+
"Failed to transcode data from \(encoding) to UTF-8",
370+
line: nil,
371+
column: nil
372+
)
373+
}
374+
effectiveData = transcoded
375+
effectiveEncoding = .utf8 // After transcoding, we're working with UTF-8
376+
} else {
377+
effectiveData = data
378+
effectiveEncoding = encoding
379+
}
380+
381+
return try effectiveData.withUnsafeBytes { buffer in
349382
guard let baseAddress = buffer.baseAddress else { return [] }
350383

351-
// Handle UTF-8 BOM
384+
// Handle BOM (UTF-8 BOM for transcoded data, original BOM was handled during transcode)
352385
let rawBytes = UnsafeBufferPointer(
353386
start: baseAddress.assumingMemoryBound(to: UInt8.self),
354387
count: buffer.count
@@ -386,12 +419,12 @@ public final class CSVDecoder: Sendable {
386419

387420
guard !rows.isEmpty else { return [] }
388421

389-
// Extract raw headers from first row
422+
// Extract raw headers from first row using the effective encoding
390423
let firstRow = rows[0]
391424
var rawHeaders: [String] = []
392425
rawHeaders.reserveCapacity(firstRow.count)
393426
for i in 0..<firstRow.count {
394-
if let s = firstRow.string(at: i) {
427+
if let s = firstRow.string(at: i, encoding: effectiveEncoding) {
395428
// Apply trimWhitespace to headers for consistency with parallel decoding
396429
rawHeaders.append(configuration.trimWhitespace ? s.trimmingCharacters(in: .whitespaces) : s)
397430
} else {
@@ -426,7 +459,8 @@ public final class CSVDecoder: Sendable {
426459
headerMap: headerMap,
427460
configuration: configuration,
428461
codingPath: [],
429-
rowIndex: i + 1
462+
rowIndex: i + 1,
463+
encoding: effectiveEncoding
430464
)
431465
results.append(try T(from: decoder))
432466
}

Sources/CSVCoder/Decoder/CSVRowDecoder.swift

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,29 +13,33 @@ struct CSVRowDecoder: Decoder {
1313
case dictionary([String: String])
1414
case view(CSVRowView, headerMap: [String: Int])
1515
}
16-
16+
1717
let source: RowSource
1818
let configuration: CSVDecoder.Configuration
1919
let codingPath: [CodingKey]
2020
let rowIndex: Int?
21+
/// The effective encoding to use for string conversion (may differ from configuration.encoding after transcoding).
22+
let encoding: String.Encoding
2123
var userInfo: [CodingUserInfoKey: Any] { [:] }
2224

2325
init(row: [String: String], configuration: CSVDecoder.Configuration, codingPath: [CodingKey], rowIndex: Int? = nil) {
2426
self.source = .dictionary(row)
2527
self.configuration = configuration
2628
self.codingPath = codingPath
2729
self.rowIndex = rowIndex
30+
self.encoding = .utf8 // Dictionary source uses pre-decoded strings
2831
}
29-
30-
init(view: CSVRowView, headerMap: [String: Int], configuration: CSVDecoder.Configuration, codingPath: [CodingKey], rowIndex: Int? = nil) {
32+
33+
init(view: CSVRowView, headerMap: [String: Int], configuration: CSVDecoder.Configuration, codingPath: [CodingKey], rowIndex: Int? = nil, encoding: String.Encoding = .utf8) {
3134
self.source = .view(view, headerMap: headerMap)
3235
self.configuration = configuration
3336
self.codingPath = codingPath
3437
self.rowIndex = rowIndex
38+
self.encoding = encoding
3539
}
3640

3741
func container<Key: CodingKey>(keyedBy type: Key.Type) throws -> KeyedDecodingContainer<Key> {
38-
KeyedDecodingContainer(CSVKeyedDecodingContainer(source: source, configuration: configuration, codingPath: codingPath, rowIndex: rowIndex))
42+
KeyedDecodingContainer(CSVKeyedDecodingContainer(source: source, configuration: configuration, codingPath: codingPath, rowIndex: rowIndex, encoding: encoding))
3943
}
4044

4145
func unkeyedContainer() throws -> UnkeyedDecodingContainer {
@@ -54,13 +58,16 @@ struct CSVKeyedDecodingContainer<Key: CodingKey>: KeyedDecodingContainerProtocol
5458
let codingPath: [CodingKey]
5559
let rowIndex: Int?
5660
let keyPrefix: String?
61+
/// The effective encoding to use for string conversion from CSVRowView.
62+
let encoding: String.Encoding
5763

58-
init(source: CSVRowDecoder.RowSource, configuration: CSVDecoder.Configuration, codingPath: [CodingKey], rowIndex: Int?, keyPrefix: String? = nil) {
64+
init(source: CSVRowDecoder.RowSource, configuration: CSVDecoder.Configuration, codingPath: [CodingKey], rowIndex: Int?, keyPrefix: String? = nil, encoding: String.Encoding = .utf8) {
5965
self.source = source
6066
self.configuration = configuration
6167
self.codingPath = codingPath
6268
self.rowIndex = rowIndex
6369
self.keyPrefix = keyPrefix
70+
self.encoding = encoding
6471
}
6572

6673
var allKeys: [Key] {
@@ -116,8 +123,8 @@ struct CSVKeyedDecodingContainer<Key: CodingKey>: KeyedDecodingContainerProtocol
116123
location: makeLocation(for: key, includeAvailableKeys: true)
117124
)
118125
}
119-
// Decode string on demand
120-
guard let value = view.string(at: index) else {
126+
// Decode string on demand using the effective encoding
127+
guard let value = view.string(at: index, encoding: encoding) else {
121128
throw CSVDecodingError.keyNotFound(
122129
key.stringValue,
123130
location: makeLocation(for: key, includeAvailableKeys: true)

Sources/CSVCoder/Parsing/CSVParser.swift

Lines changed: 39 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -93,35 +93,55 @@ public struct CSVRowView {
9393
/// - Returns: The decoded string value, or `nil` if the index is out of bounds.
9494
/// - Complexity: O(1) for unquoted fields; O(n) for quoted fields with escaped quotes.
9595
public func string(at index: Int) -> String? {
96+
string(at: index, encoding: .utf8)
97+
}
98+
99+
/// Decodes and returns the string value for the field at the given index using the specified encoding.
100+
///
101+
/// Handles RFC 4180 quote unescaping automatically:
102+
/// - Quoted fields have outer quotes stripped
103+
/// - Escaped quotes (`""`) are converted to single quotes (`"`)
104+
///
105+
/// - Parameters:
106+
/// - index: The zero-based field index.
107+
/// - encoding: The string encoding to use for conversion. For best performance, use `.utf8`.
108+
/// - Returns: The decoded string value, or `nil` if the index is out of bounds or conversion fails.
109+
/// - Complexity: O(1) for unquoted UTF-8 fields; O(n) for quoted fields with escaped quotes or non-UTF-8 encodings.
110+
public func string(at index: Int, encoding: String.Encoding) -> String? {
    // Reject negative indices as well as indices past the last field. Subscripting the
    // per-field metadata with a negative index would trap instead of returning nil,
    // which contradicts the documented "nil if the index is out of bounds" contract.
    guard index >= 0, index < fieldStarts.count else { return nil }
    guard let base = buffer.baseAddress else { return nil }

    let start = fieldStarts[index]
    let length = fieldLengths[index]
    // Escaped quotes ("") are only unescaped for fields that were quoted in the source.
    let needsUnescaping = fieldQuoted[index] && fieldHasEscapedQuote[index]

    // NOTE(review): start/length appear to describe the field content without its outer
    // quotes (the parser strips them) — confirm against the parser before relying on it.
    let ptr = base.advanced(by: start)

    let decoded: String
    if encoding == .utf8 {
        // Fast path (most common case): build the string straight from the raw bytes.
        decoded = String(decoding: UnsafeBufferPointer(start: ptr, count: length), as: UTF8.self)
    } else {
        // ASCII-compatible, non-UTF-8 encodings (ISO-8859-1, Windows-1252, ...):
        // Foundation performs the conversion and reports failure by returning nil.
        guard let converted = String(data: Data(bytes: ptr, count: length), encoding: encoding) else {
            return nil
        }
        decoded = converted
    }

    // RFC 4180: a doubled quote inside a quoted field stands for one literal quote.
    return needsUnescaping ? decoded.replacingOccurrences(of: "\"\"", with: "\"") : decoded
}
126146
}
127147

Sources/CSVCoder/Utilities/CSVUtilities.swift

Lines changed: 96 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,25 @@
88

99
import Foundation
1010

11-
// MARK: - BOM Handling
11+
// MARK: - BOM Handling & Encoding Utilities
1212

1313
/// Shared utilities for CSV operations.
1414
enum CSVUtilities {
1515
/// UTF-8 BOM bytes (EF BB BF).
1616
static let utf8BOM: (UInt8, UInt8, UInt8) = (0xEF, 0xBB, 0xBF)
1717

18+
/// UTF-16 LE BOM bytes (FF FE).
19+
static let utf16LEBOM: (UInt8, UInt8) = (0xFF, 0xFE)
20+
21+
/// UTF-16 BE BOM bytes (FE FF).
22+
static let utf16BEBOM: (UInt8, UInt8) = (0xFE, 0xFF)
23+
24+
/// UTF-32 LE BOM bytes (FF FE 00 00).
25+
static let utf32LEBOM: (UInt8, UInt8, UInt8, UInt8) = (0xFF, 0xFE, 0x00, 0x00)
26+
27+
/// UTF-32 BE BOM bytes (00 00 FE FF).
28+
static let utf32BEBOM: (UInt8, UInt8, UInt8, UInt8) = (0x00, 0x00, 0xFE, 0xFF)
29+
1830
/// Returns the byte offset to skip UTF-8 BOM if present.
1931
/// - Parameter bytes: The buffer to check.
2032
/// - Returns: 3 if BOM is present, 0 otherwise.
@@ -44,6 +56,89 @@ enum CSVUtilities {
4456
}
4557
return 3
4658
}
59+
60+
/// Detects encoding from BOM and returns the encoding and byte offset to skip.
61+
/// - Parameter data: The data to check for BOM.
62+
/// - Returns: A tuple of detected encoding (or nil if no BOM) and the byte offset to skip.
63+
static func detectBOM(in data: Data) -> (encoding: String.Encoding?, offset: Int) {
    // `Data` slices keep the indices of the collection they were cut from, so `data[0]`
    // traps when the caller hands in something like `payload.dropFirst(n)`. Copy the
    // (at most four) leading bytes into a zero-based array and inspect that instead.
    let head = [UInt8](data.prefix(4))
    guard head.count >= 2 else { return (nil, 0) }

    // Order matters: the UTF-32 LE BOM (FF FE 00 00) begins with the UTF-16 LE BOM
    // (FF FE), so the 4-byte signatures are tried before the shorter ones.
    let signatures: [(bytes: [UInt8], encoding: String.Encoding)] = [
        (bytes: [utf32LEBOM.0, utf32LEBOM.1, utf32LEBOM.2, utf32LEBOM.3], encoding: .utf32LittleEndian),
        (bytes: [utf32BEBOM.0, utf32BEBOM.1, utf32BEBOM.2, utf32BEBOM.3], encoding: .utf32BigEndian),
        (bytes: [utf8BOM.0, utf8BOM.1, utf8BOM.2], encoding: .utf8),
        (bytes: [utf16LEBOM.0, utf16LEBOM.1], encoding: .utf16LittleEndian),
        (bytes: [utf16BEBOM.0, utf16BEBOM.1], encoding: .utf16BigEndian),
    ]

    // `starts(with:)` is false when `head` is shorter than the signature, so no
    // separate length check is needed per candidate.
    for signature in signatures where head.starts(with: signature.bytes) {
        return (signature.encoding, signature.bytes.count)
    }

    // No recognised BOM: the caller keeps its configured encoding and skips nothing.
    return (nil, 0)
}
95+
96+
/// Checks if an encoding uses ASCII-compatible byte values for structural characters.
97+
///
98+
/// ASCII-compatible encodings use the same byte values (0x00-0x7F) for ASCII characters,
99+
/// which means CSV structural characters (comma, quote, CR, LF) have identical byte representations.
100+
/// This allows the parser to operate on raw bytes and only use encoding for string conversion.
101+
///
102+
/// - Parameter encoding: The encoding to check.
103+
/// - Returns: `true` if the encoding is ASCII-compatible.
104+
@inline(__always)
static func isASCIICompatible(_ encoding: String.Encoding) -> Bool {
    // Allow-list of encodings whose bytes 0x00-0x7F mean the same as in ASCII, so the
    // CSV structural characters (comma, quote, CR, LF) can be found on the raw bytes.
    // Anything not listed here (UTF-16/UTF-32 variants and any encoding this code does
    // not know about) is reported as incompatible, which is the safe answer.
    let byteCompatible: [String.Encoding] = [
        .utf8, .ascii, .isoLatin1, .isoLatin2,
        .windowsCP1250, .windowsCP1251, .windowsCP1252,
        .windowsCP1253, .windowsCP1254,
        .macOSRoman, .nextstep,
    ]
    return byteCompatible.contains(encoding)
}
121+
122+
/// Transcodes data from a non-ASCII-compatible encoding to UTF-8.
123+
///
124+
/// For encodings like UTF-16 and UTF-32, the byte structure differs from ASCII,
125+
/// so we must convert to String first, then to UTF-8 bytes for parsing.
126+
///
127+
/// - Parameters:
128+
/// - data: The source data.
129+
/// - encoding: The source encoding.
130+
/// - Returns: UTF-8 encoded data, or nil if conversion fails.
131+
static func transcodeToUTF8(_ data: Data, from encoding: String.Encoding) -> Data? {
    // A recognised BOM takes precedence over the caller-supplied encoding and also
    // tells us how many leading bytes to discard before decoding.
    let bom = detectBOM(in: data)
    let sourceEncoding = bom.encoding ?? encoding

    // Dropping zero bytes is a no-op, so the no-BOM case needs no special handling.
    // The result is copied into a standalone Data value before decoding.
    let payload = Data(data.dropFirst(bom.offset))

    // Decode with the source encoding, then re-encode as UTF-8; nil if decoding fails.
    return String(data: payload, encoding: sourceEncoding)?.data(using: .utf8)
}
47142
}
48143

49144
// MARK: - Field Escaping

0 commit comments

Comments
 (0)