|
8 | 8 |
|
9 | 9 | import Foundation |
10 | 10 |
|
11 | | -// MARK: - BOM Handling |
| 11 | +// MARK: - BOM Handling & Encoding Utilities |
12 | 12 |
|
13 | 13 | /// Shared utilities for CSV operations. |
14 | 14 | enum CSVUtilities { |
15 | 15 | /// UTF-8 byte order mark, as its three raw bytes (EF BB BF). |
16 | 16 | static let utf8BOM: (UInt8, UInt8, UInt8) = (0xEF, 0xBB, 0xBF) |
17 | 17 |
|
| 18 | + /// UTF-16 little-endian byte order mark, as its two raw bytes (FF FE). |
| 19 | + static let utf16LEBOM: (UInt8, UInt8) = (0xFF, 0xFE) |
| 20 | + |
| 21 | + /// UTF-16 big-endian byte order mark, as its two raw bytes (FE FF). |
| 22 | + static let utf16BEBOM: (UInt8, UInt8) = (0xFE, 0xFF) |
| 23 | + |
| 24 | + /// UTF-32 little-endian byte order mark (FF FE 00 00); its first two bytes equal the UTF-16 LE mark. |
| 25 | + static let utf32LEBOM: (UInt8, UInt8, UInt8, UInt8) = (0xFF, 0xFE, 0x00, 0x00) |
| 26 | + |
| 27 | + /// UTF-32 big-endian byte order mark, as its four raw bytes (00 00 FE FF). |
| 28 | + static let utf32BEBOM: (UInt8, UInt8, UInt8, UInt8) = (0x00, 0x00, 0xFE, 0xFF) |
| 29 | + |
18 | 30 | /// Returns the byte offset to skip UTF-8 BOM if present. |
19 | 31 | /// - Parameter bytes: The buffer to check. |
20 | 32 | /// - Returns: 3 if BOM is present, 0 otherwise. |
@@ -44,6 +56,89 @@ enum CSVUtilities { |
44 | 56 | } |
45 | 57 | return 3 |
46 | 58 | } |
| 59 | + |
/// Detects a Unicode byte order mark (BOM) at the start of `data`.
///
/// Signatures are tested longest-first: the UTF-32 LE mark (FF FE 00 00) begins with the
/// same two bytes as the UTF-16 LE mark (FF FE), so it must be ruled out before UTF-16.
///
/// The leading bytes are copied into a zero-based array before any comparison. `Data` is
/// its own slice type and a slice keeps the indices of its parent, so subscripting with
/// literal offsets (`data[0]`) is only valid when `startIndex` happens to be 0.
///
/// - Parameter data: The data to check for a BOM. May be a slice with a non-zero `startIndex`.
/// - Returns: The encoding implied by the BOM (`nil` when no BOM is found) and the number of
///   leading bytes the BOM occupies (0 when no BOM is found).
static func detectBOM(in data: Data) -> (encoding: String.Encoding?, offset: Int) {
    // At most four bytes are ever needed; `prefix` is safe for empty and short input.
    let head = [UInt8](data.prefix(4))

    // Ordered from the longest signature to the shortest so that a UTF-32 LE mark is never
    // misread as a UTF-16 LE mark.
    let signatures: [(bytes: [UInt8], encoding: String.Encoding)] = [
        (bytes: [utf32LEBOM.0, utf32LEBOM.1, utf32LEBOM.2, utf32LEBOM.3], encoding: .utf32LittleEndian),
        (bytes: [utf32BEBOM.0, utf32BEBOM.1, utf32BEBOM.2, utf32BEBOM.3], encoding: .utf32BigEndian),
        (bytes: [utf8BOM.0, utf8BOM.1, utf8BOM.2], encoding: .utf8),
        (bytes: [utf16LEBOM.0, utf16LEBOM.1], encoding: .utf16LittleEndian),
        (bytes: [utf16BEBOM.0, utf16BEBOM.1], encoding: .utf16BigEndian),
    ]

    // `starts(with:)` is false whenever `head` is shorter than the signature, which
    // replaces the explicit length checks.
    for signature in signatures where head.starts(with: signature.bytes) {
        return (signature.encoding, signature.bytes.count)
    }
    return (nil, 0)
}
| 95 | + |
/// Reports whether the parser may scan raw bytes of the given encoding for CSV structure.
///
/// The encodings accepted here represent ASCII characters (0x00-0x7F) with the same byte
/// values as ASCII, so the comma, quote, CR and LF bytes can be located without decoding;
/// the encoding is then only needed when converting field bytes to strings.
///
/// Every encoding not in the list is reported as incompatible. That covers the UTF-16 and
/// UTF-32 families as well as any encoding this function does not know about, which is the
/// conservative answer.
///
/// - Parameter encoding: The encoding to check.
/// - Returns: `true` if the encoding is one of the known ASCII-compatible encodings.
@inline(__always)
static func isASCIICompatible(_ encoding: String.Encoding) -> Bool {
    let byteCompatibleEncodings: [String.Encoding] = [
        .utf8, .ascii, .isoLatin1, .isoLatin2,
        .windowsCP1250, .windowsCP1251, .windowsCP1252,
        .windowsCP1253, .windowsCP1254,
        .macOSRoman, .nextstep,
    ]
    return byteCompatibleEncodings.contains(encoding)
}
| 121 | + |
/// Re-encodes `data` as UTF-8 by decoding it to a `String` first.
///
/// Intended for encodings such as UTF-16 and UTF-32 whose byte layout differs from ASCII,
/// so the bytes cannot be parsed directly.
///
/// If `data` starts with a byte order mark, the encoding implied by that mark is used
/// instead of `encoding`, and the mark itself is removed before decoding.
///
/// - Parameters:
///   - data: The source data.
///   - encoding: The source encoding, used when no BOM is present.
/// - Returns: UTF-8 encoded data, or `nil` if the bytes cannot be decoded.
static func transcodeToUTF8(_ data: Data, from encoding: String.Encoding) -> Data? {
    let bom = detectBOM(in: data)
    let sourceEncoding = bom.encoding ?? encoding

    // Copy into a fresh `Data` so the decoder always receives a zero-based buffer.
    let payload: Data
    if bom.offset > 0 {
        payload = Data(data.dropFirst(bom.offset))
    } else {
        payload = Data(data)
    }

    return String(data: payload, encoding: sourceEncoding)?.data(using: .utf8)
}
47 | 142 | } |
48 | 143 |
|
49 | 144 | // MARK: - Field Escaping |
|
0 commit comments