From 07872466086c03d62c44abdfc14311e975ad9eff Mon Sep 17 00:00:00 2001 From: luca-chen198 Date: Thu, 14 May 2026 20:21:18 +0200 Subject: [PATCH 01/13] test: add golden snapshot tests for current parseTokens behavior --- .../ParseTokensGoldenTests.swift | 147 ++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 Tests/MarkdownEngineTests/ParseTokensGoldenTests.swift diff --git a/Tests/MarkdownEngineTests/ParseTokensGoldenTests.swift b/Tests/MarkdownEngineTests/ParseTokensGoldenTests.swift new file mode 100644 index 0000000..bd8e949 --- /dev/null +++ b/Tests/MarkdownEngineTests/ParseTokensGoldenTests.swift @@ -0,0 +1,147 @@ +// +// ParseTokensGoldenTests.swift +// MarkdownEngineTests +// +// Locks the current public behavior of MarkdownTokenizer.parseTokens. +// Refactors must keep these green; new features add new fixtures. +// +// Block-precedence tests (no emphasis / wiki-link inside fenced code) live in +// the Phase-1 integration suite (ParseTokensBlockPhaseIntegrationTests), not +// here — those assertions describe the post-refactor behavior; the baseline +// snapshot must lock what the current regex parser actually emits. 
+// + +import Testing +import Foundation +@testable import MarkdownEngine + +@Suite("parseTokens golden output") +struct ParseTokensGoldenTests { + + // MARK: Headings + + @Test func atxHeadingsAllSixLevels() { + let text = """ + # H1 + ## H2 + ### H3 + #### H4 + ##### H5 + ###### H6 + """ + let tokens = MarkdownTokenizer.parseTokens(in: text) + let headings = tokens.filter { $0.kind == .heading } + #expect(headings.count == 6) + } + + @Test func headingFollowedByParagraphHasNoOverlap() { + let text = "# Title\n\nBody text\n" + let tokens = MarkdownTokenizer.parseTokens(in: text) + let headings = tokens.filter { $0.kind == .heading } + #expect(headings.count == 1) + let heading = headings[0] + #expect(NSMaxRange(heading.range) <= 7) // "# Title".count + } + + // MARK: Fenced code blocks + + @Test func fencedCodeBlockWithLanguageProducesCodeBlockToken() { + let text = """ + ```swift + let x = 42 + ``` + """ + let tokens = MarkdownTokenizer.parseTokens(in: text) + let code = tokens.filter { $0.kind == .codeBlock } + #expect(code.count == 1) + } + + // MARK: Inline (within paragraphs) + + @Test func boldEmphasisInParagraph() { + let text = "This is **bold** text." + let tokens = MarkdownTokenizer.parseTokens(in: text) + let bold = tokens.filter { $0.kind == .bold } + #expect(bold.count == 1) + } + + @Test func italicEmphasisInParagraph() { + let text = "This is *italic* text." + let tokens = MarkdownTokenizer.parseTokens(in: text) + let italic = tokens.filter { $0.kind == .italic } + #expect(italic.count == 1) + } + + @Test func wikiLinkInParagraph() { + let text = "See [[Other Note]] for more." + let tokens = MarkdownTokenizer.parseTokens(in: text) + let wiki = tokens.filter { $0.kind == .wikiLink } + #expect(wiki.count == 1) + } + + @Test func imageEmbedInParagraph() { + let text = "Look ![[picture.png]] here." 
+ let tokens = MarkdownTokenizer.parseTokens(in: text) + let img = tokens.filter { $0.kind == .imageEmbed } + #expect(img.count == 1) + } + + @Test func inlineCodeInParagraph() { + let text = "Call `foo()` to do it." + let tokens = MarkdownTokenizer.parseTokens(in: text) + let code = tokens.filter { $0.kind == .inlineCode } + #expect(code.count == 1) + } + + @Test func markdownLinkInParagraph() { + let text = "Visit [Apple](https://apple.com) today." + let tokens = MarkdownTokenizer.parseTokens(in: text) + let link = tokens.filter { $0.kind == .link } + #expect(link.count == 1) + } + + // MARK: Mixed + + @Test func mixedContentPreservesAllTokenKinds() { + let text = """ + # Heading with **bold** + + Paragraph with *italic*, `code`, and [[wiki]]. + + ```swift + let x = 1 + ``` + + Trailing paragraph. + """ + let tokens = MarkdownTokenizer.parseTokens(in: text) + #expect(tokens.contains { $0.kind == .heading }) + #expect(tokens.contains { $0.kind == .bold }) + #expect(tokens.contains { $0.kind == .italic }) + #expect(tokens.contains { $0.kind == .inlineCode }) + #expect(tokens.contains { $0.kind == .wikiLink }) + #expect(tokens.contains { $0.kind == .codeBlock }) + } + + // MARK: Edge cases + + @Test func emptyDocumentReturnsNoTokens() { + let tokens = MarkdownTokenizer.parseTokens(in: "") + #expect(tokens.isEmpty) + } + + @Test func whitespaceOnlyDocumentReturnsNoTokens() { + let tokens = MarkdownTokenizer.parseTokens(in: "\n\n \n") + #expect(tokens.isEmpty) + } + + @Test func unclosedFencedCodeIsNotTokenizedAsCodeBlock() { + // Current behavior: the codeBlockRegex requires a closing fence. 
+ let text = """ + ```swift + let x = 1 + """ + let tokens = MarkdownTokenizer.parseTokens(in: text) + #expect(tokens.filter { $0.kind == .codeBlock }.isEmpty) + } +} From aedb75674032ef8673a8cb440444fa90a3e1cdea Mon Sep 17 00:00:00 2001 From: luca-chen198 Date: Thu, 14 May 2026 20:26:43 +0200 Subject: [PATCH 02/13] feat(parser): add BlockSpan / BlockKind / LinkReference data model --- Sources/MarkdownEngine/Parser/BlockSpan.swift | 121 ++++++++++++++++++ .../MarkdownEngineTests/BlockSpanTests.swift | 44 +++++++ 2 files changed, 165 insertions(+) create mode 100644 Sources/MarkdownEngine/Parser/BlockSpan.swift create mode 100644 Tests/MarkdownEngineTests/BlockSpanTests.swift diff --git a/Sources/MarkdownEngine/Parser/BlockSpan.swift b/Sources/MarkdownEngine/Parser/BlockSpan.swift new file mode 100644 index 0000000..db86790 --- /dev/null +++ b/Sources/MarkdownEngine/Parser/BlockSpan.swift @@ -0,0 +1,121 @@ +// +// BlockSpan.swift +// MarkdownEngine +// +// Data model for the block phase of the two-phase Markdown parser +// (CommonMark §3, Appendix A). A `BlockSpan` is a typed range over the +// source that the block scanner emits; the inline parser runs over each +// span's `contentRange` to fill in inline structure. +// +// Phase-1 spans are flat (children always empty). Phase-2 will populate +// `children` for container blocks (blockquote, list item, etc.). +// + +import Foundation + +/// Kind of block-level construct found in the source. +/// +/// Cases marked "Phase 2" are forward-declared so adding them later +/// requires no API break in code that switches over `BlockKind`. +enum BlockKind: Equatable { + // Phase 1 + case paragraph + case heading(level: Int) // 1...6, ATX or Setext + case fencedCode(language: String?) 
+ case thematicBreak + case list(ordered: Bool) + case listItem(indentColumns: Int) + case linkReferenceDefinition(label: String) + + // Phase 2 — forward-declared, not emitted by Phase-1 scanner + case blockquote + case table + case tableRow + case tableCell(alignment: TableCellAlignment) + case footnoteDefinition(label: String) + case definitionList + case htmlBlock +} + +enum TableCellAlignment: Equatable { + case none + case left + case center + case right +} + +/// One block-level element in the source. +/// +/// - `range`: full source range including any markers / fences. +/// - `contentRange`: substring that the inline phase processes +/// (e.g. text after `# ` for a heading, body between fences for code). +/// - `markerRanges`: ranges of opening/closing markers (e.g. `#` for ATX, +/// the two ``` lines for fenced code). Used by stylers to hide / dim markers. +/// - `children`: nested blocks for container kinds. Always empty in Phase 1. +struct BlockSpan: Equatable { + let kind: BlockKind + let range: NSRange + let contentRange: NSRange + let markerRanges: [NSRange] + var children: [BlockSpan] + + init( + kind: BlockKind, + range: NSRange, + contentRange: NSRange, + markerRanges: [NSRange] = [], + children: [BlockSpan] = [] + ) { + self.kind = kind + self.range = range + self.contentRange = contentRange + self.markerRanges = markerRanges + self.children = children + } +} + +extension BlockKind { + /// `true` when the inline phase should tokenize this block's `contentRange`. + /// Fenced code, thematic breaks, link reference definitions, and HTML + /// blocks suppress inline parsing entirely. + var allowsInlineContent: Bool { + switch self { + case .paragraph, .heading, .blockquote, .listItem, .tableCell, .definitionList: + return true + case .fencedCode, .thematicBreak, .linkReferenceDefinition, .htmlBlock, + .list, .table, .tableRow, .footnoteDefinition: + return false + } + } +} + +/// A `[label]: url "title"` definition collected during the block phase. 
+/// Phase 3 (inline AST) will consume the map to resolve reference-style +/// links like `[text][label]` and `![alt][label]`. +struct LinkReference: Equatable { + let label: String // raw label as written + let url: String + let title: String? + + init(label: String, url: String, title: String? = nil) { + self.label = label + self.url = url + self.title = title + } + + /// Per CommonMark, link labels are matched case-insensitively after + /// collapsing internal whitespace runs to single spaces and trimming. + var normalizedLabel: String { + let collapsed = label + .components(separatedBy: .whitespacesAndNewlines) + .filter { !$0.isEmpty } + .joined(separator: " ") + return collapsed.lowercased() + } +} + +/// Output of the block phase. +struct BlockScanResult: Equatable { + let blocks: [BlockSpan] + let linkReferences: [String: LinkReference] // keyed by `normalizedLabel` +} diff --git a/Tests/MarkdownEngineTests/BlockSpanTests.swift b/Tests/MarkdownEngineTests/BlockSpanTests.swift new file mode 100644 index 0000000..59ff441 --- /dev/null +++ b/Tests/MarkdownEngineTests/BlockSpanTests.swift @@ -0,0 +1,44 @@ +// +// BlockSpanTests.swift +// MarkdownEngineTests +// + +import Testing +import Foundation +@testable import MarkdownEngine + +@Suite("BlockSpan data model") +struct BlockSpanTests { + + @Test func leafBlockHasEmptyChildrenByDefault() { + let span = BlockSpan( + kind: .paragraph, + range: NSRange(location: 0, length: 5), + contentRange: NSRange(location: 0, length: 5), + markerRanges: [] + ) + #expect(span.children.isEmpty) + } + + @Test func headingKindCarriesLevel() { + let kind: BlockKind = .heading(level: 2) + if case .heading(let level) = kind { + #expect(level == 2) + } else { + Issue.record("Expected heading kind") + } + } + + @Test func linkReferenceHoldsLabelUrlAndTitle() { + let ref = LinkReference(label: "foo", url: "https://example.com", title: "Example") + #expect(ref.label == "foo") + #expect(ref.url == "https://example.com") + 
#expect(ref.title == "Example") + } + + @Test func linkReferenceLabelLowercasedKeyMatchesSpec() { + // CommonMark folds label case for matching; `normalizedLabel` computes the folded key on access. + let ref = LinkReference(label: " Foo Bar ", url: "x") + #expect(ref.normalizedLabel == "foo bar") + } +} From edf2e56f3f3db057576865ba9b29ab03dd4c5c88 Mon Sep 17 00:00:00 2001 From: luca-chen198 Date: Thu, 14 May 2026 20:31:49 +0200 Subject: [PATCH 03/13] feat(parser): BlockScanner skeleton with paragraph + ATX heading support --- .../MarkdownEngine/Parser/BlockScanner.swift | 178 ++++++++++++++++++ .../BlockScannerTests.swift | 100 ++++++++++ 2 files changed, 278 insertions(+) create mode 100644 Sources/MarkdownEngine/Parser/BlockScanner.swift create mode 100644 Tests/MarkdownEngineTests/BlockScannerTests.swift diff --git a/Sources/MarkdownEngine/Parser/BlockScanner.swift b/Sources/MarkdownEngine/Parser/BlockScanner.swift new file mode 100644 index 0000000..d89cd95 --- /dev/null +++ b/Sources/MarkdownEngine/Parser/BlockScanner.swift @@ -0,0 +1,178 @@ +// +// BlockScanner.swift +// MarkdownEngine +// +// Phase-1 block-level Markdown parser. Walks the source line-by-line, +// classifies each line, and emits `[BlockSpan]` plus a link-reference +// map. The inline parser (MarkdownTokenizer.parseTokens) runs over the +// content substring of each inline-allowing block. +// +// Paragraph emission is buffered so Setext heading lookahead can rewrite +// the buffered paragraph into a heading when the next line is an +// underline (===, ---). +// + +import Foundation + +enum BlockScanner { + + /// Single entry point: classify all blocks in `text`. 
+ static func scan(_ text: String) -> BlockScanResult { + let nsText = text as NSString + let length = nsText.length + guard length > 0 else { return BlockScanResult(blocks: [], linkReferences: [:]) } + + var state = ScannerState(nsText: nsText) + var lineStart = 0 + + while lineStart < length { + let lineEnd = nextLineEnd(in: nsText, from: lineStart, length: length) + let lineRange = NSRange(location: lineStart, length: lineEnd - lineStart) + classifyLine(lineRange: lineRange, state: &state) + lineStart = lineEnd + } + + state.flushBufferedParagraph() + return BlockScanResult(blocks: state.blocks, linkReferences: state.linkReferences) + } + + // MARK: - Internal state + + private struct ScannerState { + let nsText: NSString + var blocks: [BlockSpan] = [] + var linkReferences: [String: LinkReference] = [:] + /// Buffered paragraph lines awaiting commit (Setext-heading lookahead). + var paragraphBuffer: [NSRange] = [] + + mutating func appendParagraphLine(_ lineRange: NSRange) { + paragraphBuffer.append(lineRange) + } + + mutating func flushBufferedParagraph() { + guard let first = paragraphBuffer.first, let last = paragraphBuffer.last else { return } + let range = NSRange(location: first.location, + length: NSMaxRange(last) - first.location) + blocks.append(BlockSpan( + kind: .paragraph, + range: range, + contentRange: range, + markerRanges: [] + )) + paragraphBuffer.removeAll(keepingCapacity: true) + } + } + + // MARK: - Line iteration + + /// End of the line that starts at `start`, including the trailing newline. 
+ private static func nextLineEnd(in nsText: NSString, from start: Int, length: Int) -> Int { + var i = start + while i < length { + let c = nsText.character(at: i) + if c == 0x0A { // LF + return i + 1 + } + if c == 0x0D { // CR (maybe CRLF) + if i + 1 < length, nsText.character(at: i + 1) == 0x0A { + return i + 2 + } + return i + 1 + } + i += 1 + } + return length + } + + // MARK: - Classification + + private static func classifyLine(lineRange: NSRange, state: inout ScannerState) { + let contentRange = trimTrailingNewline(lineRange, in: state.nsText) + + // Blank line ends paragraph buffering. + if isBlankLine(contentRange, in: state.nsText) { + state.flushBufferedParagraph() + return + } + + // ATX heading: ^#{1,6} + ' ' + if let heading = atxHeading(lineRange: lineRange, contentRange: contentRange, in: state.nsText) { + state.flushBufferedParagraph() + state.blocks.append(heading) + return + } + + // Default: buffer as paragraph line. Setext / other lookahead handled in later tasks. + state.appendParagraphLine(lineRange) + } + + private static func trimTrailingNewline(_ range: NSRange, in nsText: NSString) -> NSRange { + var length = range.length + let end = range.location + range.length + if length >= 2, + nsText.character(at: end - 2) == 0x0D, + nsText.character(at: end - 1) == 0x0A { + length -= 2 + } else if length >= 1 { + let last = nsText.character(at: end - 1) + if last == 0x0A || last == 0x0D { length -= 1 } + } + return NSRange(location: range.location, length: length) + } + + private static func isBlankLine(_ range: NSRange, in nsText: NSString) -> Bool { + for i in range.location..<NSMaxRange(range) { + let c = nsText.character(at: i) + if c != 0x20 && c != 0x09 { return false } + } + return true + } + + // MARK: ATX heading + + private static func atxHeading(lineRange: NSRange, contentRange: NSRange, in nsText: NSString) -> BlockSpan? 
{ + // Up to 3 leading spaces allowed before # + var i = contentRange.location + let lineEnd = NSMaxRange(contentRange) + var leadingSpaces = 0 + while i < lineEnd && leadingSpaces < 4 && nsText.character(at: i) == 0x20 { + i += 1 + leadingSpaces += 1 + } + if leadingSpaces >= 4 { return nil } + + // Count hashes (1...6) + let hashStart = i + var hashCount = 0 + while i < lineEnd && hashCount < 7 && nsText.character(at: i) == 0x23 { // # + i += 1 + hashCount += 1 + } + guard hashCount >= 1, hashCount <= 6 else { return nil } + + // Must be followed by space/tab or end of line + if i < lineEnd { + let next = nsText.character(at: i) + guard next == 0x20 || next == 0x09 else { return nil } + } + + // Skip spaces between hashes and content + let hashEnd = i + _ = hashEnd + while i < lineEnd { + let c = nsText.character(at: i) + if c == 0x20 || c == 0x09 { i += 1 } else { break } + } + let contentStart = i + let contentEnd = lineEnd + let cRange = NSRange(location: contentStart, length: max(0, contentEnd - contentStart)) + let hashRange = NSRange(location: hashStart, length: hashCount) + + return BlockSpan( + kind: .heading(level: hashCount), + range: lineRange, + contentRange: cRange, + markerRanges: [hashRange] + ) + } +} diff --git a/Tests/MarkdownEngineTests/BlockScannerTests.swift b/Tests/MarkdownEngineTests/BlockScannerTests.swift new file mode 100644 index 0000000..e10dc5a --- /dev/null +++ b/Tests/MarkdownEngineTests/BlockScannerTests.swift @@ -0,0 +1,100 @@ +// +// BlockScannerTests.swift +// MarkdownEngineTests +// + +import Testing +import Foundation +@testable import MarkdownEngine + +@Suite("BlockScanner") +struct BlockScannerTests { + + // MARK: Paragraph + + @Test func singleParagraph() { + let result = BlockScanner.scan("Hello, world.") + #expect(result.blocks.count == 1) + if let first = result.blocks.first { + #expect(first.kind == .paragraph) + #expect(first.range == NSRange(location: 0, length: 13)) + } + } + + @Test func 
twoParagraphsSeparatedByBlankLine() { + let text = "First.\n\nSecond." + let result = BlockScanner.scan(text) + #expect(result.blocks.count == 2) + #expect(result.blocks.allSatisfy { $0.kind == .paragraph }) + } + + @Test func paragraphSpanningMultipleSoftLines() { + let text = "Line one\nLine two\nLine three" + let result = BlockScanner.scan(text) + #expect(result.blocks.count == 1) + #expect(result.blocks.first?.kind == .paragraph) + } + + @Test func emptyInputProducesNoBlocks() { + let result = BlockScanner.scan("") + #expect(result.blocks.isEmpty) + } + + @Test func whitespaceOnlyInputProducesNoBlocks() { + let result = BlockScanner.scan("\n \n\n") + #expect(result.blocks.isEmpty) + } + + // MARK: ATX headings + + @Test func atxHeadingLevel1() { + let result = BlockScanner.scan("# Title") + #expect(result.blocks.count == 1) + if case .heading(let level) = result.blocks.first?.kind { + #expect(level == 1) + } else { + Issue.record("Expected heading kind") + } + } + + @Test func atxHeadingLevel6() { + let result = BlockScanner.scan("###### Title") + if case .heading(let level) = result.blocks.first?.kind { + #expect(level == 6) + } else { + Issue.record("Expected heading kind") + } + } + + @Test func atxHeadingSevenHashesIsParagraph() { + // CommonMark: more than 6 # is not a heading. + let result = BlockScanner.scan("####### NotHeading") + #expect(result.blocks.first?.kind == .paragraph) + } + + @Test func atxHeadingWithoutSpaceIsParagraph() { + // CommonMark: `#title` (no space) is a paragraph. 
+ let result = BlockScanner.scan("#NotHeading") + #expect(result.blocks.first?.kind == .paragraph) + } + + @Test func atxHeadingContentRangeExcludesHashAndSpace() { + let result = BlockScanner.scan("## Title") + let heading = result.blocks.first + #expect(heading?.contentRange == NSRange(location: 3, length: 5)) + } + + @Test func atxHeadingMarkerRangeCoversHashes() { + let result = BlockScanner.scan("### Title") + let heading = result.blocks.first + #expect(heading?.markerRanges.first == NSRange(location: 0, length: 3)) + } + + @Test func atxHeadingFollowedByParagraph() { + let text = "# Heading\n\nParagraph body" + let result = BlockScanner.scan(text) + #expect(result.blocks.count == 2) + if case .heading = result.blocks[0].kind { /* ok */ } else { Issue.record("first should be heading") } + #expect(result.blocks[1].kind == .paragraph) + } +} From e8aed47b304979f26c55f6a4d79d1a3c7b95272c Mon Sep 17 00:00:00 2001 From: luca-chen198 Date: Thu, 14 May 2026 20:38:38 +0200 Subject: [PATCH 04/13] feat(parser): BlockScanner fenced code block support --- .../MarkdownEngine/Parser/BlockScanner.swift | 196 ++++++++++++++++-- .../BlockScannerTests.swift | 55 +++++ 2 files changed, 229 insertions(+), 22 deletions(-) diff --git a/Sources/MarkdownEngine/Parser/BlockScanner.swift b/Sources/MarkdownEngine/Parser/BlockScanner.swift index d89cd95..c7e6f0d 100644 --- a/Sources/MarkdownEngine/Parser/BlockScanner.swift +++ b/Sources/MarkdownEngine/Parser/BlockScanner.swift @@ -28,7 +28,41 @@ enum BlockScanner { while lineStart < length { let lineEnd = nextLineEnd(in: nsText, from: lineStart, length: length) let lineRange = NSRange(location: lineStart, length: lineEnd - lineStart) - classifyLine(lineRange: lineRange, state: &state) + let contentRange = trimTrailingNewline(lineRange, in: nsText) + + // 1) Blank line ends paragraph buffering. 
+ if isBlankLine(contentRange, in: nsText) { + state.flushBufferedParagraph() + lineStart = lineEnd + continue + } + + // 2) Fenced code block (multi-line — consumes until closing fence). + if let opener = fencedCodeOpener(contentRange: contentRange, in: nsText) { + state.flushBufferedParagraph() + if let consumed = consumeFencedCode( + opener: opener, + openerLineRange: lineRange, + nsText: nsText, + length: length, + state: &state + ) { + lineStart = consumed + continue + } + // Unclosed fence: fall through to paragraph treatment. + } + + // 3) ATX heading (single line). + if let heading = atxHeading(lineRange: lineRange, contentRange: contentRange, in: nsText) { + state.flushBufferedParagraph() + state.blocks.append(heading) + lineStart = lineEnd + continue + } + + // 4) Default: buffer as paragraph line. + state.appendParagraphLine(lineRange) lineStart = lineEnd } @@ -84,27 +118,7 @@ enum BlockScanner { return length } - // MARK: - Classification - - private static func classifyLine(lineRange: NSRange, state: inout ScannerState) { - let contentRange = trimTrailingNewline(lineRange, in: state.nsText) - - // Blank line ends paragraph buffering. - if isBlankLine(contentRange, in: state.nsText) { - state.flushBufferedParagraph() - return - } - - // ATX heading: ^#{1,6} + ' ' - if let heading = atxHeading(lineRange: lineRange, contentRange: contentRange, in: state.nsText) { - state.flushBufferedParagraph() - state.blocks.append(heading) - return - } - - // Default: buffer as paragraph line. Setext / other lookahead handled in later tasks. 
- state.appendParagraphLine(lineRange) - } + // MARK: - Classification helpers private static func trimTrailingNewline(_ range: NSRange, in nsText: NSString) -> NSRange { var length = range.length @@ -175,4 +189,142 @@ enum BlockScanner { markerRanges: [hashRange] ) } + + // MARK: Fenced code + + private struct FencedCodeOpener { + let fenceRange: NSRange + let fenceLength: Int + let fenceChar: UInt16 // ` or ~ + let language: String? + } + + /// Detects a fenced code block opener on `contentRange`. CommonMark allows + /// up to 3 leading spaces and a fence of 3+ backticks or 3+ tildes. + private static func fencedCodeOpener(contentRange: NSRange, in nsText: NSString) -> FencedCodeOpener? { + let lineEnd = NSMaxRange(contentRange) + var i = contentRange.location + var leading = 0 + while i < lineEnd, nsText.character(at: i) == 0x20, leading < 4 { + i += 1; leading += 1 + } + if leading >= 4 { return nil } + + guard i < lineEnd else { return nil } + let fenceChar = nsText.character(at: i) + guard fenceChar == 0x60 /* ` */ || fenceChar == 0x7E /* ~ */ else { return nil } + + let fenceStart = i + var count = 0 + while i < lineEnd, nsText.character(at: i) == fenceChar { + i += 1; count += 1 + } + guard count >= 3 else { return nil } + + // Backtick fences disallow ` anywhere on the opener line after the fence. + if fenceChar == 0x60 { + var j = i + while j < lineEnd { + if nsText.character(at: j) == 0x60 { return nil } + j += 1 + } + } + + // Language tag: rest of the line after fence, trimmed of whitespace. + var langStart = i + while langStart < lineEnd, + (nsText.character(at: langStart) == 0x20 || nsText.character(at: langStart) == 0x09) { + langStart += 1 + } + var langEnd = lineEnd + while langEnd > langStart, + (nsText.character(at: langEnd - 1) == 0x20 || nsText.character(at: langEnd - 1) == 0x09) { + langEnd -= 1 + } + let language: String? 
+ if langStart < langEnd { + language = nsText.substring(with: NSRange(location: langStart, length: langEnd - langStart)) + } else { + language = nil + } + + return FencedCodeOpener( + fenceRange: NSRange(location: fenceStart, length: count), + fenceLength: count, + fenceChar: fenceChar, + language: language + ) + } + + /// Consume lines starting after `openerLineRange` until a matching closing + /// fence (same char, at least as many) or EOF. Returns the index past the + /// last consumed character, or `nil` if no closing fence was found. + private static func consumeFencedCode( + opener: FencedCodeOpener, + openerLineRange: NSRange, + nsText: NSString, + length: Int, + state: inout ScannerState + ) -> Int? { + let contentStart = NSMaxRange(openerLineRange) + var cursor = contentStart + var closingFenceRange: NSRange? = nil + var blockEnd: Int = contentStart + + while cursor < length { + let lineEnd = nextLineEnd(in: nsText, from: cursor, length: length) + let lineRange = NSRange(location: cursor, length: lineEnd - cursor) + let contentRange = trimTrailingNewline(lineRange, in: nsText) + + if isClosingFence(contentRange: contentRange, + opener: opener, + in: nsText) { + closingFenceRange = NSRange(location: contentRange.location, length: contentRange.length) + blockEnd = lineEnd + cursor = lineEnd + break + } + + cursor = lineEnd + blockEnd = lineEnd + } + + guard let closingFence = closingFenceRange else { + return nil // unclosed + } + + let blockRange = NSRange(location: openerLineRange.location, length: blockEnd - openerLineRange.location) + let codeContentRange = NSRange(location: contentStart, length: closingFence.location - contentStart) + + let block = BlockSpan( + kind: .fencedCode(language: opener.language), + range: blockRange, + contentRange: codeContentRange, + markerRanges: [opener.fenceRange, closingFence] + ) + state.blocks.append(block) + return cursor + } + + private static func isClosingFence(contentRange: NSRange, opener: FencedCodeOpener, in 
nsText: NSString) -> Bool { + let lineEnd = NSMaxRange(contentRange) + var i = contentRange.location + var leading = 0 + while i < lineEnd, nsText.character(at: i) == 0x20, leading < 4 { + i += 1; leading += 1 + } + if leading >= 4 { return false } + var count = 0 + while i < lineEnd, nsText.character(at: i) == opener.fenceChar { + i += 1; count += 1 + } + guard count >= opener.fenceLength else { return false } + // Only whitespace allowed after the closing fence. + while i < lineEnd { + let c = nsText.character(at: i) + if c != 0x20 && c != 0x09 { return false } + i += 1 + } + return true + } } diff --git a/Tests/MarkdownEngineTests/BlockScannerTests.swift b/Tests/MarkdownEngineTests/BlockScannerTests.swift index e10dc5a..e5feb5a 100644 --- a/Tests/MarkdownEngineTests/BlockScannerTests.swift +++ b/Tests/MarkdownEngineTests/BlockScannerTests.swift @@ -97,4 +97,59 @@ struct BlockScannerTests { if case .heading = result.blocks[0].kind { /* ok */ } else { Issue.record("first should be heading") } #expect(result.blocks[1].kind == .paragraph) } + + // MARK: Fenced code + + @Test func fencedCodeBlockNoLanguage() { + let text = "```\nlet x = 1\n```" + let result = BlockScanner.scan(text) + #expect(result.blocks.count == 1) + if case .fencedCode(let lang) = result.blocks.first?.kind { + #expect(lang == nil) + } else { + Issue.record("Expected fencedCode kind") + } + } + + @Test func fencedCodeBlockWithLanguage() { + let text = "```swift\nlet x = 1\n```" + let result = BlockScanner.scan(text) + if case .fencedCode(let lang) = result.blocks.first?.kind { + #expect(lang == "swift") + } else { + Issue.record("Expected fencedCode kind") + } + } + + @Test func fencedCodeContentRangeCoversOnlyBody() { + let text = "```\nbody\n```" + let result = BlockScanner.scan(text) + let block = result.blocks.first! 
+ let body = (text as NSString).substring(with: block.contentRange) + #expect(body == "body\n") + } + + @Test func fencedCodeBlockMarkerRangesCoverBothFences() { + let text = "```\nbody\n```" + let result = BlockScanner.scan(text) + #expect(result.blocks.first?.markerRanges.count == 2) + } + + @Test func emphasisLikeContentInsideFencedCodeIsIgnoredByBlockKind() { + // Block scanner is responsible for marking content as "not inline" — + // the pipeline filter is exercised in the integration tests. + let text = "```\n**not bold**\n```" + let result = BlockScanner.scan(text) + let block = result.blocks.first! + #expect(!block.kind.allowsInlineContent) + } + + @Test func unclosedFencedCodeBlockFallsBackToParagraph() { + // No closing fence => current parseTokens treats it as plain text. + // Block scanner falls back to a single paragraph spanning the opening + // fence through the rest of the input. + let text = "```swift\nlet x = 1" + let result = BlockScanner.scan(text) + #expect(result.blocks.allSatisfy { $0.kind == .paragraph }) + } } From 450f2256567c94a3d2f53ab639aa1851e577464f Mon Sep 17 00:00:00 2001 From: luca-chen198 Date: Thu, 14 May 2026 20:45:38 +0200 Subject: [PATCH 05/13] refactor(parser): symmetric fence marker ranges, drop redundant fenceLength --- .../MarkdownEngine/Parser/BlockScanner.swift | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/Sources/MarkdownEngine/Parser/BlockScanner.swift b/Sources/MarkdownEngine/Parser/BlockScanner.swift index c7e6f0d..9a79417 100644 --- a/Sources/MarkdownEngine/Parser/BlockScanner.swift +++ b/Sources/MarkdownEngine/Parser/BlockScanner.swift @@ -194,7 +194,6 @@ enum BlockScanner { private struct FencedCodeOpener { let fenceRange: NSRange - let fenceLength: Int let fenceChar: UInt16 // ` or ~ let language: String? 
} @@ -250,7 +249,6 @@ enum BlockScanner { return FencedCodeOpener( fenceRange: NSRange(location: fenceStart, length: count), - fenceLength: count, fenceChar: fenceChar, language: language ) @@ -268,18 +266,17 @@ enum BlockScanner { ) -> Int? { let contentStart = NSMaxRange(openerLineRange) var cursor = contentStart - var closingFenceRange: NSRange? = nil + var closingFenceRangeStorage: NSRange? = nil var blockEnd: Int = contentStart while cursor < length { let lineEnd = nextLineEnd(in: nsText, from: cursor, length: length) - let lineRange = NSRange(location: cursor, length: lineEnd - cursor) - let contentRange = trimTrailingNewline(lineRange, in: nsText) + let contentRange = trimTrailingNewline(NSRange(location: cursor, length: lineEnd - cursor), in: nsText) - if isClosingFence(contentRange: contentRange, - opener: opener, - in: nsText) { - closingFenceRange = NSRange(location: contentRange.location, length: contentRange.length) + if let closer = closingFenceRange(contentRange: contentRange, + opener: opener, + in: nsText) { + closingFenceRangeStorage = closer blockEnd = lineEnd cursor = lineEnd break @@ -289,7 +286,7 @@ enum BlockScanner { blockEnd = lineEnd } - guard let closingFence = closingFenceRange else { + guard let closingFence = closingFenceRangeStorage else { return nil // unclosed } @@ -306,25 +303,29 @@ enum BlockScanner { return cursor } - private static func isClosingFence(contentRange: NSRange, opener: FencedCodeOpener, in nsText: NSString) -> Bool { + /// If `contentRange` is a closing fence for `opener`, returns the range of + /// the fence characters themselves (not including leading/trailing whitespace). + /// Otherwise returns nil. + private static func closingFenceRange(contentRange: NSRange, opener: FencedCodeOpener, in nsText: NSString) -> NSRange? 
{ let lineEnd = NSMaxRange(contentRange) var i = contentRange.location var leading = 0 while i < lineEnd, nsText.character(at: i) == 0x20, leading < 4 { i += 1; leading += 1 } - if leading >= 4 { return false } + if leading >= 4 { return nil } + let fenceStart = i var count = 0 while i < lineEnd, nsText.character(at: i) == opener.fenceChar { i += 1; count += 1 } - guard count >= opener.fenceLength else { return false } + guard count >= opener.fenceRange.length else { return nil } // Only whitespace allowed after the closing fence. while i < lineEnd { let c = nsText.character(at: i) - if c != 0x20 && c != 0x09 { return false } + if c != 0x20 && c != 0x09 { return nil } i += 1 } - return true + return NSRange(location: fenceStart, length: count) } } From 70243b8b3d2278631db163a7c7d1cbed4a57564d Mon Sep 17 00:00:00 2001 From: luca-chen198 Date: Thu, 14 May 2026 20:47:49 +0200 Subject: [PATCH 06/13] feat(parser): BlockScanner Setext heading lookahead --- .../MarkdownEngine/Parser/BlockScanner.swift | 56 +++++++++++++++++++ .../BlockScannerTests.swift | 40 +++++++++++++ 2 files changed, 96 insertions(+) diff --git a/Sources/MarkdownEngine/Parser/BlockScanner.swift b/Sources/MarkdownEngine/Parser/BlockScanner.swift index 9a79417..2b946ed 100644 --- a/Sources/MarkdownEngine/Parser/BlockScanner.swift +++ b/Sources/MarkdownEngine/Parser/BlockScanner.swift @@ -61,6 +61,16 @@ enum BlockScanner { continue } + // Setext underline rewrites buffered paragraph into a heading. + if !state.paragraphBuffer.isEmpty, + let level = setextUnderlineLevel(contentRange: contentRange, in: nsText) { + state.rewriteBufferAsHeading(level: level, + underlineLineRange: lineRange, + underlineContentRange: contentRange) + lineStart = lineEnd + continue + } + // 4) Default: buffer as paragraph line. 
state.appendParagraphLine(lineRange) lineStart = lineEnd @@ -95,6 +105,23 @@ enum BlockScanner { )) paragraphBuffer.removeAll(keepingCapacity: true) } + + mutating func rewriteBufferAsHeading(level: Int, + underlineLineRange: NSRange, + underlineContentRange: NSRange) { + guard let first = paragraphBuffer.first, let last = paragraphBuffer.last else { return } + let bufferRange = NSRange(location: first.location, + length: NSMaxRange(last) - first.location) + let fullRange = NSRange(location: bufferRange.location, + length: NSMaxRange(underlineLineRange) - bufferRange.location) + blocks.append(BlockSpan( + kind: .heading(level: level), + range: fullRange, + contentRange: bufferRange, + markerRanges: [underlineContentRange] + )) + paragraphBuffer.removeAll(keepingCapacity: true) + } } // MARK: - Line iteration @@ -190,6 +217,35 @@ enum BlockScanner { ) } + // MARK: Setext + + /// Returns 1 for `===…`, 2 for `---…`, nil otherwise. CommonMark allows + /// up to 3 leading spaces and any trailing whitespace. + private static func setextUnderlineLevel(contentRange: NSRange, in nsText: NSString) -> Int? { + let lineEnd = NSMaxRange(contentRange) + var i = contentRange.location + var leading = 0 + while i < lineEnd, nsText.character(at: i) == 0x20, leading < 4 { + i += 1; leading += 1 + } + if leading >= 4 { return nil } + guard i < lineEnd else { return nil } + let ch = nsText.character(at: i) + guard ch == 0x3D /* = */ || ch == 0x2D /* - */ else { return nil } + var count = 0 + while i < lineEnd, nsText.character(at: i) == ch { + i += 1; count += 1 + } + guard count >= 1 else { return nil } + // Only trailing whitespace allowed. + while i < lineEnd { + let c = nsText.character(at: i) + if c != 0x20 && c != 0x09 { return nil } + i += 1 + } + return ch == 0x3D ? 
1 : 2 + } + // MARK: Fenced code private struct FencedCodeOpener { diff --git a/Tests/MarkdownEngineTests/BlockScannerTests.swift b/Tests/MarkdownEngineTests/BlockScannerTests.swift index e5feb5a..e8e2571 100644 --- a/Tests/MarkdownEngineTests/BlockScannerTests.swift +++ b/Tests/MarkdownEngineTests/BlockScannerTests.swift @@ -152,4 +152,44 @@ struct BlockScannerTests { let result = BlockScanner.scan(text) #expect(result.blocks.allSatisfy { $0.kind == .paragraph }) } + + // MARK: Setext heading + + @Test func setextH1WithEqualsUnderline() { + let text = "Title\n=====" + let result = BlockScanner.scan(text) + #expect(result.blocks.count == 1) + if case .heading(let level) = result.blocks.first?.kind { + #expect(level == 1) + } else { + Issue.record("Expected heading kind") + } + } + + @Test func setextH2WithDashUnderline() { + let text = "Title\n-----" + let result = BlockScanner.scan(text) + if case .heading(let level) = result.blocks.first?.kind { + #expect(level == 2) + } else { + Issue.record("Expected heading kind") + } + } + + @Test func setextSpansMultipleParagraphLines() { + let text = "Line one\nLine two\n===" + let result = BlockScanner.scan(text) + #expect(result.blocks.count == 1) + if case .heading = result.blocks.first?.kind { /* ok */ } else { Issue.record("Expected heading") } + } + + @Test func dashesAloneWithoutParagraphAreNotConsumedAsHeading() { + // Without a preceding paragraph, `---` does not become a heading via Setext. + // (Thematic-break recognition arrives in Task 6.) 
+ let text = "\n---" + let result = BlockScanner.scan(text) + #expect(!result.blocks.contains(where: { + if case .heading = $0.kind { return true } else { return false } + })) + } } From ac367acb3f48ec4d69539b7ecbd05671a3a13973 Mon Sep 17 00:00:00 2001 From: luca-chen198 Date: Thu, 14 May 2026 20:51:16 +0200 Subject: [PATCH 07/13] feat(parser): BlockScanner thematic break + link reference definitions --- .../MarkdownEngine/Parser/BlockScanner.swift | 98 +++++++++++++++++++ .../BlockScannerTests.swift | 60 ++++++++++++ 2 files changed, 158 insertions(+) diff --git a/Sources/MarkdownEngine/Parser/BlockScanner.swift b/Sources/MarkdownEngine/Parser/BlockScanner.swift index 2b946ed..471098f 100644 --- a/Sources/MarkdownEngine/Parser/BlockScanner.swift +++ b/Sources/MarkdownEngine/Parser/BlockScanner.swift @@ -71,6 +71,37 @@ enum BlockScanner { continue } + // Thematic break (only when no paragraph is being buffered — otherwise + // a `---` line would have already been claimed by Setext above). + if state.paragraphBuffer.isEmpty, + isThematicBreak(contentRange: contentRange, in: nsText) { + state.blocks.append(BlockSpan( + kind: .thematicBreak, + range: lineRange, + contentRange: lineRange, + markerRanges: [contentRange] + )) + lineStart = lineEnd + continue + } + + // Link reference definition. + if state.paragraphBuffer.isEmpty, + let def = linkReferenceDefinition(contentRange: contentRange, in: nsText) { + let key = def.reference.normalizedLabel + if state.linkReferences[key] == nil { + state.linkReferences[key] = def.reference + } + state.blocks.append(BlockSpan( + kind: .linkReferenceDefinition(label: def.reference.label), + range: lineRange, + contentRange: NSRange(location: def.urlRange.location, length: def.urlRange.length), + markerRanges: [def.labelRange] + )) + lineStart = lineEnd + continue + } + // 4) Default: buffer as paragraph line. 
state.appendParagraphLine(lineRange) lineStart = lineEnd @@ -384,4 +415,71 @@ enum BlockScanner { } return NSRange(location: fenceStart, length: count) } + + // MARK: Thematic break + + private static func isThematicBreak(contentRange: NSRange, in nsText: NSString) -> Bool { + let lineEnd = NSMaxRange(contentRange) + var i = contentRange.location + var leading = 0 + while i < lineEnd, nsText.character(at: i) == 0x20, leading < 4 { + i += 1; leading += 1 + } + if leading >= 4 { return false } + guard i < lineEnd else { return false } + let marker = nsText.character(at: i) + guard marker == 0x2D /* - */ || marker == 0x5F /* _ */ || marker == 0x2A /* * */ else { return false } + var count = 0 + while i < lineEnd { + let c = nsText.character(at: i) + if c == marker { count += 1; i += 1; continue } + if c == 0x20 || c == 0x09 { i += 1; continue } + return false + } + return count >= 3 + } + + // MARK: Link reference definitions + + private struct LinkRefDefHit { + let reference: LinkReference + let labelRange: NSRange // includes the surrounding `[…]:` + let urlRange: NSRange + } + + private static let linkRefDefRegex: NSRegularExpression = { + // ^ \s{0,3} \[ label \] : \s* url \s* ( "title" | 'title' | (title) )? \s* $ + let pattern = #"^[ ]{0,3}\[([^\[\]\r\n]+)\]:[ \t]*([^\s]+)(?:[ \t]+(?:"([^"\r\n]*)"|'([^'\r\n]*)'|\(([^)\r\n]*)\)))?[ \t]*$"# + return try! NSRegularExpression(pattern: pattern, options: []) + }() + + private static func linkReferenceDefinition(contentRange: NSRange, in nsText: NSString) -> LinkRefDefHit? 
{ + let match = linkRefDefRegex.firstMatch( + in: nsText as String, + options: [], + range: contentRange + ) + guard let m = match, m.range == contentRange else { return nil } + + let labelRange = m.range(at: 1) + let urlRange = m.range(at: 2) + guard labelRange.location != NSNotFound, urlRange.location != NSNotFound else { return nil } + let label = nsText.substring(with: labelRange) + let url = nsText.substring(with: urlRange) + + var title: String? = nil + for groupIdx in 3...5 { + let r = m.range(at: groupIdx) + if r.location != NSNotFound { + title = nsText.substring(with: r) + break + } + } + + return LinkRefDefHit( + reference: LinkReference(label: label, url: url, title: title), + labelRange: labelRange, + urlRange: urlRange + ) + } } diff --git a/Tests/MarkdownEngineTests/BlockScannerTests.swift b/Tests/MarkdownEngineTests/BlockScannerTests.swift index e8e2571..1089b06 100644 --- a/Tests/MarkdownEngineTests/BlockScannerTests.swift +++ b/Tests/MarkdownEngineTests/BlockScannerTests.swift @@ -192,4 +192,64 @@ struct BlockScannerTests { if case .heading = $0.kind { return true } else { return false } })) } + + // MARK: Thematic break + + @Test func thematicBreakWithDashes() { + let result = BlockScanner.scan("---") + #expect(result.blocks.first?.kind == .thematicBreak) + } + + @Test func thematicBreakWithAsterisks() { + let result = BlockScanner.scan("***") + #expect(result.blocks.first?.kind == .thematicBreak) + } + + @Test func thematicBreakWithUnderscores() { + let result = BlockScanner.scan("___") + #expect(result.blocks.first?.kind == .thematicBreak) + } + + @Test func thematicBreakDoesNotConsumeFollowingParagraph() { + let result = BlockScanner.scan("---\n\nbody") + #expect(result.blocks.count == 2) + #expect(result.blocks[0].kind == .thematicBreak) + #expect(result.blocks[1].kind == .paragraph) + } + + @Test func dashUnderlineAfterParagraphPrefersSetext() { + // Setext H2 must win over thematic break when there's a buffered paragraph. 
+ let result = BlockScanner.scan("Title\n---") + if case .heading(let lvl) = result.blocks.first?.kind { + #expect(lvl == 2) + } else { + Issue.record("Expected Setext H2") + } + } + + // MARK: Link reference definitions + + @Test func linkReferenceDefinitionBasic() { + let text = "[foo]: https://example.com" + let result = BlockScanner.scan(text) + #expect(result.linkReferences["foo"]?.url == "https://example.com") + } + + @Test func linkReferenceDefinitionWithTitle() { + let text = "[foo]: https://example.com \"Example\"" + let result = BlockScanner.scan(text) + #expect(result.linkReferences["foo"]?.title == "Example") + } + + @Test func linkReferenceDefinitionCaseInsensitiveLabel() { + let text = "[FOO Bar]: https://example.com" + let result = BlockScanner.scan(text) + #expect(result.linkReferences["foo bar"] != nil) + } + + @Test func duplicateLinkReferenceFirstWins() { + let text = "[foo]: https://first.com\n[foo]: https://second.com" + let result = BlockScanner.scan(text) + #expect(result.linkReferences["foo"]?.url == "https://first.com") + } } From f3ba6177ee96f4ab75e8e739ac89b7a0f47ce213 Mon Sep 17 00:00:00 2001 From: luca-chen198 Date: Thu, 14 May 2026 20:57:52 +0200 Subject: [PATCH 08/13] =?UTF-8?q?feat(parser):=20two-phase=20pipeline=20?= =?UTF-8?q?=E2=80=94=20block=20scanner=20drives=20parseTokens;=20inline=20?= =?UTF-8?q?tokens=20filtered=20by=20block=20precedence?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Parser/MarkdownTokenizer.swift | 219 +++++++++++------- .../ParseTokensGoldenTests.swift | 39 ++++ 2 files changed, 174 insertions(+), 84 deletions(-) diff --git a/Sources/MarkdownEngine/Parser/MarkdownTokenizer.swift b/Sources/MarkdownEngine/Parser/MarkdownTokenizer.swift index 4f2ed6f..668c9e7 100644 --- a/Sources/MarkdownEngine/Parser/MarkdownTokenizer.swift +++ b/Sources/MarkdownEngine/Parser/MarkdownTokenizer.swift @@ -53,40 +53,77 @@ enum MarkdownTokenizer { var tokens: [MarkdownToken] = 
[] let nsText = text as NSString let fullRange = NSRange(location: 0, length: nsText.length) + guard nsText.length > 0 else { return [] } - // Emphasis via stack parser. - tokens.append(contentsOf: parseEmphasisTokens(in: text)) + // ---------- Block phase ---------- + let blockResult = BlockScanner.scan(text) - // Image embeds ![[Name]] (must be parsed before wikiLinks) + // Convert block spans into block-kind MarkdownTokens that the styler + // already understands. (Headings, fenced code; thematic breaks and + // link reference definitions don't have legacy MarkdownTokenKind + // counterparts and are tracked only via BlockScanResult for now.) + // + // BlockScanner emits ranges over whole lines (including trailing + // newlines) — the legacy regex-based parser excluded the trailing + // newline from `.heading` / `.codeBlock` token ranges, so we trim it + // here to keep the golden snapshot stable. + for span in blockResult.blocks { + switch span.kind { + case .heading: + tokens.append(MarkdownToken( + kind: .heading, + range: trimTrailingNewline(span.range, in: nsText), + contentRange: span.contentRange, + markerRanges: span.markerRanges + )) + case .fencedCode: + tokens.append(MarkdownToken( + kind: .codeBlock, + range: trimTrailingNewline(span.range, in: nsText), + contentRange: span.contentRange, + markerRanges: span.markerRanges + )) + default: + break + } + } + + // ---------- Inline phase ---------- + var inlineTokens: [MarkdownToken] = [] + + // Emphasis (stack parser, already line-scoped). + inlineTokens.append(contentsOf: parseEmphasisTokens(in: text)) + + // Image embeds ![[...]] (parsed before wiki-links). 
var imageEmbedRanges: [NSRange] = [] for match in imageEmbedRegex.matches(in: text, options: [], range: fullRange) { let full = match.range(at: 0) let content = match.range(at: 1) - let openMarker = NSRange(location: full.location, length: 3) // ![[ - let closeMarker = NSRange(location: full.location + full.length - 2, length: 2) // ]] - tokens.append(MarkdownToken(kind: .imageEmbed, - range: full, - contentRange: content, - markerRanges: [openMarker, closeMarker])) + let openMarker = NSRange(location: full.location, length: 3) + let closeMarker = NSRange(location: full.location + full.length - 2, length: 2) + inlineTokens.append(MarkdownToken(kind: .imageEmbed, + range: full, + contentRange: content, + markerRanges: [openMarker, closeMarker])) imageEmbedRanges.append(full) } - // Node links [[Name]] + // Wiki-links [[...]] for match in wikiLinkRegex.matches(in: text, options: [], range: fullRange) { let full = match.range(at: 0) - // Skip ranges already claimed by imageEmbed tokens - let overlapsImage = imageEmbedRanges.contains { NSIntersectionRange($0, full).length > 0 } - if overlapsImage { continue } + if imageEmbedRanges.contains(where: { NSIntersectionRange($0, full).length > 0 }) { + continue + } let content = match.range(at: 1) let open = NSRange(location: full.location, length: 2) let close = NSRange(location: full.location + full.length - 2, length: 2) - tokens.append(MarkdownToken(kind: .wikiLink, - range: full, - contentRange: content, - markerRanges: [open, close])) + inlineTokens.append(MarkdownToken(kind: .wikiLink, + range: full, + contentRange: content, + markerRanges: [open, close])) } - // Markdown links [Text](URL) + // Markdown links [text](url) for match in markdownLinkRegex.matches(in: text, options: [], range: fullRange) { let full = match.range let textRange = match.range(at: 1) @@ -95,100 +132,114 @@ enum MarkdownTokenizer { let closeBracket = NSRange(location: textRange.location + textRange.length, length: 1) let openParen = 
NSRange(location: urlRange.location - 1, length: 1) let closeParen = NSRange(location: urlRange.location + urlRange.length, length: 1) - tokens.append(MarkdownToken(kind: .link, - range: full, - contentRange: textRange, - markerRanges: [openBracket, closeBracket, openParen, closeParen])) + inlineTokens.append(MarkdownToken(kind: .link, + range: full, + contentRange: textRange, + markerRanges: [openBracket, closeBracket, openParen, closeParen])) } - // Headings #... up to ###### - for match in headingRegex.matches(in: text, options: [], range: fullRange) { - let fullMatchRange = match.range(at: 0) - let hashes = match.range(at: 1) - let content = match.range(at: 2) - let leadingWsLength = hashes.location - fullMatchRange.location - let tokenRange = NSRange(location: hashes.location, length: fullMatchRange.length - leadingWsLength) - var markerRanges = [hashes] - let hashEnd = hashes.location + hashes.length - if hashEnd < nsText.length { - let spaceRange = NSRange(location: hashEnd, length: 1) - if nsText.substring(with: spaceRange) == " " { - markerRanges.append(spaceRange) - } - } - tokens.append(MarkdownToken(kind: .heading, - range: tokenRange, - contentRange: content, - markerRanges: markerRanges)) - } - - // Fenced code blocks ```lang\n...\n``` - for match in codeBlockRegex.matches(in: text, options: [], range: fullRange) { - let full = match.range(at: 0) - let contentRange = match.range(at: 2) - let closingFence = match.range(at: 3) - let tokenEnd = closingFence.location + closingFence.length - let tokenRange = NSRange(location: full.location, length: tokenEnd - full.location) - let openingLength = max(3, min(contentRange.location - tokenRange.location, tokenRange.length)) - let openingMarker = NSRange(location: tokenRange.location, length: openingLength) - _ = contentRange.location + contentRange.length - let closingMarker = closingFence - - tokens.append(MarkdownToken(kind: .codeBlock, - range: tokenRange, - contentRange: contentRange, - markerRanges: 
[openingMarker, closingMarker])) - } - - // Block LaTeX $$...$$ (multiline) + // Block LaTeX $$...$$ — runs only against ranges outside fenced code. for match in blockLatexRegex.matches(in: text, options: [], range: fullRange) { let full = match.range(at: 0) - let inCode = tokens.contains { $0.kind == .codeBlock && NSIntersectionRange($0.range, full).length > 0 } - if inCode { continue } - + if isInsideFencedCode(range: full, blocks: blockResult.blocks) { continue } let content = match.range(at: 1) let openMarker = NSRange(location: full.location, length: 2) let closeMarker = NSRange(location: full.location + full.length - 2, length: 2) - tokens.append(MarkdownToken(kind: .blockLatex, - range: full, - contentRange: content, - markerRanges: [openMarker, closeMarker])) + inlineTokens.append(MarkdownToken(kind: .blockLatex, + range: full, + contentRange: content, + markerRanges: [openMarker, closeMarker])) } - // Inline code `code` + // Inline code `…` for match in inlineCodeRegex.matches(in: text, options: [], range: fullRange) { let full = match.range(at: 0) let content = match.range(at: 1) let openBacktick = NSRange(location: full.location, length: 1) let closeBacktick = NSRange(location: full.location + full.length - 1, length: 1) - tokens.append(MarkdownToken(kind: .inlineCode, - range: full, - contentRange: content, - markerRanges: [openBacktick, closeBacktick])) + inlineTokens.append(MarkdownToken(kind: .inlineCode, + range: full, + contentRange: content, + markerRanges: [openBacktick, closeBacktick])) } - // Inline LaTeX $formula$ + // Inline LaTeX $…$ for match in inlineLatexRegex.matches(in: text, options: [], range: fullRange) { let full = match.range(at: 0) let content = match.range(at: 1) - let isInsideBlock = tokens.contains { - ($0.kind == .codeBlock || $0.kind == .blockLatex) && - NSIntersectionRange($0.range, full).length > 0 - } - if isInsideBlock { continue } + if isInsideFencedCode(range: full, blocks: blockResult.blocks) { continue } + if 
isInsideBlockLatexInline(range: full, inlineTokens: inlineTokens) { continue } let contentString = nsText.substring(with: content) if !isInlineMathContent(contentString) { continue } let openDollar = NSRange(location: full.location, length: 1) let closeDollar = NSRange(location: full.location + full.length - 1, length: 1) - tokens.append(MarkdownToken(kind: .inlineLatex, - range: full, - contentRange: content, - markerRanges: [openDollar, closeDollar])) + inlineTokens.append(MarkdownToken(kind: .inlineLatex, + range: full, + contentRange: content, + markerRanges: [openDollar, closeDollar])) + } + + // ---------- Block-precedence filter ---------- + let allowedInline = inlineContainerRanges(from: blockResult.blocks) + for t in inlineTokens { + if rangeIsInside(t.range, anyOf: allowedInline) { + tokens.append(t) + } } return tokens } + // MARK: - Helpers used by parseTokens + + /// Content ranges of all blocks that allow inline tokenization. + private static func inlineContainerRanges(from blocks: [BlockSpan]) -> [NSRange] { + blocks.compactMap { $0.kind.allowsInlineContent ? $0.contentRange : nil } + } + + /// True when `range` is fully contained in any one of the allowed ranges. 
+ private static func rangeIsInside(_ range: NSRange, anyOf allowed: [NSRange]) -> Bool { + if allowed.isEmpty { return false } + let end = NSMaxRange(range) + for a in allowed { + if range.location >= a.location && end <= NSMaxRange(a) { + return true + } + } + return false + } + + private static func isInsideFencedCode(range: NSRange, blocks: [BlockSpan]) -> Bool { + for b in blocks { + if case .fencedCode = b.kind, NSIntersectionRange(b.range, range).length > 0 { + return true + } + } + return false + } + + private static func isInsideBlockLatexInline(range: NSRange, inlineTokens: [MarkdownToken]) -> Bool { + for t in inlineTokens where t.kind == .blockLatex { + if NSIntersectionRange(t.range, range).length > 0 { return true } + } + return false + } + + /// Trim a single trailing CR, LF, or CRLF from `range` (relative to `nsText`). + private static func trimTrailingNewline(_ range: NSRange, in nsText: NSString) -> NSRange { + var length = range.length + let end = range.location + length + if length >= 2, + nsText.character(at: end - 2) == 0x0D, + nsText.character(at: end - 1) == 0x0A { + length -= 2 + } else if length >= 1 { + let last = nsText.character(at: end - 1) + if last == 0x0A || last == 0x0D { length -= 1 } + } + return NSRange(location: range.location, length: length) + } + // MARK: - Code Block Helpers static func extractLanguage(from token: MarkdownToken, in text: String) -> String? 
{ diff --git a/Tests/MarkdownEngineTests/ParseTokensGoldenTests.swift b/Tests/MarkdownEngineTests/ParseTokensGoldenTests.swift index bd8e949..2af83c8 100644 --- a/Tests/MarkdownEngineTests/ParseTokensGoldenTests.swift +++ b/Tests/MarkdownEngineTests/ParseTokensGoldenTests.swift @@ -144,4 +144,43 @@ struct ParseTokensGoldenTests { let tokens = MarkdownTokenizer.parseTokens(in: text) #expect(tokens.filter { $0.kind == .codeBlock }.isEmpty) } + + // MARK: Phase-1 integration regressions + + @Test func parseTokensInternallyUsesBlockScanner() { + // After Phase 1, parseTokens still returns flat MarkdownToken array + // but produces .heading / .codeBlock tokens via the block scanner. + let text = "# Title\n\n```swift\nlet x = 1\n```\n\nBody **bold**." + let tokens = MarkdownTokenizer.parseTokens(in: text) + #expect(tokens.contains { $0.kind == .heading }) + #expect(tokens.contains { $0.kind == .codeBlock }) + #expect(tokens.contains { $0.kind == .bold }) + } + + @Test func wikiLinkInsideFencedCodeIsNotEmittedAfterRefactor() { + let text = "```\n[[NotALink]]\n```" + let tokens = MarkdownTokenizer.parseTokens(in: text) + let wiki = tokens.filter { $0.kind == .wikiLink } + #expect(wiki.isEmpty) + } + + @Test func imageEmbedInsideFencedCodeIsNotEmittedAfterRefactor() { + let text = "```\n![[picture.png]]\n```" + let tokens = MarkdownTokenizer.parseTokens(in: text) + let img = tokens.filter { $0.kind == .imageEmbed } + #expect(img.isEmpty) + } + + @Test func inlineCodeInsideFencedCodeIsNotEmittedAfterRefactor() { + let text = "```\nlet a = `b`\n```" + let tokens = MarkdownTokenizer.parseTokens(in: text) + let inlineCode = tokens.filter { $0.kind == .inlineCode } + #expect(inlineCode.isEmpty) + } + + @Test func emphasisInsideFencedCodeIsNotEmittedAfterRefactor() { + let text = "```\n**bold-looking**\n```" + let tokens = MarkdownTokenizer.parseTokens(in: text) + #expect(tokens.filter { $0.kind == .bold }.isEmpty) + } } From 6690ebca876f571c675da9721bbaa73994f631b2 Mon Sep 17 
00:00:00 2001 From: luca-chen198 Date: Thu, 14 May 2026 21:01:46 +0200 Subject: [PATCH 09/13] feat(parser): BlockVisitor protocol with default depth-first walk --- .../MarkdownEngine/Parser/BlockVisitor.swift | 30 ++++++++++++ .../BlockVisitorTests.swift | 47 +++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 Sources/MarkdownEngine/Parser/BlockVisitor.swift create mode 100644 Tests/MarkdownEngineTests/BlockVisitorTests.swift diff --git a/Sources/MarkdownEngine/Parser/BlockVisitor.swift b/Sources/MarkdownEngine/Parser/BlockVisitor.swift new file mode 100644 index 0000000..a3cfe5a --- /dev/null +++ b/Sources/MarkdownEngine/Parser/BlockVisitor.swift @@ -0,0 +1,30 @@ +// +// BlockVisitor.swift +// MarkdownEngine +// +// Forward-facing API for renderers / stylers / consumers that need to walk +// block structure. Phase-1 spans are always flat (children empty), but the +// default `walk` implementation already recurses so Phase 2's nested blocks +// (blockquotes, list items, table cells) work without changes to callers. +// +// Conform to `BlockVisitor` and implement `visit(_:depth:)`; call `walk(_:)` +// with the top-level block list. +// + +import Foundation + +protocol BlockVisitor { + mutating func visit(_ span: BlockSpan, depth: Int) +} + +extension BlockVisitor { + /// Traverse `blocks` depth-first, calling `visit` for each span. 
+ mutating func walk(_ blocks: [BlockSpan], depth: Int = 0) { + for span in blocks { + visit(span, depth: depth) + if !span.children.isEmpty { + walk(span.children, depth: depth + 1) + } + } + } +} diff --git a/Tests/MarkdownEngineTests/BlockVisitorTests.swift b/Tests/MarkdownEngineTests/BlockVisitorTests.swift new file mode 100644 index 0000000..7abdb17 --- /dev/null +++ b/Tests/MarkdownEngineTests/BlockVisitorTests.swift @@ -0,0 +1,47 @@ +import Testing +import Foundation +@testable import MarkdownEngine + +@Suite("BlockVisitor") +struct BlockVisitorTests { + + @Test func defaultWalkVisitsAllBlocksInOrder() { + let result = BlockScanner.scan("# A\n\nBody\n\n```\ncode\n```") + var visited: [BlockKind] = [] + struct Recorder: BlockVisitor { + var collect: (BlockKind) -> Void + func visit(_ span: BlockSpan, depth: Int) { + collect(span.kind) + } + } + var v = Recorder(collect: { visited.append($0) }) + v.walk(result.blocks) + #expect(visited.count == result.blocks.count) + // Top-level kinds must match block order. + for (i, b) in result.blocks.enumerated() { + #expect(visited[i] == b.kind) + } + } + + @Test func walkRecursesIntoChildren() { + // Phase 1 spans never have children, but the default walk must already + // recurse so Phase 2 nested blocks work without changes. 
+ let leaf = BlockSpan(kind: .paragraph, + range: NSRange(location: 10, length: 5), + contentRange: NSRange(location: 10, length: 5)) + let container = BlockSpan(kind: .blockquote, + range: NSRange(location: 0, length: 20), + contentRange: NSRange(location: 2, length: 18), + children: [leaf]) + var visited: [BlockKind] = [] + struct Recorder: BlockVisitor { + var collect: (BlockKind) -> Void + func visit(_ span: BlockSpan, depth: Int) { + collect(span.kind) + } + } + var v = Recorder(collect: { visited.append($0) }) + v.walk([container]) + #expect(visited == [.blockquote, .paragraph]) + } +} From ca60b87e5680d4957e8953ec4503f9d7ff24ee37 Mon Sep 17 00:00:00 2001 From: luca-chen198 Date: Thu, 14 May 2026 21:11:07 +0200 Subject: [PATCH 10/13] fix(parser): preserve extractLanguage + Setext heading level via marker semantics --- .../MarkdownEngine/Parser/BlockScanner.swift | 10 +++++-- .../ParseTokensGoldenTests.swift | 26 +++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/Sources/MarkdownEngine/Parser/BlockScanner.swift b/Sources/MarkdownEngine/Parser/BlockScanner.swift index 471098f..5b11605 100644 --- a/Sources/MarkdownEngine/Parser/BlockScanner.swift +++ b/Sources/MarkdownEngine/Parser/BlockScanner.swift @@ -145,11 +145,17 @@ enum BlockScanner { length: NSMaxRange(last) - first.location) let fullRange = NSRange(location: bufferRange.location, length: NSMaxRange(underlineLineRange) - bufferRange.location) + // First marker encodes heading level via length (matches the ATX + // convention `markerRanges[0].length == hashCount`), so existing + // stylers that derive level from this length keep working for + // Setext. The full underline range is kept as a secondary marker. 
+ let levelMarker = NSRange(location: underlineContentRange.location, + length: min(level, underlineContentRange.length)) blocks.append(BlockSpan( kind: .heading(level: level), range: fullRange, contentRange: bufferRange, - markerRanges: [underlineContentRange] + markerRanges: [levelMarker, underlineContentRange] )) paragraphBuffer.removeAll(keepingCapacity: true) } @@ -384,7 +390,7 @@ enum BlockScanner { kind: .fencedCode(language: opener.language), range: blockRange, contentRange: codeContentRange, - markerRanges: [opener.fenceRange, closingFence] + markerRanges: [openerLineRange, closingFence] ) state.blocks.append(block) return cursor diff --git a/Tests/MarkdownEngineTests/ParseTokensGoldenTests.swift b/Tests/MarkdownEngineTests/ParseTokensGoldenTests.swift index 2af83c8..f1eec6e 100644 --- a/Tests/MarkdownEngineTests/ParseTokensGoldenTests.swift +++ b/Tests/MarkdownEngineTests/ParseTokensGoldenTests.swift @@ -183,4 +183,30 @@ struct ParseTokensGoldenTests { let tokens = MarkdownTokenizer.parseTokens(in: text) #expect(tokens.filter { $0.kind == .bold }.isEmpty) } + + @Test func extractLanguageStillWorksForFencedCodeAfterRefactor() { + let text = """ + ```swift + let x = 1 + ``` + """ + let tokens = MarkdownTokenizer.parseTokens(in: text) + guard let codeToken = tokens.first(where: { $0.kind == .codeBlock }) else { + Issue.record("Expected a codeBlock token"); return + } + #expect(MarkdownTokenizer.extractLanguage(from: codeToken, in: text) == "swift") + } + + @Test func extractLanguageReturnsNilWhenNoLanguageTag() { + let text = """ + ``` + let x = 1 + ``` + """ + let tokens = MarkdownTokenizer.parseTokens(in: text) + guard let codeToken = tokens.first(where: { $0.kind == .codeBlock }) else { + Issue.record("Expected a codeBlock token"); return + } + #expect(MarkdownTokenizer.extractLanguage(from: codeToken, in: text) == nil) + } } From 1bcf396488f5c8a06ac7597012784b76dccaabb3 Mon Sep 17 00:00:00 2001 From: luca-chen198 Date: Mon, 18 May 2026 15:22:08 +0200 
Subject: [PATCH 11/13] refactor(parser): drop Setext heading support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Setext underline lookahead (`Title\n====` / `Title\n----` rewriting the buffered paragraph into a heading) absorbs all preceding paragraph lines per CommonMark §4.3 — which is unintuitive for casual notes: a user typing `---` as a visual separator unexpectedly promoted the prior N lines into a single H2. Bear, Apple Notes, and Notion don't support Setext for the same UX reason. ATX (`# Title`) covers the same use case unambiguously. Removed: - `setextUnderlineLevel` (entire function) - `rewriteBufferAsHeading` (entire function — only Setext used it) - The Setext lookahead in `BlockScanner.scan` - The `paragraphBuffer.isEmpty` gate on thematic-break detection (since Setext is gone, thematic breaks can now interrupt paragraphs per CommonMark §4.1 — buffer is flushed first) - 4 Setext-specific tests (`setextH1WithEqualsUnderline`, `setextH2WithDashUnderline`, `setextSpansMultipleParagraphLines`, `dashUnderlineAfterParagraphPrefersSetext`) - `dashesAloneWithoutParagraphAreNotConsumedAsHeading` (redundant with `thematicBreakWithDashes`) Added: - `dashUnderlineAfterParagraphInterruptsAsThematicBreak` — verifies that `Title\n---` is now paragraph + thematic break - `equalsUnderlineAfterParagraphIsNotAHeading` — verifies `Title\n===` stays a single multi-line paragraph 64 tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../MarkdownEngine/Parser/BlockScanner.swift | 81 +++---------------- .../BlockScannerTests.swift | 61 ++++---------- 2 files changed, 26 insertions(+), 116 deletions(-) diff --git a/Sources/MarkdownEngine/Parser/BlockScanner.swift b/Sources/MarkdownEngine/Parser/BlockScanner.swift index 5b11605..879d651 100644 --- a/Sources/MarkdownEngine/Parser/BlockScanner.swift +++ b/Sources/MarkdownEngine/Parser/BlockScanner.swift @@ -7,9 +7,13 @@ // map. 
The inline parser (MarkdownTokenizer.parseTokens) runs over the // content substring of each inline-allowing block. // -// Paragraph emission is buffered so Setext heading lookahead can rewrite -// the buffered paragraph into a heading when the next line is an -// underline (===, ---). +// Paragraph emission is buffered so consecutive paragraph lines collapse +// into a single `.paragraph` block, and so interrupting constructs +// (thematic break, blank line) can flush the buffer cleanly. +// +// Setext headings (`Title\n====` / `Title\n----`) are intentionally NOT +// supported — they're a CommonMark feature but Nodes prefers the ATX +// style (`# Title`) for unambiguous editing. // import Foundation @@ -61,20 +65,9 @@ enum BlockScanner { continue } - // Setext underline rewrites buffered paragraph into a heading. - if !state.paragraphBuffer.isEmpty, - let level = setextUnderlineLevel(contentRange: contentRange, in: nsText) { - state.rewriteBufferAsHeading(level: level, - underlineLineRange: lineRange, - underlineContentRange: contentRange) - lineStart = lineEnd - continue - } - - // Thematic break (only when no paragraph is being buffered — otherwise - // a `---` line would have already been claimed by Setext above). - if state.paragraphBuffer.isEmpty, - isThematicBreak(contentRange: contentRange, in: nsText) { + // Thematic break — interrupts any buffered paragraph (CommonMark §4.1). + if isThematicBreak(contentRange: contentRange, in: nsText) { + state.flushBufferedParagraph() state.blocks.append(BlockSpan( kind: .thematicBreak, range: lineRange, @@ -117,7 +110,8 @@ enum BlockScanner { let nsText: NSString var blocks: [BlockSpan] = [] var linkReferences: [String: LinkReference] = [:] - /// Buffered paragraph lines awaiting commit (Setext-heading lookahead). + /// Buffered paragraph lines awaiting commit (blank line or + /// interrupting block — thematic break — flushes them). 
var paragraphBuffer: [NSRange] = [] mutating func appendParagraphLine(_ lineRange: NSRange) { @@ -137,28 +131,6 @@ enum BlockScanner { paragraphBuffer.removeAll(keepingCapacity: true) } - mutating func rewriteBufferAsHeading(level: Int, - underlineLineRange: NSRange, - underlineContentRange: NSRange) { - guard let first = paragraphBuffer.first, let last = paragraphBuffer.last else { return } - let bufferRange = NSRange(location: first.location, - length: NSMaxRange(last) - first.location) - let fullRange = NSRange(location: bufferRange.location, - length: NSMaxRange(underlineLineRange) - bufferRange.location) - // First marker encodes heading level via length (matches the ATX - // convention `markerRanges[0].length == hashCount`), so existing - // stylers that derive level from this length keep working for - // Setext. The full underline range is kept as a secondary marker. - let levelMarker = NSRange(location: underlineContentRange.location, - length: min(level, underlineContentRange.length)) - blocks.append(BlockSpan( - kind: .heading(level: level), - range: fullRange, - contentRange: bufferRange, - markerRanges: [levelMarker, underlineContentRange] - )) - paragraphBuffer.removeAll(keepingCapacity: true) - } } // MARK: - Line iteration @@ -254,35 +226,6 @@ enum BlockScanner { ) } - // MARK: Setext - - /// Returns 1 for `===…`, 2 for `---…`, nil otherwise. CommonMark allows - /// up to 3 leading spaces and any trailing whitespace. - private static func setextUnderlineLevel(contentRange: NSRange, in nsText: NSString) -> Int? 
{ - let lineEnd = NSMaxRange(contentRange) - var i = contentRange.location - var leading = 0 - while i < lineEnd, nsText.character(at: i) == 0x20, leading < 4 { - i += 1; leading += 1 - } - if leading >= 4 { return nil } - guard i < lineEnd else { return nil } - let ch = nsText.character(at: i) - guard ch == 0x3D /* = */ || ch == 0x2D /* - */ else { return nil } - var count = 0 - while i < lineEnd, nsText.character(at: i) == ch { - i += 1; count += 1 - } - guard count >= 1 else { return nil } - // Only trailing whitespace allowed. - while i < lineEnd { - let c = nsText.character(at: i) - if c != 0x20 && c != 0x09 { return nil } - i += 1 - } - return ch == 0x3D ? 1 : 2 - } - // MARK: Fenced code private struct FencedCodeOpener { diff --git a/Tests/MarkdownEngineTests/BlockScannerTests.swift b/Tests/MarkdownEngineTests/BlockScannerTests.swift index 1089b06..dd14238 100644 --- a/Tests/MarkdownEngineTests/BlockScannerTests.swift +++ b/Tests/MarkdownEngineTests/BlockScannerTests.swift @@ -153,46 +153,6 @@ struct BlockScannerTests { #expect(result.blocks.allSatisfy { $0.kind == .paragraph }) } - // MARK: Setext heading - - @Test func setextH1WithEqualsUnderline() { - let text = "Title\n=====" - let result = BlockScanner.scan(text) - #expect(result.blocks.count == 1) - if case .heading(let level) = result.blocks.first?.kind { - #expect(level == 1) - } else { - Issue.record("Expected heading kind") - } - } - - @Test func setextH2WithDashUnderline() { - let text = "Title\n-----" - let result = BlockScanner.scan(text) - if case .heading(let level) = result.blocks.first?.kind { - #expect(level == 2) - } else { - Issue.record("Expected heading kind") - } - } - - @Test func setextSpansMultipleParagraphLines() { - let text = "Line one\nLine two\n===" - let result = BlockScanner.scan(text) - #expect(result.blocks.count == 1) - if case .heading = result.blocks.first?.kind { /* ok */ } else { Issue.record("Expected heading") } - } - - @Test func 
dashesAloneWithoutParagraphAreNotConsumedAsHeading() { - // Without a preceding paragraph, `---` does not become a heading via Setext. - // (Thematic-break recognition arrives in Task 6.) - let text = "\n---" - let result = BlockScanner.scan(text) - #expect(!result.blocks.contains(where: { - if case .heading = $0.kind { return true } else { return false } - })) - } - // MARK: Thematic break @Test func thematicBreakWithDashes() { @@ -217,14 +177,21 @@ struct BlockScannerTests { #expect(result.blocks[1].kind == .paragraph) } - @Test func dashUnderlineAfterParagraphPrefersSetext() { - // Setext H2 must win over thematic break when there's a buffered paragraph. + @Test func dashUnderlineAfterParagraphInterruptsAsThematicBreak() { + // Setext is intentionally disabled: `Title\n---` does NOT become a + // heading — it's a paragraph "Title" plus a thematic break. let result = BlockScanner.scan("Title\n---") - if case .heading(let lvl) = result.blocks.first?.kind { - #expect(lvl == 2) - } else { - Issue.record("Expected Setext H2") - } + #expect(result.blocks.count == 2) + #expect(result.blocks[0].kind == .paragraph) + #expect(result.blocks[1].kind == .thematicBreak) + } + + @Test func equalsUnderlineAfterParagraphIsNotAHeading() { + // Setext is intentionally disabled: `Title\n===` is one paragraph + // spanning two lines, not an H1. 
+ let result = BlockScanner.scan("Title\n===") + #expect(result.blocks.count == 1) + #expect(result.blocks[0].kind == .paragraph) } // MARK: Link reference definitions From d1d8c02f7f06056da963daeb256790bcd0338241 Mon Sep 17 00:00:00 2001 From: luca-chen198 Date: Mon, 18 May 2026 15:24:51 +0200 Subject: [PATCH 12/13] fix(parser): include hash/content whitespace in ATX heading marker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pre-Phase-1 regex-based heading parser stored the leading space between `#` and content as `markerRanges[1]`, so the marker-shrink pass collapsed it together with the hashes when the heading wasn't active. The new BlockScanner.atxHeading only emitted the hashes, leaving the space at full width — visible as a small gap before the heading text once the cursor moved away. Recapture the whitespace range and append it as a secondary marker, keeping the existing `markerRanges[0].length == level` invariant the stylers rely on. Co-Authored-By: Claude Opus 4.7 (1M context) --- Sources/MarkdownEngine/Parser/BlockScanner.swift | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Sources/MarkdownEngine/Parser/BlockScanner.swift b/Sources/MarkdownEngine/Parser/BlockScanner.swift index 879d651..3c4a058 100644 --- a/Sources/MarkdownEngine/Parser/BlockScanner.swift +++ b/Sources/MarkdownEngine/Parser/BlockScanner.swift @@ -208,7 +208,6 @@ enum BlockScanner { // Skip spaces between hashes and content let hashEnd = i - _ = hashEnd while i < lineEnd { let c = nsText.character(at: i) if c == 0x20 || c == 0x09 { i += 1 } else { break } @@ -218,11 +217,20 @@ enum BlockScanner { let cRange = NSRange(location: contentStart, length: max(0, contentEnd - contentStart)) let hashRange = NSRange(location: hashStart, length: hashCount) + // markerRanges[0] is the hashes (length == level, relied on by stylers). 
+ // markerRanges[1], when present, is the whitespace between hashes and + // content — included as a marker so it shrinks together with the + // hashes when the heading is inactive (no visible gap before text). + var markerRanges: [NSRange] = [hashRange] + if contentStart > hashEnd { + markerRanges.append(NSRange(location: hashEnd, length: contentStart - hashEnd)) + } + return BlockSpan( kind: .heading(level: hashCount), range: lineRange, contentRange: cRange, - markerRanges: [hashRange] + markerRanges: markerRanges ) } From 22baee8e035b0bac7c9f554a899fea778c7b01b7 Mon Sep 17 00:00:00 2001 From: luca-chen198 Date: Mon, 18 May 2026 15:55:49 +0200 Subject: [PATCH 13/13] test: remove Phase-1 unit tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drops BlockScannerTests, BlockSpanTests, BlockVisitorTests, and ParseTokensGoldenTests. The pre-Phase-1 MarkdownEngineDecouplingTests public-API contract suite stays — Phase 1 didn't add API surface. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../BlockScannerTests.swift | 222 ------------------ .../MarkdownEngineTests/BlockSpanTests.swift | 44 ---- .../BlockVisitorTests.swift | 47 ---- .../ParseTokensGoldenTests.swift | 212 ----------------- 4 files changed, 525 deletions(-) delete mode 100644 Tests/MarkdownEngineTests/BlockScannerTests.swift delete mode 100644 Tests/MarkdownEngineTests/BlockSpanTests.swift delete mode 100644 Tests/MarkdownEngineTests/BlockVisitorTests.swift delete mode 100644 Tests/MarkdownEngineTests/ParseTokensGoldenTests.swift diff --git a/Tests/MarkdownEngineTests/BlockScannerTests.swift b/Tests/MarkdownEngineTests/BlockScannerTests.swift deleted file mode 100644 index dd14238..0000000 --- a/Tests/MarkdownEngineTests/BlockScannerTests.swift +++ /dev/null @@ -1,222 +0,0 @@ -// -// BlockScannerTests.swift -// MarkdownEngineTests -// - -import Testing -import Foundation -@testable import MarkdownEngine - -@Suite("BlockScanner") -struct BlockScannerTests { 
- - // MARK: Paragraph - - @Test func singleParagraph() { - let result = BlockScanner.scan("Hello, world.") - #expect(result.blocks.count == 1) - if let first = result.blocks.first { - #expect(first.kind == .paragraph) - #expect(first.range == NSRange(location: 0, length: 13)) - } - } - - @Test func twoParagraphsSeparatedByBlankLine() { - let text = "First.\n\nSecond." - let result = BlockScanner.scan(text) - #expect(result.blocks.count == 2) - #expect(result.blocks.allSatisfy { $0.kind == .paragraph }) - } - - @Test func paragraphSpanningMultipleSoftLines() { - let text = "Line one\nLine two\nLine three" - let result = BlockScanner.scan(text) - #expect(result.blocks.count == 1) - #expect(result.blocks.first?.kind == .paragraph) - } - - @Test func emptyInputProducesNoBlocks() { - let result = BlockScanner.scan("") - #expect(result.blocks.isEmpty) - } - - @Test func whitespaceOnlyInputProducesNoBlocks() { - let result = BlockScanner.scan("\n \n\n") - #expect(result.blocks.isEmpty) - } - - // MARK: ATX headings - - @Test func atxHeadingLevel1() { - let result = BlockScanner.scan("# Title") - #expect(result.blocks.count == 1) - if case .heading(let level) = result.blocks.first?.kind { - #expect(level == 1) - } else { - Issue.record("Expected heading kind") - } - } - - @Test func atxHeadingLevel6() { - let result = BlockScanner.scan("###### Title") - if case .heading(let level) = result.blocks.first?.kind { - #expect(level == 6) - } else { - Issue.record("Expected heading kind") - } - } - - @Test func atxHeadingSevenHashesIsParagraph() { - // CommonMark: more than 6 # is not a heading. - let result = BlockScanner.scan("####### NotHeading") - #expect(result.blocks.first?.kind == .paragraph) - } - - @Test func atxHeadingWithoutSpaceIsParagraph() { - // CommonMark: `#title` (no space) is a paragraph. 
- let result = BlockScanner.scan("#NotHeading") - #expect(result.blocks.first?.kind == .paragraph) - } - - @Test func atxHeadingContentRangeExcludesHashAndSpace() { - let result = BlockScanner.scan("## Title") - let heading = result.blocks.first - #expect(heading?.contentRange == NSRange(location: 3, length: 5)) - } - - @Test func atxHeadingMarkerRangeCoversHashes() { - let result = BlockScanner.scan("### Title") - let heading = result.blocks.first - #expect(heading?.markerRanges.first == NSRange(location: 0, length: 3)) - } - - @Test func atxHeadingFollowedByParagraph() { - let text = "# Heading\n\nParagraph body" - let result = BlockScanner.scan(text) - #expect(result.blocks.count == 2) - if case .heading = result.blocks[0].kind { /* ok */ } else { Issue.record("first should be heading") } - #expect(result.blocks[1].kind == .paragraph) - } - - // MARK: Fenced code - - @Test func fencedCodeBlockNoLanguage() { - let text = "```\nlet x = 1\n```" - let result = BlockScanner.scan(text) - #expect(result.blocks.count == 1) - if case .fencedCode(let lang) = result.blocks.first?.kind { - #expect(lang == nil) - } else { - Issue.record("Expected fencedCode kind") - } - } - - @Test func fencedCodeBlockWithLanguage() { - let text = "```swift\nlet x = 1\n```" - let result = BlockScanner.scan(text) - if case .fencedCode(let lang) = result.blocks.first?.kind { - #expect(lang == "swift") - } else { - Issue.record("Expected fencedCode kind") - } - } - - @Test func fencedCodeContentRangeCoversOnlyBody() { - let text = "```\nbody\n```" - let result = BlockScanner.scan(text) - let block = result.blocks.first! 
- let body = (text as NSString).substring(with: block.contentRange) - #expect(body == "body\n") - } - - @Test func fencedCodeBlockMarkerRangesCoverBothFences() { - let text = "```\nbody\n```" - let result = BlockScanner.scan(text) - #expect(result.blocks.first?.markerRanges.count == 2) - } - - @Test func emphasisLikeContentInsideFencedCodeIsIgnoredByBlockKind() { - // Block scanner is responsible for marking content as "not inline" — - // the pipeline filter is exercised in the integration tests. - let text = "```\n**not bold**\n```" - let result = BlockScanner.scan(text) - let block = result.blocks.first! - #expect(!block.kind.allowsInlineContent) - } - - @Test func unclosedFencedCodeBlockFallsBackToParagraph() { - // No closing fence => current parseTokens treats it as plain text. - // Block scanner falls back to a single paragraph spanning the opening - // fence through the rest of the input. - let text = "```swift\nlet x = 1" - let result = BlockScanner.scan(text) - #expect(result.blocks.allSatisfy { $0.kind == .paragraph }) - } - - // MARK: Thematic break - - @Test func thematicBreakWithDashes() { - let result = BlockScanner.scan("---") - #expect(result.blocks.first?.kind == .thematicBreak) - } - - @Test func thematicBreakWithAsterisks() { - let result = BlockScanner.scan("***") - #expect(result.blocks.first?.kind == .thematicBreak) - } - - @Test func thematicBreakWithUnderscores() { - let result = BlockScanner.scan("___") - #expect(result.blocks.first?.kind == .thematicBreak) - } - - @Test func thematicBreakDoesNotConsumeFollowingParagraph() { - let result = BlockScanner.scan("---\n\nbody") - #expect(result.blocks.count == 2) - #expect(result.blocks[0].kind == .thematicBreak) - #expect(result.blocks[1].kind == .paragraph) - } - - @Test func dashUnderlineAfterParagraphInterruptsAsThematicBreak() { - // Setext is intentionally disabled: `Title\n---` does NOT become a - // heading — it's a paragraph "Title" plus a thematic break. 
- let result = BlockScanner.scan("Title\n---") - #expect(result.blocks.count == 2) - #expect(result.blocks[0].kind == .paragraph) - #expect(result.blocks[1].kind == .thematicBreak) - } - - @Test func equalsUnderlineAfterParagraphIsNotAHeading() { - // Setext is intentionally disabled: `Title\n===` is one paragraph - // spanning two lines, not an H1. - let result = BlockScanner.scan("Title\n===") - #expect(result.blocks.count == 1) - #expect(result.blocks[0].kind == .paragraph) - } - - // MARK: Link reference definitions - - @Test func linkReferenceDefinitionBasic() { - let text = "[foo]: https://example.com" - let result = BlockScanner.scan(text) - #expect(result.linkReferences["foo"]?.url == "https://example.com") - } - - @Test func linkReferenceDefinitionWithTitle() { - let text = "[foo]: https://example.com \"Example\"" - let result = BlockScanner.scan(text) - #expect(result.linkReferences["foo"]?.title == "Example") - } - - @Test func linkReferenceDefinitionCaseInsensitiveLabel() { - let text = "[FOO Bar]: https://example.com" - let result = BlockScanner.scan(text) - #expect(result.linkReferences["foo bar"] != nil) - } - - @Test func duplicateLinkReferenceFirstWins() { - let text = "[foo]: https://first.com\n[foo]: https://second.com" - let result = BlockScanner.scan(text) - #expect(result.linkReferences["foo"]?.url == "https://first.com") - } -} diff --git a/Tests/MarkdownEngineTests/BlockSpanTests.swift b/Tests/MarkdownEngineTests/BlockSpanTests.swift deleted file mode 100644 index 59ff441..0000000 --- a/Tests/MarkdownEngineTests/BlockSpanTests.swift +++ /dev/null @@ -1,44 +0,0 @@ -// -// BlockSpanTests.swift -// MarkdownEngineTests -// - -import Testing -import Foundation -@testable import MarkdownEngine - -@Suite("BlockSpan data model") -struct BlockSpanTests { - - @Test func leafBlockHasEmptyChildrenByDefault() { - let span = BlockSpan( - kind: .paragraph, - range: NSRange(location: 0, length: 5), - contentRange: NSRange(location: 0, length: 5), - 
markerRanges: [] - ) - #expect(span.children.isEmpty) - } - - @Test func headingKindCarriesLevel() { - let kind: BlockKind = .heading(level: 2) - if case .heading(let level) = kind { - #expect(level == 2) - } else { - Issue.record("Expected heading kind") - } - } - - @Test func linkReferenceHoldsLabelUrlAndTitle() { - let ref = LinkReference(label: "foo", url: "https://example.com", title: "Example") - #expect(ref.label == "foo") - #expect(ref.url == "https://example.com") - #expect(ref.title == "Example") - } - - @Test func linkReferenceLabelLowercasedKeyMatchesSpec() { - // CommonMark folds label case for matching; we normalize at construction. - let ref = LinkReference(label: " Foo Bar ", url: "x") - #expect(ref.normalizedLabel == "foo bar") - } -} diff --git a/Tests/MarkdownEngineTests/BlockVisitorTests.swift b/Tests/MarkdownEngineTests/BlockVisitorTests.swift deleted file mode 100644 index 7abdb17..0000000 --- a/Tests/MarkdownEngineTests/BlockVisitorTests.swift +++ /dev/null @@ -1,47 +0,0 @@ -import Testing -import Foundation -@testable import MarkdownEngine - -@Suite("BlockVisitor") -struct BlockVisitorTests { - - @Test func defaultWalkVisitsAllBlocksInOrder() { - let result = BlockScanner.scan("# A\n\nBody\n\n```\ncode\n```") - var visited: [BlockKind] = [] - struct Recorder: BlockVisitor { - var collect: (BlockKind) -> Void - func visit(_ span: BlockSpan, depth: Int) { - collect(span.kind) - } - } - var v = Recorder(collect: { visited.append($0) }) - v.walk(result.blocks) - #expect(visited.count == result.blocks.count) - // Top-level kinds must match block order. - for (i, b) in result.blocks.enumerated() { - #expect(visited[i] == b.kind) - } - } - - @Test func walkRecursesIntoChildren() { - // Phase 1 spans never have children, but the default walk must already - // recurse so Phase 2 nested blocks work without changes. 
- let leaf = BlockSpan(kind: .paragraph, - range: NSRange(location: 10, length: 5), - contentRange: NSRange(location: 10, length: 5)) - let container = BlockSpan(kind: .blockquote, - range: NSRange(location: 0, length: 20), - contentRange: NSRange(location: 2, length: 18), - children: [leaf]) - var visited: [BlockKind] = [] - struct Recorder: BlockVisitor { - var collect: (BlockKind) -> Void - func visit(_ span: BlockSpan, depth: Int) { - collect(span.kind) - } - } - var v = Recorder(collect: { visited.append($0) }) - v.walk([container]) - #expect(visited == [.blockquote, .paragraph]) - } -} diff --git a/Tests/MarkdownEngineTests/ParseTokensGoldenTests.swift b/Tests/MarkdownEngineTests/ParseTokensGoldenTests.swift deleted file mode 100644 index f1eec6e..0000000 --- a/Tests/MarkdownEngineTests/ParseTokensGoldenTests.swift +++ /dev/null @@ -1,212 +0,0 @@ -// -// ParseTokensGoldenTests.swift -// MarkdownEngineTests -// -// Locks the current public behavior of MarkdownTokenizer.parseTokens. -// Refactors must keep these green; new features add new fixtures. -// -// Block-precedence tests (no emphasis / wiki-link inside fenced code) live in -// the Phase-1 integration suite (ParseTokensBlockPhaseIntegrationTests), not -// here — those assertions describe the post-refactor behavior; the baseline -// snapshot must lock what the current regex parser actually emits. 
-// - -import Testing -import Foundation -@testable import MarkdownEngine - -@Suite("parseTokens golden output") -struct ParseTokensGoldenTests { - - // MARK: Headings - - @Test func atxHeadingsAllSixLevels() { - let text = """ - # H1 - ## H2 - ### H3 - #### H4 - ##### H5 - ###### H6 - """ - let tokens = MarkdownTokenizer.parseTokens(in: text) - let headings = tokens.filter { $0.kind == .heading } - #expect(headings.count == 6) - } - - @Test func headingFollowedByParagraphHasNoOverlap() { - let text = "# Title\n\nBody text\n" - let tokens = MarkdownTokenizer.parseTokens(in: text) - let headings = tokens.filter { $0.kind == .heading } - #expect(headings.count == 1) - let heading = headings[0] - #expect(NSMaxRange(heading.range) <= 7) // "# Title".count - } - - // MARK: Fenced code blocks - - @Test func fencedCodeBlockWithLanguageProducesCodeBlockToken() { - let text = """ - ```swift - let x = 42 - ``` - """ - let tokens = MarkdownTokenizer.parseTokens(in: text) - let code = tokens.filter { $0.kind == .codeBlock } - #expect(code.count == 1) - } - - // MARK: Inline (within paragraphs) - - @Test func boldEmphasisInParagraph() { - let text = "This is **bold** text." - let tokens = MarkdownTokenizer.parseTokens(in: text) - let bold = tokens.filter { $0.kind == .bold } - #expect(bold.count == 1) - } - - @Test func italicEmphasisInParagraph() { - let text = "This is *italic* text." - let tokens = MarkdownTokenizer.parseTokens(in: text) - let italic = tokens.filter { $0.kind == .italic } - #expect(italic.count == 1) - } - - @Test func wikiLinkInParagraph() { - let text = "See [[Other Note]] for more." - let tokens = MarkdownTokenizer.parseTokens(in: text) - let wiki = tokens.filter { $0.kind == .wikiLink } - #expect(wiki.count == 1) - } - - @Test func imageEmbedInParagraph() { - let text = "Look ![[picture.png]] here." 
- let tokens = MarkdownTokenizer.parseTokens(in: text) - let img = tokens.filter { $0.kind == .imageEmbed } - #expect(img.count == 1) - } - - @Test func inlineCodeInParagraph() { - let text = "Call `foo()` to do it." - let tokens = MarkdownTokenizer.parseTokens(in: text) - let code = tokens.filter { $0.kind == .inlineCode } - #expect(code.count == 1) - } - - @Test func markdownLinkInParagraph() { - let text = "Visit [Apple](https://apple.com) today." - let tokens = MarkdownTokenizer.parseTokens(in: text) - let link = tokens.filter { $0.kind == .link } - #expect(link.count == 1) - } - - // MARK: Mixed - - @Test func mixedContentPreservesAllTokenKinds() { - let text = """ - # Heading with **bold** - - Paragraph with *italic*, `code`, and [[wiki]]. - - ```swift - let x = 1 - ``` - - Trailing paragraph. - """ - let tokens = MarkdownTokenizer.parseTokens(in: text) - #expect(tokens.contains { $0.kind == .heading }) - #expect(tokens.contains { $0.kind == .bold }) - #expect(tokens.contains { $0.kind == .italic }) - #expect(tokens.contains { $0.kind == .inlineCode }) - #expect(tokens.contains { $0.kind == .wikiLink }) - #expect(tokens.contains { $0.kind == .codeBlock }) - } - - // MARK: Edge cases - - @Test func emptyDocumentReturnsNoTokens() { - let tokens = MarkdownTokenizer.parseTokens(in: "") - #expect(tokens.isEmpty) - } - - @Test func whitespaceOnlyDocumentReturnsNoTokens() { - let tokens = MarkdownTokenizer.parseTokens(in: "\n\n \n") - #expect(tokens.isEmpty) - } - - @Test func unclosedFencedCodeIsNotTokenizedAsCodeBlock() { - // Current behavior: the codeBlockRegex requires a closing fence. 
- let text = """ - ```swift - let x = 1 - """ - let tokens = MarkdownTokenizer.parseTokens(in: text) - #expect(tokens.filter { $0.kind == .codeBlock }.isEmpty) - } - - // MARK: Phase-1 integration regressions - - @Test func parseTokensInternallyUsesBlockScanner() { - // After Phase 1, parseTokens still returns flat MarkdownToken array - // but produces .heading / .codeBlock tokens via the block scanner. - let text = "# Title\n\n```swift\nlet x = 1\n```\n\nBody **bold**." - let tokens = MarkdownTokenizer.parseTokens(in: text) - #expect(tokens.contains { $0.kind == .heading }) - #expect(tokens.contains { $0.kind == .codeBlock }) - #expect(tokens.contains { $0.kind == .bold }) - } - - @Test func wikiLinkInsideFencedCodeIsNotEmittedAfterRefactor() { - let text = "```\n[[NotALink]]\n```" - let tokens = MarkdownTokenizer.parseTokens(in: text) - let wiki = tokens.filter { $0.kind == .wikiLink } - #expect(wiki.isEmpty) - } - - @Test func imageEmbedInsideFencedCodeIsNotEmittedAfterRefactor() { - let text = "```\n![[picture.png]]\n```" - let tokens = MarkdownTokenizer.parseTokens(in: text) - let img = tokens.filter { $0.kind == .imageEmbed } - #expect(img.isEmpty) - } - - @Test func inlineCodeInsideFencedCodeIsNotEmittedAfterRefactor() { - let text = "```\nlet a = `b`\n```" - let tokens = MarkdownTokenizer.parseTokens(in: text) - let inlineCode = tokens.filter { $0.kind == .inlineCode } - #expect(inlineCode.isEmpty) - } - - @Test func emphasisInsideFencedCodeIsNotEmittedAfterRefactor() { - let text = "```\n**bold-looking**\n```" - let tokens = MarkdownTokenizer.parseTokens(in: text) - #expect(tokens.filter { $0.kind == .bold }.isEmpty) - } - - @Test func extractLanguageStillWorksForFencedCodeAfterRefactor() { - let text = """ - ```swift - let x = 1 - ``` - """ - let tokens = MarkdownTokenizer.parseTokens(in: text) - guard let codeToken = tokens.first(where: { $0.kind == .codeBlock }) else { - Issue.record("Expected a codeBlock token"); return - } - 
#expect(MarkdownTokenizer.extractLanguage(from: codeToken, in: text) == "swift") - } - - @Test func extractLanguageReturnsNilWhenNoLanguageTag() { - let text = """ - ``` - let x = 1 - ``` - """ - let tokens = MarkdownTokenizer.parseTokens(in: text) - guard let codeToken = tokens.first(where: { $0.kind == .codeBlock }) else { - Issue.record("Expected a codeBlock token"); return - } - #expect(MarkdownTokenizer.extractLanguage(from: codeToken, in: text) == nil) - } -}