diff --git a/src/common/utils/git/diffParser.test.ts b/src/common/utils/git/diffParser.test.ts index 7847e254e4..54c2f4e996 100644 --- a/src/common/utils/git/diffParser.test.ts +++ b/src/common/utils/git/diffParser.test.ts @@ -121,6 +121,171 @@ describe("git diff parser (real repository)", () => { expect(nonPhantomLines.every((l) => l.startsWith("+"))).toBe(true); }); + it("should parse diff headers with non-literal path prefixes", () => { + const diffOutput = [ + "diff --git c/foo.ts w/foo.ts", + "index 1111111..2222222 100644", + "--- c/foo.ts", + "+++ w/foo.ts", + "@@ -1 +1 @@", + "-old line", + "+new line", + ].join("\n"); + + const fileDiffs = parseDiff(diffOutput); + const allHunks = extractAllHunks(fileDiffs); + + expect(fileDiffs).toHaveLength(1); + expect(fileDiffs[0].filePath).toBe("foo.ts"); + expect(fileDiffs[0].oldPath).toBeUndefined(); + expect(allHunks).toHaveLength(1); + expect(allHunks[0].content).toContain("+new line"); + }); + + it("should parse real diffs for paths with spaces", () => { + execSync("git reset --hard HEAD && git clean -fd", { cwd: testRepoPath }); + + const spacedFileName = "file with space.txt"; + writeFileSync(join(testRepoPath, spacedFileName), "before\n"); + execSync("git add . && git commit -m 'Add spaced file'", { cwd: testRepoPath }); + + writeFileSync(join(testRepoPath, spacedFileName), "after\n"); + + const diff = execSync("git diff HEAD", { cwd: testRepoPath, encoding: "utf-8" }); + const fileDiffs = parseDiff(diff); + + expect(fileDiffs).toHaveLength(1); + expect(fileDiffs[0].filePath).toBe(spacedFileName); + expect(fileDiffs[0].oldPath).toBeUndefined(); + expect(fileDiffs[0].hunks).toHaveLength(1); + expect(fileDiffs[0].hunks[0].filePath).toBe(spacedFileName); + expect(fileDiffs[0].hunks[0].content).toContain("+after"); + }); + + it("should preserve literal trailing spaces in diff path labels", () => { + execSync("git reset --hard HEAD && git clean -fd", { cwd: testRepoPath }); + + const trailingSpaceFileName = "trailing-space.txt "; + writeFileSync(join(testRepoPath, trailingSpaceFileName), "before\n"); + execSync("git add . && git commit -m 'Add trailing space file'", { cwd: testRepoPath }); + + rmSync(join(testRepoPath, trailingSpaceFileName)); + + const diff = execSync("git diff HEAD", { cwd: testRepoPath, encoding: "utf-8" }); + const fileDiffs = parseDiff(diff); + + expect(fileDiffs).toHaveLength(1); + expect(fileDiffs[0].changeType).toBe("deleted"); + expect(fileDiffs[0].filePath).toBe(trailingSpaceFileName); + expect(fileDiffs[0].oldPath).toBe(trailingSpaceFileName); + expect(fileDiffs[0].hunks).toHaveLength(1); + expect(fileDiffs[0].hunks[0].filePath).toBe(trailingSpaceFileName); + expect(fileDiffs[0].hunks[0].oldPath).toBe(trailingSpaceFileName); + expect(fileDiffs[0].hunks[0].content).toContain("-before"); + }); + + it("should normalize quoted patch labels for escaped file names", () => { + execSync("git reset --hard HEAD && git clean -fd", { cwd: testRepoPath }); + + const escapedFileName = 'tab\tquote"file.txt'; + writeFileSync(join(testRepoPath, escapedFileName), "before\n"); + execSync("git add . && git commit -m 'Add escaped path file'", { cwd: testRepoPath }); + + rmSync(join(testRepoPath, escapedFileName)); + + const diff = execSync("git diff HEAD", { cwd: testRepoPath, encoding: "utf-8" }); + const fileDiffs = parseDiff(diff); + + expect(fileDiffs).toHaveLength(1); + expect(fileDiffs[0].changeType).toBe("deleted"); + expect(fileDiffs[0].filePath).toBe(escapedFileName); + expect(fileDiffs[0].oldPath).toBe(escapedFileName); + expect(fileDiffs[0].hunks).toHaveLength(1); + expect(fileDiffs[0].hunks[0].oldPath).toBe(escapedFileName); + expect(fileDiffs[0].hunks[0].content).toContain("-before"); + }); + + it("should preserve nested paths in --no-prefix diffs", () => { + execSync("git reset --hard HEAD && git clean -fd", { cwd: testRepoPath }); + execSync("mkdir -p src/common", { cwd: testRepoPath }); + writeFileSync(join(testRepoPath, "src", "common", "no-prefix.ts"), "old line\n"); + execSync("git add src/common/no-prefix.ts && git commit -m 'Add nested no-prefix file'", { + cwd: testRepoPath, + }); + + writeFileSync(join(testRepoPath, "src", "common", "no-prefix.ts"), "new line\n"); + + const diff = execSync("git diff --no-prefix HEAD", { cwd: testRepoPath, encoding: "utf-8" }); + const fileDiffs = parseDiff(diff); + + expect(fileDiffs).toHaveLength(1); + expect(fileDiffs[0].filePath).toBe("src/common/no-prefix.ts"); + expect(fileDiffs[0].oldPath).toBeUndefined(); + expect(fileDiffs[0].hunks).toHaveLength(1); + expect(fileDiffs[0].hunks[0].content).toContain("+new line"); + }); + + it("should parse real mnemonic-prefix diffs", () => { + execSync("git reset --hard HEAD && git clean -fd", { cwd: testRepoPath }); + execSync("mkdir -p src/mnemonic", { cwd: testRepoPath }); + writeFileSync(join(testRepoPath, "src", "mnemonic", "real.ts"), "before\n"); + execSync("git add src/mnemonic/real.ts && git commit -m 'Add mnemonic test file'", { + cwd: testRepoPath, + }); + + writeFileSync(join(testRepoPath, "src", "mnemonic", "real.ts"), "after\n"); + + const diff = execSync("git -c diff.mnemonicPrefix=true diff HEAD", { + cwd: testRepoPath, + encoding: "utf-8", + }); + const fileDiffs = parseDiff(diff); + + expect(fileDiffs).toHaveLength(1); + expect(fileDiffs[0].filePath).toBe("src/mnemonic/real.ts"); + expect(fileDiffs[0].oldPath).toBeUndefined(); + expect(fileDiffs[0].hunks[0].content).toContain("+after"); + }); + + it("should parse --no-prefix additions that use /dev/null", () => { + execSync("git reset --hard HEAD && git clean -fd", { cwd: testRepoPath }); + execSync("mkdir -p nested/dir", { cwd: testRepoPath }); + writeFileSync(join(testRepoPath, "nested", "dir", "added-no-prefix.ts"), "added\n"); + execSync("git add nested/dir/added-no-prefix.ts", { cwd: testRepoPath }); + + const diff = execSync("git diff --cached --no-prefix", { + cwd: testRepoPath, + encoding: "utf-8", + }); + const fileDiffs = parseDiff(diff); + + expect(fileDiffs).toHaveLength(1); + expect(fileDiffs[0].changeType).toBe("added"); + expect(fileDiffs[0].filePath).toBe("nested/dir/added-no-prefix.ts"); + expect(fileDiffs[0].oldPath).toBeUndefined(); + expect(fileDiffs[0].hunks[0].header).toMatch(/^@@ -0,0 \+1(?:,1)? @@/); + }); + + it("should parse --no-prefix deletions that use /dev/null", () => { + execSync("git reset --hard HEAD && git clean -fd", { cwd: testRepoPath }); + execSync("mkdir -p deleted/nested", { cwd: testRepoPath }); + writeFileSync(join(testRepoPath, "deleted", "nested", "gone.ts"), "gone\n"); + execSync("git add deleted/nested/gone.ts && git commit -m 'Add nested deleted file'", { + cwd: testRepoPath, + }); + + execSync("rm deleted/nested/gone.ts", { cwd: testRepoPath }); + + const diff = execSync("git diff --no-prefix HEAD", { cwd: testRepoPath, encoding: "utf-8" }); + const fileDiffs = parseDiff(diff); + + expect(fileDiffs).toHaveLength(1); + expect(fileDiffs[0].changeType).toBe("deleted"); + expect(fileDiffs[0].filePath).toBe("deleted/nested/gone.ts"); + expect(fileDiffs[0].oldPath).toBe("deleted/nested/gone.ts"); + expect(fileDiffs[0].hunks[0].content).toContain("-gone"); + }); + it("should normalize CRLF diff output (no \\r in hunk content)", () => { const diffOutput = [ @@ -154,10 +319,10 @@ describe("git diff parser (real repository)", () => { }); it("should parse file deletion", () => { - // Reset and commit newfile - execSync("git add . && git commit -m 'Add newfile'", { cwd: testRepoPath }); + execSync("git reset --hard HEAD && git clean -fd", { cwd: testRepoPath }); + writeFileSync(join(testRepoPath, "newfile.md"), "# New File\n\nContent here\n"); + execSync("git add newfile.md && git commit -m 'Add newfile'", { cwd: testRepoPath }); - // Delete file execSync("rm newfile.md", { cwd: testRepoPath }); const diff = execSync("git diff HEAD", { cwd: testRepoPath, encoding: "utf-8" }); @@ -274,6 +439,28 @@ describe("git diff parser (real repository)", () => { expect(allHunks.every((h) => h.id && h.id.length > 0)).toBe(true); }); + it("should normalize quoted rename metadata paths", () => { + execSync("git reset --hard HEAD && git clean -fd", { cwd: testRepoPath }); + + const originalFileName = 'tab\tquote"name.txt'; + const renamedFileName = 'tab\tquote"renamed.txt'; + writeFileSync(join(testRepoPath, originalFileName), "before\n"); + execSync("git add . && git commit -m 'Add quoted rename source file'", { cwd: testRepoPath }); + + execSync(`git mv '${originalFileName}' '${renamedFileName}'`, { + cwd: testRepoPath, + }); + + const diff = execSync("git diff --cached -M", { cwd: testRepoPath, encoding: "utf-8" }); + const fileDiffs = parseDiff(diff); + + expect(fileDiffs).toHaveLength(1); + expect(fileDiffs[0].changeType).toBe("renamed"); + expect(fileDiffs[0].filePath).toBe(renamedFileName); + expect(fileDiffs[0].oldPath).toBe(originalFileName); + expect(fileDiffs[0].hunks).toHaveLength(0); + }); + it("should handle pure file rename (no content changes)", () => { // Reset execSync("git reset --hard HEAD", { cwd: testRepoPath }); diff --git a/src/common/utils/git/diffParser.ts b/src/common/utils/git/diffParser.ts index 85e04621a8..e5a7ba96e6 100644 --- a/src/common/utils/git/diffParser.ts +++ b/src/common/utils/git/diffParser.ts @@ -48,6 +48,238 @@ function parseHunkHeader(line: string): { }; } +interface ParsedDiffPathLabel { + raw: string; + prefix: string | null; + path: string | null; +} + +function decodeGitCString(value: string): string { + const bytes: number[] = []; + const encoder = new TextEncoder(); + const decoder = new TextDecoder(); + + for (let index = 0; index < value.length; index++) { + const char = value[index]; + if (char !== "\\") { + bytes.push(...encoder.encode(char)); + continue; + } + + index += 1; + if (index >= value.length) { + bytes.push(0x5c); + break; + } + + const escape = value[index]; + if (/[0-7]/.test(escape)) { + let octal = escape; + while (index + 1 < value.length && octal.length < 3 && /[0-7]/.test(value[index + 1])) { + index += 1; + octal += value[index]; + } + bytes.push(parseInt(octal, 8)); + continue; + } + + switch (escape) { + case "a": + bytes.push(0x07); + break; + case "b": + bytes.push(0x08); + break; + case "t": + bytes.push(0x09); + break; + case "n": + bytes.push(0x0a); + break; + case "v": + bytes.push(0x0b); + break; + case "f": + bytes.push(0x0c); + break; + case "r": + bytes.push(0x0d); + break; + case '"': + bytes.push(0x22); + break; + case "\\": + bytes.push(0x5c); + break; + default: + bytes.push(...encoder.encode(escape)); + break; + } + } + + return decoder.decode(new Uint8Array(bytes)); +} + +function parseQuotedDiffLabel( + value: string, + startIndex = 0 +): { rawLabel: string; label: string; nextIndex: number } | null { + if (value[startIndex] !== '"') { + return null; + } + + let escaped = false; + for (let index = startIndex + 1; index < value.length; index++) { + const char = value[index]; + if (!escaped && char === '"') { + return { + rawLabel: value.slice(startIndex, index + 1), + label: decodeGitCString(value.slice(startIndex + 1, index)), + nextIndex: index + 1, + }; + } + + escaped = !escaped && char === "\\"; + } + + return null; +} + +function normalizeDiffPathLabel(label: string | undefined): string | undefined { + if (label == null) { + return undefined; + } + + const quotedLabel = parseQuotedDiffLabel(label); + if (quotedLabel) { + return quotedLabel.label; + } + + // Preserve literal trailing spaces in file names while still stripping the tab separator + // Git appends to unquoted patch labels before any optional timestamp metadata. + const tabIndex = label.indexOf("\t"); + const normalizedLabel = tabIndex === -1 ? label : label.slice(0, tabIndex); + return normalizedLabel.length === 0 ? undefined : normalizedLabel; +} + +function parseDiffPathLabel(label: string | undefined): ParsedDiffPathLabel | null { + const normalizedLabel = normalizeDiffPathLabel(label); + if (normalizedLabel == null) { + return null; + } + + if (normalizedLabel === "/dev/null") { + return { + raw: normalizedLabel, + prefix: null, + path: null, + }; + } + + const slashIndex = normalizedLabel.indexOf("/"); + if (slashIndex === -1) { + return { + raw: normalizedLabel, + prefix: null, + path: normalizedLabel, + }; + } + + return { + raw: normalizedLabel, + prefix: normalizedLabel.slice(0, slashIndex), + path: normalizedLabel.slice(slashIndex + 1), + }; +} + +function parseDiffGitHeaderLabels(line: string): { + oldLabel: string | undefined; + newLabel: string | undefined; +} { + const labelSection = line.slice("diff --git ".length); + const quotedOldLabel = parseQuotedDiffLabel(labelSection); + if (quotedOldLabel) { + const separatorMatch = /\s+/.exec(labelSection.slice(quotedOldLabel.nextIndex)); + if (separatorMatch) { + const quotedNewLabel = parseQuotedDiffLabel( + labelSection, + quotedOldLabel.nextIndex + separatorMatch.index + separatorMatch[0].length + ); + if (quotedNewLabel) { + return { + oldLabel: quotedOldLabel.rawLabel, + newLabel: quotedNewLabel.rawLabel, + }; + } + } + } + + const simpleLabels = labelSection.split(" "); + if (simpleLabels.length === 2) { + return { + oldLabel: simpleLabels[0], + newLabel: simpleLabels[1], + }; + } + + // Git can emit unquoted `diff --git` headers for paths with spaces, so look for the split + // that preserves the paired path on both sides instead of tokenizing on every space. + for (let index = 0; index < labelSection.length; index++) { + if (labelSection[index] !== " ") { + continue; + } + + const oldLabel = labelSection.slice(0, index); + const newLabel = labelSection.slice(index + 1); + const parsedOldLabel = parseDiffPathLabel(oldLabel); + const parsedNewLabel = parseDiffPathLabel(newLabel); + if ( + parsedOldLabel?.path != null && + parsedNewLabel?.path != null && + parsedOldLabel.path === parsedNewLabel.path + ) { + return { + oldLabel, + newLabel, + }; + } + } + + return { + oldLabel: undefined, + newLabel: undefined, + }; +} + +function choosePairedDiffLabel( + primaryLabel: string | undefined, + fallbackLabel: string | undefined +): string | undefined { + return primaryLabel != null && primaryLabel !== "/dev/null" ? primaryLabel : fallbackLabel; +} + +function canonicalizeDiffPathLabel( + label: string | undefined, + pairedLabel: string | undefined +): string | undefined { + const parsedLabel = parseDiffPathLabel(label); + if (parsedLabel?.path == null) { + return undefined; + } + + const parsedPair = parseDiffPathLabel(pairedLabel); + if ( + parsedLabel.prefix && + parsedPair?.prefix && + parsedLabel.prefix !== parsedPair.prefix && + parsedLabel.path === parsedPair.path + ) { + return parsedLabel.path; + } + + return parsedLabel.raw; +} + /** * Parse unified diff output into structured file diffs with hunks * Supports standard git diff format with file headers and hunk markers @@ -63,6 +295,52 @@ export function parseDiff(diffOutput: string): FileDiff[] { let currentFile: FileDiff | null = null; let currentHunk: Partial | null = null; let hunkLines: string[] = []; + let currentHeaderOldLabel: string | undefined; + let currentHeaderNewLabel: string | undefined; + let currentPatchOldLabel: string | undefined; + let currentPatchNewLabel: string | undefined; + let currentRenameFrom: string | undefined; + let currentRenameTo: string | undefined; + + const syncCurrentFilePaths = () => { + if (!currentFile) { + return; + } + + const resolvedOldPath = + currentRenameFrom ?? + canonicalizeDiffPathLabel( + currentPatchOldLabel ?? currentHeaderOldLabel, + choosePairedDiffLabel(currentPatchNewLabel, currentHeaderNewLabel) + ); + const resolvedNewPath = + currentRenameTo ?? + canonicalizeDiffPathLabel( + currentPatchNewLabel ?? currentHeaderNewLabel, + choosePairedDiffLabel(currentPatchOldLabel, currentHeaderOldLabel) + ); + const filePath = resolvedNewPath ?? resolvedOldPath; + if (filePath) { + currentFile.filePath = filePath; + } + + currentFile.oldPath = + resolvedOldPath && + (currentFile.changeType === "deleted" || + currentFile.changeType === "renamed" || + (resolvedNewPath != null && resolvedOldPath !== resolvedNewPath)) + ? resolvedOldPath + : undefined; + }; + + const resetCurrentFileLabels = () => { + currentHeaderOldLabel = undefined; + currentHeaderNewLabel = undefined; + currentPatchOldLabel = undefined; + currentPatchNewLabel = undefined; + currentRenameFrom = undefined; + currentRenameTo = undefined; + }; const finishHunk = () => { if (currentHunk && currentFile && hunkLines.length > 0) { @@ -89,29 +367,29 @@ export function parseDiff(diffOutput: string): FileDiff[] { const finishFile = () => { finishHunk(); if (currentFile) { + syncCurrentFilePaths(); files.push(currentFile); currentFile = null; } + resetCurrentFileLabels(); }; for (const line of lines) { - // File header: diff --git a/... b/... + // File header: git emits path labels here, but they are not guaranteed to be literal a/ and b/. if (line.startsWith("diff --git ")) { finishFile(); - // Extract file paths from "diff --git a/path b/path" - const regex = /^diff --git a\/(.+) b\/(.+)$/; - const match = regex.exec(line); - if (match) { - const oldPath = match[1]; - const newPath = match[2]; - currentFile = { - filePath: newPath, - oldPath: oldPath !== newPath ? oldPath : undefined, - changeType: "modified", - isBinary: false, - hunks: [], - }; - } + // Extract the paired labels without assuming literal a/ and b/ prefixes or space-free paths. + const parsedHeaderLabels = parseDiffGitHeaderLabels(line); + currentHeaderOldLabel = parsedHeaderLabels.oldLabel; + currentHeaderNewLabel = parsedHeaderLabels.newLabel; + currentFile = { + filePath: "", + oldPath: undefined, + changeType: "modified", + isBinary: false, + hunks: [], + }; + syncCurrentFilePaths(); continue; } @@ -126,18 +404,40 @@ export function parseDiff(diffOutput: string): FileDiff[] { // New file mode if (line.startsWith("new file mode ")) { currentFile.changeType = "added"; + syncCurrentFilePaths(); continue; } // Deleted file mode if (line.startsWith("deleted file mode ")) { currentFile.changeType = "deleted"; + syncCurrentFilePaths(); + continue; + } + + if (!currentHunk && line.startsWith("--- ")) { + currentPatchOldLabel = line.slice(4); + syncCurrentFilePaths(); + continue; + } + + if (!currentHunk && line.startsWith("+++ ")) { + currentPatchNewLabel = line.slice(4); + syncCurrentFilePaths(); + continue; + } + + if (line.startsWith("rename from ")) { + currentFile.changeType = "renamed"; + currentRenameFrom = normalizeDiffPathLabel(line.slice("rename from ".length)); + syncCurrentFilePaths(); continue; } - // Rename marker - if (line.startsWith("rename from ") || line.startsWith("rename to ")) { + if (line.startsWith("rename to ")) { currentFile.changeType = "renamed"; + currentRenameTo = normalizeDiffPathLabel(line.slice("rename to ".length)); + syncCurrentFilePaths(); continue; }