From add4c98ab2693a70878de02bffd2c8e41a0fc8b5 Mon Sep 17 00:00:00 2001 From: Jo Humphrey <31373245+jamdelion@users.noreply.github.com> Date: Thu, 30 Apr 2026 14:40:52 +0100 Subject: [PATCH 1/7] Add transformations doc --- README.md | 4 ++ doc/transformations.md | 111 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 doc/transformations.md diff --git a/README.md b/README.md index 6647096..fbd4615 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,10 @@ Note - NTTT will work on Windows, macOS and Linux. +## Documentation + +For maintainers, [doc/transformations.md](doc/transformations.md) describes what NTTT changes in `meta.yml` and Markdown files (sections, HTML, formatting, URLs, and related behaviour). + ## Prerequisites The tool requires having Python 3.7 or newer. diff --git a/doc/transformations.md b/doc/transformations.md new file mode 100644 index 0000000..733a174 --- /dev/null +++ b/doc/transformations.md @@ -0,0 +1,111 @@ +# NTTT: transformations reference + +This document describes what **Nina's Translation Tidy-up Tool (NTTT)** changes on disk, so maintainers know what to expect and where to look in code. + +## Scope + +- **Inputs:** Files under the chosen **input** directory. The tool collects every `meta.yml` and every `*.md` (see `find_files` in [`nttt/utilities.py`](../nttt/utilities.py)). +- **English reference:** A parallel tree (default: `INPUT/../en`) used for `meta.yml` sync and optional section-tag revert. +- **Outputs:** Corresponding paths under the **output** directory (created as needed). After processing, **missing** files/folders can be copied from input and English (`add_missing_entries`). + +NTTT does **not** process standalone `.html` files. HTML-related steps run on **HTML inside Markdown**. + +--- + +## High-level pipeline (`fix_md_step`) + +For each `.md` file, [`nttt/tidyup.py`](../nttt/tidyup.py) applies, in order: + +1. 
**`fix_sections`** — normalise `---` section lines (Crowdin quirks). +2. **`revert_section_translation`** — optional; restore English section tag lines when structure matches. +3. **`trim_md_tags`** — strip padding inside paired Markdown delimiters (outside ` ``` ` fences). +4. **`trim_html_tags`** — strip padding inside simple inline HTML tags (outside single `` ` `` spans). +5. **`trim_formatting_tags`** — normalise `{ … }` attribute blocks after a word (Scratch/Pico-style). +6. **URL rewrite:** replace `/en/` with `/<language>/` everywhere in the file body. + +Steps 1–5 can be skipped via **`--disable`** (see [`nttt/arguments.py`](../nttt/arguments.py)). + +`meta.yml` is handled separately by **`fix_meta`** (YAML round-trip, revert non-translatable keys from English). This doc focuses on Markdown/HTML-style transforms. + +--- + +## 1. Section markers (`nttt/cleanup_sections.py`) + +**Function:** `fix_sections` + +| Behaviour | Purpose | +|-----------|---------| +| Replace `\---` with `---` | Crowdin sometimes escapes section markers. | +| Normalise `--` / `---` wrappers around section names | Fix missing dash or inconsistent spacing; target form **`--- <tag> ---`**. Tags allow word chars, digits, hyphens, and certain Unicode space characters inside the name. | +| Normalise closing sections | **`--- /tag ---`** — removes extra spaces between `/` and the tag name. | +| Split jammed section lines | Restore newline between adjacent **`--- … ---`** lines when Crowdin merges them (e.g. hints/hint); regex also tolerates some translator edits. | +| Repair broken collapse/title blocks | Restore **`--- collapse ---`** plus YAML-style **`title:`** block when Crowdin breaks the structure; colons may be ASCII or full-width (`:`). | + +**Function:** `revert_section_translation` (requires English `.md`) + +- Collects lines matching **`--- <tag> ---`** in translation and English. 
+- If **counts match**, replaces each translated section line with the **English** line at the same index (keeps English tag names, e.g. `task` vs translated word). +- If counts differ, logs a **warning** to stderr and leaves the file unchanged for this step. + +--- + +## 2. Markdown delimiters (`nttt/cleanup_markdown.py`) + +**Function:** `trim_md_tags` + +- Splits content on **` ``` `** (triple backtick). **`apply_to_every_other_part`** runs trimming only on segments **outside** fenced blocks (indices 0, 2, 4, …); fence interiors are untouched. +- Per line outside fences: + - **List lines:** odd number of `*` and line starts with `*` after `lstrip` → only the substring **after the first `*`** is trimmed (preserves the bullet marker). + - Otherwise the **whole line** is trimmed. +- **Trim rule:** regex finds paired **`` ` ``**, **`_` … `___`**, or **`*` … `***`** wrapping content; inner content is **`.strip()`**; delimiters unchanged. + +Logging can record each replacement (`log_replacement`). + +--- + +## 3. Inline HTML (`nttt/cleanup_html.py`) + +**Function:** `trim_html_tags` + +- Splits on **single** `` ` ``. Only **even-index** segments are processed; **inline code** segments are preserved. +- Matches **paired** tags: `<tagName>…</tagName>` where `tagName` is **word characters + digits only** (no hyphenated custom elements in the pattern). Inner HTML is **`.strip()`**. +- Does **not** handle attributes on the opening tag, self-closing tags, or arbitrary XML namespaces. + +--- + +## 4. Formatting braces (`nttt/cleanup_formatting.py`) + +**Function:** `trim_formatting_tags` + +- Single-pass regex over the **entire** file (no code-fence splitting). +- Targets patterns like **`word { … key = "value" … }`** with flexible Unicode spaces, colons, and quotes (see [`nttt/constants.py`](../nttt/constants.py) `RegexConstants`). +- **Lowercases** the attribute name and value. +- Normalises "blank" link targets: values matching **`_` + spaces + `blank`** → **`_blank`**. + +--- + +## 5. 
Locale URLs (`nttt/tidyup.py`) + +After cleanup: **replace every `/en/` with `/<language>/`** in the Markdown file (`language` from resolved CLI args, defaulting from input folder basename). + +--- + +## Operational notes + +- **Confirmation:** Unless **`-Y`**, the tool lists files and waits for **`y`** before writing. +- **Volunteer acknowledgements / missing files:** Separate from Markdown transforms; see `add_volunteer_acknowledgement` and `add_missing_entries` in [`nttt/tidyup.py`](../nttt/tidyup.py). +- **Logging:** Several modules accept a `logging` object for replacement traces (`nttt_logging`). + +--- + +## Quick code map + +| Concern | Module | +|---------|--------| +| Orchestration | `nttt/tidyup.py`, `nttt/__init__.py` | +| CLI / disable flags | `nttt/arguments.py` | +| Sections | `nttt/cleanup_sections.py` | +| Markdown emphasis / code delimiters | `nttt/cleanup_markdown.py` | +| Inline HTML | `nttt/cleanup_html.py` | +| Brace attributes | `nttt/cleanup_formatting.py` | +| Split "every other segment" | `nttt/utilities.py` → `apply_to_every_other_part` | From e95e39310e1f7ed3c99ff5c0156e5cb203771ee8 Mon Sep 17 00:00:00 2001 From: Jo Humphrey <31373245+jamdelion@users.noreply.github.com> Date: Thu, 30 Apr 2026 16:56:29 +0100 Subject: [PATCH 2/7] Get AI to implement everything --- README.md | 29 ++++ doc/transformations.md | 78 ++++++++-- nttt/__init__.py | 59 ++++++++ nttt/arguments.py | 83 +++++++---- nttt/cleanup_alerts.py | 100 +++++++++++++ nttt/cleanup_codeblocks.py | 130 +++++++++++++++++ nttt/restore.py | 94 +++++++++++++ nttt/strip.py | 203 +++++++++++++++++++++++++++ nttt/tidyup.py | 11 ++ unit_test/test_cleanup_alerts.py | 39 +++++ unit_test/test_cleanup_codeblocks.py | 40 ++++++ unit_test/test_strip_restore.py | 106 ++++++++++++++ 12 files changed, 937 insertions(+), 35 deletions(-) create mode 100644 nttt/cleanup_alerts.py create mode 100644 nttt/cleanup_codeblocks.py create mode 100644 nttt/restore.py create mode 100644 nttt/strip.py create mode 100644 
unit_test/test_cleanup_alerts.py create mode 100644 unit_test/test_cleanup_codeblocks.py create mode 100644 unit_test/test_strip_restore.py diff --git a/README.md b/README.md index fbd4615..09c374b 100644 --- a/README.md +++ b/README.md @@ -106,6 +106,23 @@ You can specify different directories for the input and output folder using the nttt --input c:\path\to\project\de-DE --output c:\path\to\project\de-DE-tidy ``` +### Crowdin strip and restore + +NTTT can also prepare English source for Crowdin and restore non-translatable markers after translated content is downloaded. + +```bash +# Before upload: hide non-translatable markers from Crowdin. +nttt strip --input en --output .crowdin-staging/en + +# After download: restore markers into a translated locale. +nttt restore --input fr-FR --english en --output fr-FR + +# Restore and then run the usual tidy-up in one command. +nttt restore --input fr-FR --english en --output fr-FR --then-tidyup --Yes on +``` + +`strip` replaces legacy section markers, modern alert type tokens, codeblock info strings, inline kramdown class metadata, and non-translatable `meta.yml` keys with deterministic placeholders. `restore` regenerates the placeholder map from `en/`, so no sidecar files need to be persisted between workflows. See [doc/transformations.md](doc/transformations.md) for details. + ### Help To bring up full usage information use the `-h`/`--help` option. @@ -114,6 +131,8 @@ To bring up full usage information use the `-h`/`--help` option. nttt -h usage: nttt [-h] [-i INPUT] [-o OUTPUT] [-e ENGLISH] [-l LANGUAGE] [-v VOLUNTEERS] [-f FINAL] + [-D DISABLE] [-L LOGGING] [-Y YES] + {strip,restore} ... Nina's Translation Tidyup Tool @@ -138,6 +157,16 @@ optional arguments: -f FINAL, --final FINAL The number of the final step file, defaults to the step file with the highest number. + -D DISABLE, --Disable DISABLE + The risky features to be disabled, separated by commas. 
+ Options include fix_md, fix_html, fix_sections, + revert_section_translation, fix_alerts, + revert_alert_translation, fix_codeblocks, and + fix_formatting. + +subcommands: + strip Strip non-translatable markers for Crowdin upload. + restore Restore non-translatable markers after Crowdin download. examples of usage: nttt diff --git a/doc/transformations.md b/doc/transformations.md index 733a174..bbdfd3b 100644 --- a/doc/transformations.md +++ b/doc/transformations.md @@ -17,11 +17,14 @@ NTTT does **not** process standalone `.html` files. HTML-related steps run on ** For each `.md` file, [`nttt/tidyup.py`](../nttt/tidyup.py) applies, in order: 1. **`fix_sections`** — normalise `---` section lines (Crowdin quirks). -2. **`revert_section_translation`** — optional; restore English section tag lines when structure matches. -3. **`trim_md_tags`** — strip padding inside paired Markdown delimiters (outside ` ``` ` fences). -4. **`trim_html_tags`** — strip padding inside simple inline HTML tags (outside single `` ` `` spans). -5. **`trim_formatting_tags`** — normalise `{ … }` attribute blocks after a word (Scratch/Pico-style). -6. **URL rewrite:** replace `/en/` with `/<language>/` everywhere in the file body. +2. **`fix_alerts`** — normalise modern `> [!TYPE]` alert markers. +3. **`revert_section_translation`** — optional; restore English section tag lines when structure matches. +4. **`revert_alert_translation`** — optional; restore English alert type tokens when structure matches. +5. **`trim_md_tags`** — strip padding inside paired Markdown delimiters (outside ` ``` ` fences). +6. **`trim_html_tags`** — strip padding inside simple inline HTML tags (outside single `` ` `` spans). +7. **`fix_codeblocks`** — normalise modern fenced-code info strings. +8. **`trim_formatting_tags`** — normalise `{ … }` attribute blocks after a word (Scratch/Pico-style). +9. **URL rewrite:** replace `/en/` with `/<language>/` everywhere in the file body. 
Steps 1–5 can be skipped via **`--disable`** (see [`nttt/arguments.py`](../nttt/arguments.py)). @@ -49,7 +52,23 @@ Steps 1–5 can be skipped via **`--disable`** (see [`nttt/arguments.py`](../ntt --- -## 2. Markdown delimiters (`nttt/cleanup_markdown.py`) +## 2. Modern alerts (`nttt/cleanup_alerts.py`) + +**Function:** `fix_alerts` + +- Normalises modern alert markers such as `> [!TASK]`, `> [!HINT]`, `> [!ACCORDION] Title`, and nested markers such as `> > [!HINT]`. +- Fixes spacing and casing around the marker: `>[ ! task ]` → `> [!TASK]`. +- Preserves any title text after the marker because it is translatable. + +**Function:** `revert_alert_translation` (requires English `.md`) + +- Collects alert marker lines in translation and English. +- If **counts and nesting depth match**, replaces only the translated alert type token with the English token. +- If structure differs, logs a warning and leaves alert types unchanged for this step. + +--- + +## 3. Markdown delimiters (`nttt/cleanup_markdown.py`) **Function:** `trim_md_tags` @@ -63,7 +82,7 @@ Logging can record each replacement (`log_replacement`). --- -## 3. Inline HTML (`nttt/cleanup_html.py`) +## 4. Inline HTML (`nttt/cleanup_html.py`) **Function:** `trim_html_tags` @@ -73,7 +92,20 @@ Logging can record each replacement (`log_replacement`). --- -## 4. Formatting braces (`nttt/cleanup_formatting.py`) +## 5. Codeblock info strings (`nttt/cleanup_codeblocks.py`) + +**Function:** `fix_codeblocks` + +- Normalises opening fenced-code lines such as ```` ```python filename="button.py" ````. +- Lowercases the language token and attribute keys/values. +- Normalises quotes and spacing around `=`. +- Collapses spaces inside `line_highlights`, e.g. `"3, 5-6"` → `"3,5-6"`. +- If the translated language token is not recognised and the English file is available, restores the English language token at the same fence index. +- Does **not** change code inside the block or closing fences. + +--- + +## 6. 
Formatting braces (`nttt/cleanup_formatting.py`) **Function:** `trim_formatting_tags` @@ -84,7 +116,7 @@ Logging can record each replacement (`log_replacement`). --- -## 5. Locale URLs (`nttt/tidyup.py`) +## 7. Locale URLs (`nttt/tidyup.py`) After cleanup: **replace every `/en/` with `/<language>/`** in the Markdown file (`language` from resolved CLI args, defaulting from input folder basename). @@ -98,6 +130,31 @@ After cleanup: **replace every `/en/` with `/<language>/`** in the Markdown file | --- +## Strip / restore workflow + +`nttt strip` prepares English source for Crowdin by replacing non-translatable markers with deterministic placeholders. `nttt restore` regenerates the same placeholder map from `en/` and re-injects the markers into translated files after Crowdin download. + +Typical workflow: + +1. Upload side: `nttt strip -i en/ -o .crowdin-staging/en/` +2. Crowdin translates `.crowdin-staging/en/` +3. Download side: `nttt restore -i fr-FR/ -e en/ -o fr-FR/` +4. Existing tidy-up: `nttt -i fr-FR/ -Y YES` + +Markdown placeholders use HTML comments such as `<!-- NTTT:… -->`. Configure Crowdin to treat `<!-- NTTT:… -->` as non-translatable. + +`strip` currently hides: + +- legacy section marker lines such as `--- task ---` and `--- /task ---` +- modern alert type tokens such as `[!TASK]` +- modern fenced-code info strings such as `python filename="button.py"` +- inline kramdown class metadata such as `{:class="block3looks"}` +- non-translatable `meta.yml` keys by dropping anything outside `title`, `description`, `steps`, `meta_title`, and `meta_description` + +`restore` is safe to run on older Crowdin downloads that do not contain placeholders; it is a no-op for those files. It warns, but does not fail, if placeholders are missing or unknown. 
+ +--- + ## Quick code map | Concern | Module | @@ -105,7 +162,10 @@ After cleanup: **replace every `/en/` with `//`** in the Markdown file | Orchestration | `nttt/tidyup.py`, `nttt/__init__.py` | | CLI / disable flags | `nttt/arguments.py` | | Sections | `nttt/cleanup_sections.py` | +| Modern alerts | `nttt/cleanup_alerts.py` | | Markdown emphasis / code delimiters | `nttt/cleanup_markdown.py` | | Inline HTML | `nttt/cleanup_html.py` | +| Codeblock info strings | `nttt/cleanup_codeblocks.py` | | Brace attributes | `nttt/cleanup_formatting.py` | +| Strip / restore | `nttt/strip.py`, `nttt/restore.py` | | Split "every other segment" | `nttt/utilities.py` → `apply_to_every_other_part` | diff --git a/nttt/__init__.py b/nttt/__init__.py index 9976cfb..348532b 100644 --- a/nttt/__init__.py +++ b/nttt/__init__.py @@ -1,10 +1,69 @@ +import os from .arguments import parse_command_line, resolve_arguments, check_arguments, show_arguments +from .arguments import get_absolute_path, get_final_step +from .constants import ArgumentKeyConstants +from .restore import restore_tree +from .strip import strip_tree from .tidyup import tidyup_translations from ._version import __version__ + def main(): command_line_args = parse_command_line(__version__) + command = getattr(command_line_args, "command", None) + + if command == "strip": + strip_tree( + get_absolute_path(command_line_args.input), + get_absolute_path(command_line_args.output), + command_line_args.debug_sidecars) + return + + if command == "restore": + restore_tree( + get_absolute_path(command_line_args.input), + get_absolute_path(command_line_args.english), + get_absolute_path(command_line_args.output)) + + if command_line_args.then_tidyup: + restored_arguments = build_restore_tidyup_arguments(command_line_args) + show_arguments(restored_arguments) + if check_arguments(restored_arguments): + tidyup_translations(restored_arguments) + return + resolved_arguments = resolve_arguments(command_line_args) 
show_arguments(resolved_arguments) if (check_arguments(resolved_arguments)): tidyup_translations(resolved_arguments) + + +def build_restore_tidyup_arguments(command_line_args): + input_folder = get_absolute_path(command_line_args.input) + output_folder = get_absolute_path(command_line_args.output) + english_folder = get_absolute_path(command_line_args.english) + + arguments = {} + arguments[ArgumentKeyConstants.INPUT] = output_folder + arguments[ArgumentKeyConstants.OUTPUT] = output_folder + arguments[ArgumentKeyConstants.ENGLISH] = english_folder + arguments[ArgumentKeyConstants.LANGUAGE] = os.path.basename(input_folder) + arguments[ArgumentKeyConstants.VOLUNTEERS] = [] + arguments[ArgumentKeyConstants.FINAL] = get_final_step(output_folder) + + if getattr(command_line_args, "Disable", False): + arguments[ArgumentKeyConstants.DISABLE] = command_line_args.Disable.split(",") + else: + arguments[ArgumentKeyConstants.DISABLE] = [] + + if getattr(command_line_args, "Logging", False): + arguments[ArgumentKeyConstants.LOGGING] = command_line_args.Logging + else: + arguments[ArgumentKeyConstants.LOGGING] = "off" + + if getattr(command_line_args, "Yes", False): + arguments[ArgumentKeyConstants.YES] = command_line_args.Yes + else: + arguments[ArgumentKeyConstants.YES] = "off" + + return arguments diff --git a/nttt/arguments.py b/nttt/arguments.py index 35f76b0..9ea8138 100644 --- a/nttt/arguments.py +++ b/nttt/arguments.py @@ -4,6 +4,45 @@ from argparse import ArgumentParser +def add_tidyup_arguments(parser): + parser.add_argument("-i", "--input", help="The input directory which contains the content to tidy up, defaults to the current directory.") + parser.add_argument("-o", "--output", help="The output directory where the upgraded content should be written, defaults to the same as INPUT.") + parser.add_argument("-e", "--english", help="The directory which contains the English files and folders, defaults to INPUT/../en.") + parser.add_argument("-l", "--language", help="The 
language of the content to be tidied up, defaults to basename(INPUT).") + parser.add_argument("-v", "--volunteers", help="The list of volunteers as a comma separated list, defaults to an empty list.") + parser.add_argument("-f", "--final", help="The number of the final step file, defaults to the step file with the highest number.") + parser.add_argument("-D", "--Disable", help="The risky features to be disabled, separated by commas. " + "Options are: fix_md (fix common markdown-related issues), " + "fix_html (fix common issues in HTML-like tags (Return)), " + "fix_sections (fix common issues in section tags (--- hint ---)), " + "revert_section_translation (revert translation for section tags), " + "fix_alerts (fix common issues in modern alert tags (> [!TASK])), " + "revert_alert_translation (revert translation for modern alert types), " + "fix_codeblocks (fix common issues in fenced codeblock info strings), " + "fix_formatting (fix common issues in formatting tags ({:class=\"block3motion\"})). " + "Defaults to all risky features to be enabled.") + parser.add_argument("-L", "--Logging", help="Logging of modifications. Options are on and off. Default is off.") + parser.add_argument("-Y", "--Yes", help="Automatic yes to prompts. " + "If enabled assume 'yes' as answer to all prompts and run non-interactively. " + "Options are on and off. 
Default is off.") + + +def add_strip_arguments(parser): + parser.add_argument("-i", "--input", required=True, help="The English source directory to strip.") + parser.add_argument("-o", "--output", required=True, help="The output directory where the Crowdin-ready copy should be written.") + parser.add_argument("--debug-sidecars", action="store_true", help="Write optional .nttt.json sidecars for inspection.") + + +def add_restore_arguments(parser): + parser.add_argument("-i", "--input", required=True, help="The translated directory to restore.") + parser.add_argument("-e", "--english", required=True, help="The English source directory used to regenerate placeholders.") + parser.add_argument("-o", "--output", required=True, help="The output directory where restored files should be written.") + parser.add_argument("--then-tidyup", action="store_true", help="Run the tidy-up pipeline after restoring.") + parser.add_argument("-D", "--Disable", help="The tidy-up features to disable when --then-tidyup is used, separated by commas.") + parser.add_argument("-L", "--Logging", help="Logging of modifications. Options are on and off. Default is off.") + parser.add_argument("-Y", "--Yes", help="Automatic yes to prompts when --then-tidyup is used. Options are on and off. Default is off.") + + def get_absolute_path(folder): ''' Returns the absolute path for the given folder. 
Trailing path separators @@ -45,23 +84,15 @@ def parse_command_line(version): """ parser = ArgumentParser(description="Nina's Translation Tidyup Tool v{}".format(version)) - parser.add_argument("-i", "--input", help="The input directory which contains the content to tidy up, defaults to the current directory.") - parser.add_argument("-o", "--output", help="The output directory where the upgraded content should be written, defaults to the same as INPUT.") - parser.add_argument("-e", "--english", help="The directory which contains the English files and folders, defaults to INPUT/../en.") - parser.add_argument("-l", "--language", help="The language of the content to be tidied up, defaults to basename(INPUT).") - parser.add_argument("-v", "--volunteers", help="The list of volunteers as a comma separated list, defaults to an empty list.") - parser.add_argument("-f", "--final", help="The number of the final step file, defaults to the step file with the highest number.") - parser.add_argument("-D", "--Disable", help="The risky features to be disabled, separated by commas. " - "Options are: fix_md (fix common markdown-related issues), " - "fix_html (fix common issues in HTML-like tags (Return)), " - "fix_sections (fix common issues in section tags (--- hint ---)), " - "revert_section_translation (revert translation for section tags), " - "fix_formatting (fix common issues in formatting tags ({:class=\"block3motion\"})). " - "Defaults to all risky features to be enabled.") - parser.add_argument("-L", "--Logging", help="Logging of modifications. Options are on and off. Default is off.") - parser.add_argument("-Y", "--Yes", help="Automatic yes to prompts. " - "If enabled assume 'yes' as answer to all prompts and run non-interactively. " - "Options are on and off. 
Default is off.") + add_tidyup_arguments(parser) + subparsers = parser.add_subparsers(dest="command") + + strip_parser = subparsers.add_parser("strip", help="Strip non-translatable markers for Crowdin upload.") + add_strip_arguments(strip_parser) + + restore_parser = subparsers.add_parser("restore", help="Restore non-translatable markers after Crowdin download.") + add_restore_arguments(restore_parser) + return parser.parse_args() @@ -75,47 +106,47 @@ def resolve_arguments(command_line_args): arguments = {} - if command_line_args.input: + if getattr(command_line_args, "input", False): arguments[ArgumentKeyConstants.INPUT] = get_absolute_path(command_line_args.input) else: arguments[ArgumentKeyConstants.INPUT] = get_absolute_path('.') - if command_line_args.output: + if getattr(command_line_args, "output", False): arguments[ArgumentKeyConstants.OUTPUT] = get_absolute_path(command_line_args.output) else: arguments[ArgumentKeyConstants.OUTPUT] = arguments[ArgumentKeyConstants.INPUT] - if command_line_args.english: + if getattr(command_line_args, "english", False): arguments[ArgumentKeyConstants.ENGLISH] = get_absolute_path(command_line_args.english) else: arguments[ArgumentKeyConstants.ENGLISH] = Path(dirname(arguments[ArgumentKeyConstants.INPUT]), 'en') - if command_line_args.language: + if getattr(command_line_args, "language", False): arguments[ArgumentKeyConstants.LANGUAGE] = command_line_args.language else: arguments[ArgumentKeyConstants.LANGUAGE] = basename(arguments[ArgumentKeyConstants.INPUT]) - if command_line_args.volunteers: + if getattr(command_line_args, "volunteers", False): arguments[ArgumentKeyConstants.VOLUNTEERS] = [name.strip() for name in command_line_args.volunteers.split(',')] else: arguments[ArgumentKeyConstants.VOLUNTEERS] = [] - if command_line_args.final: + if getattr(command_line_args, "final", False): arguments[ArgumentKeyConstants.FINAL] = int(command_line_args.final) else: arguments[ArgumentKeyConstants.FINAL] = 
get_final_step(arguments[ArgumentKeyConstants.INPUT]) - if command_line_args.Disable: + if getattr(command_line_args, "Disable", False): arguments[ArgumentKeyConstants.DISABLE] = command_line_args.Disable.split(",") else: arguments[ArgumentKeyConstants.DISABLE] = [] - if command_line_args.Logging: + if getattr(command_line_args, "Logging", False): arguments[ArgumentKeyConstants.LOGGING] = command_line_args.Logging else: arguments[ArgumentKeyConstants.LOGGING] = "off" - if command_line_args.Yes: + if getattr(command_line_args, "Yes", False): arguments[ArgumentKeyConstants.YES] = command_line_args.Yes else: arguments[ArgumentKeyConstants.YES] = "off" diff --git a/nttt/cleanup_alerts.py b/nttt/cleanup_alerts.py new file mode 100644 index 0000000..11a39cd --- /dev/null +++ b/nttt/cleanup_alerts.py @@ -0,0 +1,100 @@ +import re +import sys +from .constants import RegexConstants +from .nttt_logging import log_replacement + + +ALERT_LINE_PATTERN = re.compile( + rf'^(?P[{RegexConstants.SPACES}]*)' + rf'(?P(?:>[{RegexConstants.SPACES}]*)+)' + rf'[\[[][{RegexConstants.SPACES}]*![{RegexConstants.SPACES}]*' + rf'(?P[^\]]]+?)' + rf'[{RegexConstants.SPACES}]*[\]]]' + rf'(?P.*)$' +) + + +def fix_alerts(md_file_content, logging): + lines = md_file_content.split('\n') + fixed_lines = [] + + for line in lines: + fixed_lines.append(fix_alert_line(line, logging)) + + return '\n'.join(fixed_lines) + + +def fix_alert_line(line, logging): + match = ALERT_LINE_PATTERN.match(line) + if match is None: + return line + + alert_type = match.group("alert_type").strip().upper() + prefix = normalise_prefix(match.group("prefix")) + replacement_text = "{}{}[!{}]{}".format( + match.group("indent"), + prefix, + alert_type, + match.group("rest")) + + log_replacement(line, replacement_text, logging) + return replacement_text + + +def normalise_prefix(prefix): + depth = prefix.count(">") + return "> " * depth + + +def revert_alert_translation(md_file_name, md_file_content, en_file_content, logging): + 
md_file_lines = md_file_content.split('\n') + md_alerts = extract_alerts(md_file_lines) + + en_file_lines = en_file_content.split('\n') + en_alerts = extract_alerts(en_file_lines) + + if len(md_alerts) == len(en_alerts): + for i in range(len(md_alerts)): + md_alert = md_alerts[i] + en_alert = en_alerts[i] + + if md_alert["depth"] != en_alert["depth"]: + return warn_and_skip(md_file_name, md_file_content) + + replacement_text = "{}{}[!{}]{}".format( + md_alert["indent"], + md_alert["prefix"], + en_alert["alert_type"], + md_alert["rest"]) + log_replacement(md_file_lines[md_alert["line_num"]], replacement_text, logging) + md_file_lines[md_alert["line_num"]] = replacement_text + + return '\n'.join(md_file_lines) + else: + return warn_and_skip(md_file_name, md_file_content) + + +def warn_and_skip(md_file_name, md_file_content): + print("Warning ({}): Different alert structure in the original (en) and the translated pages. " + "Reverting of translated alert types will not be performed".format(md_file_name), file=sys.stderr) + return md_file_content + + +def extract_alerts(md_file_lines): + alerts = [] + + for i in range(len(md_file_lines)): + line = fix_alert_line(md_file_lines[i], "off") + match = ALERT_LINE_PATTERN.match(line) + if match: + prefix = normalise_prefix(match.group("prefix")) + alerts.append({ + "line_num": i, + "indent": match.group("indent"), + "prefix": prefix, + "depth": prefix.count(">"), + "alert_type": match.group("alert_type").strip().upper(), + "rest": match.group("rest"), + }) + + return alerts diff --git a/nttt/cleanup_codeblocks.py b/nttt/cleanup_codeblocks.py new file mode 100644 index 0000000..9ac69b1 --- /dev/null +++ b/nttt/cleanup_codeblocks.py @@ -0,0 +1,130 @@ +import re +from .constants import RegexConstants +from .nttt_logging import log_replacement + + +FENCE_PATTERN = re.compile(r'^(?P\s*)```(?P.*)$') +ATTR_PATTERN = re.compile( + rf'(?P[\w-]+)[{RegexConstants.SPACES}]*=[{RegexConstants.SPACES}]*' + 
rf'(?P[{RegexConstants.QUOTES}])(?P.*?)(?P=quote)' +) + +KNOWN_LANGUAGES = { + "bash", + "c", + "cpp", + "css", + "html", + "javascript", + "js", + "json", + "markdown", + "python", + "scratch3", + "shell", + "text", + "typescript", + "yaml", +} + + +def fix_codeblocks(md_file_content, english_file_content, logging): + english_infos = [] + if english_file_content is not None: + english_infos = extract_opening_fence_infos(english_file_content) + + lines = md_file_content.split('\n') + fixed_lines = [] + in_fence = False + opening_fence_index = 0 + + for line in lines: + match = FENCE_PATTERN.match(line) + if match: + if not in_fence: + english_info = None + if opening_fence_index < len(english_infos): + english_info = english_infos[opening_fence_index] + + fixed_lines.append(fix_opening_fence(line, english_info, logging)) + opening_fence_index += 1 + in_fence = True + else: + fixed_lines.append(line) + in_fence = False + else: + fixed_lines.append(line) + + return '\n'.join(fixed_lines) + + +def extract_opening_fence_infos(md_file_content): + infos = [] + in_fence = False + + for line in md_file_content.split('\n'): + match = FENCE_PATTERN.match(line) + if match: + if not in_fence: + infos.append(match.group("info").strip()) + in_fence = True + else: + in_fence = False + + return infos + + +def fix_opening_fence(line, english_info, logging): + match = FENCE_PATTERN.match(line) + if match is None: + return line + + info = normalise_quotes(match.group("info").strip()) + if info == "": + return line + + fixed_info = normalise_info_string(info, english_info) + replacement_text = "{}```{}".format(match.group("indent"), fixed_info) + log_replacement(line, replacement_text, logging) + return replacement_text + + +def normalise_info_string(info, english_info=None): + attr_matches = list(ATTR_PATTERN.finditer(info)) + attr_start = attr_matches[0].start() if attr_matches else len(info) + lang = info[:attr_start].strip().lower() + + if lang not in KNOWN_LANGUAGES and 
english_info: + english_lang = extract_language(english_info) + if english_lang: + lang = english_lang + + attrs = [] + for match in attr_matches: + key = match.group("key").lower() + value = match.group("value").strip().lower() + if key == "line_highlights": + value = re.sub(r'\s+', '', value) + attrs.append('{}="{}"'.format(key, value)) + + if len(attrs) == 0: + return lang + + if lang == "": + return " ".join(attrs) + + return "{} {}".format(lang, " ".join(attrs)) + + +def extract_language(info): + info = normalise_quotes(info.strip()) + attr_match = ATTR_PATTERN.search(info) + if attr_match: + return info[:attr_match.start()].strip().lower() + return info.strip().lower() + + +def normalise_quotes(text): + for quote in RegexConstants.QUOTES: + text = text.replace(quote, '"') + return text diff --git a/nttt/restore.py b/nttt/restore.py new file mode 100644 index 0000000..d9714a2 --- /dev/null +++ b/nttt/restore.py @@ -0,0 +1,94 @@ +import io +import os +import shutil +import sys +from .constants import GeneralConstants +from .strip import PLACEHOLDER_PATTERN, TRANSLATABLE_META_KEYS, build_token_map, yaml_for_round_trip +from .utilities import get_file, save_file + + +def restore_tree(input_folder, english_folder, output_folder): + for dname, _, files in os.walk(input_folder): + for fname in files: + if fname.endswith(".nttt.json"): + continue + + source_file_path = os.path.join(dname, fname) + relative_file_name = os.path.relpath(source_file_path, input_folder) + english_file_path = os.path.join(english_folder, relative_file_name) + output_file_path = os.path.join(output_folder, relative_file_name) + output_file_folder = os.path.dirname(output_file_path) + + if not os.path.exists(output_file_folder): + os.makedirs(output_file_folder) + + if fname == GeneralConstants.FILE_NAME_META_YML and os.path.isfile(english_file_path): + restored_content, suggested_eol = restore_meta_file(source_file_path, english_file_path) + save_file(output_file_path, restored_content, 
suggested_eol) + elif os.path.splitext(fname)[1] == ".md" and os.path.isfile(english_file_path): + restored_content, suggested_eol = restore_md_file( + source_file_path, + english_file_path, + relative_file_name) + save_file(output_file_path, restored_content, suggested_eol) + elif os.path.abspath(source_file_path) != os.path.abspath(output_file_path): + shutil.copyfile(source_file_path, output_file_path) + + +def restore_md_file(source_file_path, english_file_path, relative_file_name): + content, suggested_eol = get_file(source_file_path) + english_content, _ = get_file(english_file_path) + restored_content = restore_md(content, english_content, relative_file_name, source_file_path) + return restored_content, suggested_eol + + +def restore_md(content, english_content, relative_file_name, md_file_name): + token_map = build_token_map(relative_file_name, english_content) + if "NTTT:" not in content: + return content + + placeholders_in_content = set(match.group() for match in PLACEHOLDER_PATTERN.finditer(content)) + missing_placeholders = sorted(set(token_map) - placeholders_in_content) + unknown_placeholders = sorted(placeholders_in_content - set(token_map)) + + if missing_placeholders: + print("Warning ({}): Missing NTTT placeholders: {}".format( + md_file_name, + ", ".join(missing_placeholders)), file=sys.stderr) + + if unknown_placeholders: + print("Warning ({}): Unknown NTTT placeholders: {}".format( + md_file_name, + ", ".join(unknown_placeholders)), file=sys.stderr) + + restored_content = content + for placeholder in token_map: + restored_content = restored_content.replace(placeholder, token_map[placeholder]["value"]) + + return restored_content + + +def restore_meta_file(source_file_path, english_file_path): + content, suggested_eol = get_file(source_file_path) + english_content, _ = get_file(english_file_path) + return restore_meta_yaml(content, english_content), suggested_eol + + +def restore_meta_yaml(content, english_content): + yaml_parser = 
yaml_for_round_trip() + parsed_md = yaml_parser.load(content) + english_parsed_md = yaml_parser.load(english_content) + + if parsed_md is None: + return english_content + + if english_parsed_md is None: + return content + + for key in TRANSLATABLE_META_KEYS: + if key in parsed_md: + english_parsed_md[key] = parsed_md[key] + + string_buffer = io.StringIO() + yaml_parser.dump(english_parsed_md, string_buffer) + return string_buffer.getvalue() diff --git a/nttt/strip.py b/nttt/strip.py new file mode 100644 index 0000000..198b0e4 --- /dev/null +++ b/nttt/strip.py @@ -0,0 +1,203 @@ +import hashlib +import io +import json +import os +import re +import shutil +import ruamel.yaml +from .cleanup_alerts import ALERT_LINE_PATTERN, normalise_prefix +from .cleanup_codeblocks import FENCE_PATTERN +from .constants import GeneralConstants, RegexConstants +from .utilities import get_file, save_file + + +TRANSLATABLE_META_KEYS = ["title", "description", "steps", "meta_title", "meta_description"] +SECTION_PATTERN = re.compile(r'^(?P\s*)--- (?P.+?) 
---(?P\s*)$') +KRAMDOWN_CLASS_PATTERN = re.compile(rf'\{{:\s*class\s*=\s*[{RegexConstants.QUOTES}].+?[{RegexConstants.QUOTES}]\s*\}}') +HERO_IMAGE_PATTERN = re.compile(r'^(?P\s*)hero_image:\s+images/.+$') +PLACEHOLDER_PATTERN = re.compile(r'') +YAML_PLACEHOLDER_PATTERN = re.compile(r'__NTTT_(?P[a-f0-9]{6}_\d{3})__') + + +def strip_tree(input_folder, output_folder, debug_sidecars=False): + for dname, _, files in os.walk(input_folder): + for fname in files: + source_file_path = os.path.join(dname, fname) + relative_file_name = os.path.relpath(source_file_path, input_folder) + output_file_path = os.path.join(output_folder, relative_file_name) + output_file_folder = os.path.dirname(output_file_path) + + if not os.path.exists(output_file_folder): + os.makedirs(output_file_folder) + + if fname == GeneralConstants.FILE_NAME_META_YML: + stripped_content, token_map, suggested_eol = strip_meta_file(source_file_path, relative_file_name) + save_file(output_file_path, stripped_content, suggested_eol) + elif os.path.splitext(fname)[1] == ".md": + stripped_content, token_map, suggested_eol = strip_md_file(source_file_path, relative_file_name) + save_file(output_file_path, stripped_content, suggested_eol) + else: + shutil.copyfile(source_file_path, output_file_path) + token_map = {} + + if debug_sidecars and token_map: + write_debug_sidecar(output_file_path, source_file_path, token_map) + + +def strip_md_file(source_file_path, relative_file_name): + content, suggested_eol = get_file(source_file_path) + stripped_content, token_map = strip_md(content, relative_file_name) + return stripped_content, token_map, suggested_eol + + +def strip_meta_file(source_file_path, relative_file_name): + content, suggested_eol = get_file(source_file_path) + stripped_content, token_map = strip_meta_yaml(content, relative_file_name) + return stripped_content, token_map, suggested_eol + + +def strip_md(content, relative_file_name): + generator = TokenGenerator(relative_file_name) + token_map = {} + 
stripped_lines = [] + in_fence = False + + for line in content.split('\n'): + fence_match = FENCE_PATTERN.match(line) + if fence_match: + if not in_fence: + info = fence_match.group("info").strip() + if info: + placeholder = generator.next_markdown_placeholder() + token_map[placeholder] = {"kind": "code_fence_info", "value": info} + line = "{}```{}".format(fence_match.group("indent"), placeholder) + in_fence = True + else: + in_fence = False + + stripped_lines.append(line) + continue + + if in_fence: + stripped_lines.append(line) + continue + + section_match = SECTION_PATTERN.match(line) + if section_match: + placeholder = generator.next_markdown_placeholder() + token_map[placeholder] = {"kind": "section", "value": line} + stripped_lines.append("{}{}{}".format( + section_match.group("indent"), + placeholder, + section_match.group("trailing"))) + continue + + hero_match = HERO_IMAGE_PATTERN.match(line) + if hero_match: + placeholder = generator.next_markdown_placeholder() + token_map[placeholder] = {"kind": "hero_image", "value": line} + stripped_lines.append("{}{}".format(hero_match.group("indent"), placeholder)) + continue + + line = strip_alert_line(line, generator, token_map) + line = strip_kramdown_classes(line, generator, token_map) + stripped_lines.append(line) + + return '\n'.join(stripped_lines), token_map + + +def strip_alert_line(line, generator, token_map): + match = ALERT_LINE_PATTERN.match(line) + if match is None: + return line + + placeholder = generator.next_markdown_placeholder() + token_map[placeholder] = { + "kind": "alert_type", + "value": "[!{}]".format(match.group("alert_type").strip().upper()), + } + prefix = normalise_prefix(match.group("prefix")) + return "{}{}{}{}".format(match.group("indent"), prefix, placeholder, match.group("rest")) + + +def strip_kramdown_classes(line, generator, token_map): + def replace(match): + placeholder = generator.next_markdown_placeholder() + token_map[placeholder] = {"kind": "kramdown_class", "value": 
match.group()} + return placeholder + + return KRAMDOWN_CLASS_PATTERN.sub(replace, line) + + +def strip_meta_yaml(content, relative_file_name): + yaml_parser = yaml_for_round_trip() + parsed_md = yaml_parser.load(content) + if parsed_md is None: + return content, {} + + stripped_md = type(parsed_md)() + for key in parsed_md: + if key in TRANSLATABLE_META_KEYS: + stripped_md[key] = parsed_md[key] + + string_buffer = io.StringIO() + yaml_parser.dump(stripped_md, string_buffer) + return string_buffer.getvalue(), {} + + +def build_token_map(relative_file_name, content): + _, token_map = strip_md(content, relative_file_name) + return token_map + + +def write_debug_sidecar(output_file_path, source_file_path, token_map): + source_content, _ = get_file(source_file_path) + sidecar = { + "version": 1, + "source_sha256": hashlib.sha256(source_content.encode("utf-8")).hexdigest(), + "tokens": normalise_token_map_for_json(token_map), + } + with open(output_file_path + ".nttt.json", encoding="utf-8", mode="w") as f: + json.dump(sidecar, f, indent=2, ensure_ascii=False) + + +def normalise_token_map_for_json(token_map): + tokens = {} + for placeholder in token_map: + token = placeholder_to_token(placeholder) + tokens[token[-3:]] = token_map[placeholder] + return tokens + + +def placeholder_to_token(placeholder): + match = PLACEHOLDER_PATTERN.search(placeholder) + if match: + return match.group("token") + match = YAML_PLACEHOLDER_PATTERN.search(placeholder) + if match: + return match.group("token").replace("_", "-") + return placeholder + + +def yaml_for_round_trip(): + yaml_parser = ruamel.yaml.YAML(typ='rt') + yaml_parser.preserve_quotes = True + yaml_parser.constructor.yaml_constructors.pop(u'tag:yaml.org,2002:timestamp', None) + yaml_parser.indent(sequence=4, offset=2) + yaml_parser.explicit_start = True + yaml_parser.width = 1000000 + return yaml_parser + + +class TokenGenerator: + def __init__(self, relative_file_name): + self.salt = 
hashlib.sha256(relative_file_name.encode("utf-8")).hexdigest()[:6] + self.index = 0 + + def next_markdown_placeholder(self): + self.index += 1 + return "".format(self.salt, self.index) + + def next_yaml_placeholder(self): + self.index += 1 + return "__NTTT_{}_{:03d}__".format(self.salt, self.index) diff --git a/nttt/tidyup.py b/nttt/tidyup.py index 433c569..b500214 100644 --- a/nttt/tidyup.py +++ b/nttt/tidyup.py @@ -10,6 +10,9 @@ from .cleanup_formatting import trim_formatting_tags from .cleanup_sections import fix_sections from .cleanup_sections import revert_section_translation +from .cleanup_alerts import fix_alerts +from .cleanup_alerts import revert_alert_translation +from .cleanup_codeblocks import fix_codeblocks def fix_meta(src, english_src, dst): @@ -63,12 +66,20 @@ def fix_md_step(src, lang, english_src, dst, disable, logging): if en_md_content is not None and "revert_section_translation" not in disable: md_content = revert_section_translation(src, md_content, en_md_content, logging) + if "fix_alerts" not in disable: + md_content = fix_alerts(md_content, logging) + if en_md_content is not None and "revert_alert_translation" not in disable: + md_content = revert_alert_translation(src, md_content, en_md_content, logging) + if "fix_md" not in disable: md_content = trim_md_tags(md_content, logging) if "fix_html" not in disable: md_content = trim_html_tags(md_content, logging) + if "fix_codeblocks" not in disable: + md_content = fix_codeblocks(md_content, en_md_content, logging) + if "fix_formatting" not in disable: md_content = trim_formatting_tags(md_content, logging) diff --git a/unit_test/test_cleanup_alerts.py b/unit_test/test_cleanup_alerts.py new file mode 100644 index 0000000..905af50 --- /dev/null +++ b/unit_test/test_cleanup_alerts.py @@ -0,0 +1,39 @@ +import unittest +from nttt import cleanup_alerts + + +class TestCleanupAlerts(unittest.TestCase): + logging = "off" + + def test_fix_alert_spacing_and_case(self): + c_initial = ">[ ! 
task ] Complete this step." + c_target = "> [!TASK] Complete this step." + + self.assertEqual(cleanup_alerts.fix_alerts(c_initial, self.logging), c_target) + + def test_fix_nested_alert(self): + c_initial = "> >[!hint]\n> >\n> > Try this." + c_target = "> > [!HINT]\n> >\n> > Try this." + + self.assertEqual(cleanup_alerts.fix_alerts(c_initial, self.logging), c_target) + + def test_revert_alert_translation_preserves_title(self): + c_initial = "> [!TAAK] Uitdaging: Verbeter je drum" + c_english = "> [!CHALLENGE] Challenge: Improving your drum" + c_target = "> [!CHALLENGE] Uitdaging: Verbeter je drum" + + self.assertEqual( + cleanup_alerts.revert_alert_translation("step_1.md", c_initial, c_english, self.logging), + c_target) + + def test_revert_alert_translation_skips_when_structure_differs(self): + c_initial = "> [!TAAK]\n\n> [!HINT]" + c_english = "> [!TASK]" + + self.assertEqual( + cleanup_alerts.revert_alert_translation("step_1.md", c_initial, c_english, self.logging), + c_initial) + + +if __name__ == "__main__": + unittest.main() diff --git a/unit_test/test_cleanup_codeblocks.py b/unit_test/test_cleanup_codeblocks.py new file mode 100644 index 0000000..f1c570b --- /dev/null +++ b/unit_test/test_cleanup_codeblocks.py @@ -0,0 +1,40 @@ +import unittest +from nttt import cleanup_codeblocks + + +class TestCleanupCodeblocks(unittest.TestCase): + logging = "off" + + def test_fix_codeblock_info_string(self): + c_initial = ('```Python filename = "Button_Press.py" line_numbers = "TRUE" line_highlights = "3, 5-6"\n' + 'print("Hello")\n' + '```') + c_target = ('```python filename="button_press.py" line_numbers="true" line_highlights="3,5-6"\n' + 'print("Hello")\n' + '```') + + self.assertEqual(cleanup_codeblocks.fix_codeblocks(c_initial, None, self.logging), c_target) + + def test_fix_codeblock_uses_english_language_for_unknown_translated_language(self): + c_initial = ('```pythone filename="button.py"\n' + 'print("Hello")\n' + '```') + c_english = ('```python 
filename="button.py"\n' + 'print("Hello")\n' + '```') + c_target = ('```python filename="button.py"\n' + 'print("Hello")\n' + '```') + + self.assertEqual(cleanup_codeblocks.fix_codeblocks(c_initial, c_english, self.logging), c_target) + + def test_does_not_change_plain_fences(self): + c_initial = ('```\n' + 'filename = "Button_Press.py"\n' + '```') + + self.assertEqual(cleanup_codeblocks.fix_codeblocks(c_initial, None, self.logging), c_initial) + + +if __name__ == "__main__": + unittest.main() diff --git a/unit_test/test_strip_restore.py b/unit_test/test_strip_restore.py new file mode 100644 index 0000000..d522465 --- /dev/null +++ b/unit_test/test_strip_restore.py @@ -0,0 +1,106 @@ +import unittest +from nttt import restore +from nttt import strip + + +class TestStripRestore(unittest.TestCase): + def test_strip_md_replaces_non_translatable_markers(self): + c_initial = ('--- task ---\n' + '\n' + 'Complete this step.\n' + '\n' + '> [!HINT]\n' + '>\n' + '> Try this.\n' + '\n' + '```python filename="button.py"\n' + 'print("Hello")\n' + '```\n' + '\n' + 'The `Looks`{:class="block3looks"} category.\n' + '\n' + '--- /task ---') + + stripped, token_map = strip.strip_md(c_initial, "step_1.md") + + self.assertIn("\nTranslated text." + c_english = "English text." 
+ + self.assertEqual(restore.restore_md(c_initial, c_english, "step_1.md", "step_1.md"), c_initial) + + def test_strip_restore_roundtrip_is_identity_for_english(self): + c_english = ('> [!TASK]\n' + '>\n' + '> Complete this step.\n' + '\n' + '```python filename="button.py"\n' + 'print("Hello")\n' + '```') + stripped, _ = strip.strip_md(c_english, "step_1.md") + + self.assertEqual(restore.restore_md(stripped, c_english, "step_1.md", "step_1.md"), c_english) + + def test_strip_meta_yaml_removes_non_translatable_keys(self): + c_initial = ('---\n' + 'title: Test project\n' + 'hero_image: images/banner.png\n' + 'description: A project\n') + + stripped, _ = strip.strip_meta_yaml(c_initial, "meta.yml") + + self.assertIn("title: Test project", stripped) + self.assertIn("description: A project", stripped) + self.assertNotIn("hero_image", stripped) + + def test_restore_meta_yaml_merges_translated_keys_into_english(self): + c_english = ('---\n' + 'title: Test project\n' + 'hero_image: images/banner.png\n' + 'description: A project\n') + c_translated = ('---\n' + 'title: Testproject\n' + 'description: Een project\n') + + restored = restore.restore_meta_yaml(c_translated, c_english) + + self.assertIn("title: Testproject", restored) + self.assertIn("description: Een project", restored) + self.assertIn("hero_image: images/banner.png", restored) + + +if __name__ == "__main__": + unittest.main() From 2bbe3311d31b9171d01e7fd42e64d2cefd6bc503 Mon Sep 17 00:00:00 2001 From: Jo Humphrey <31373245+jamdelion@users.noreply.github.com> Date: Tue, 5 May 2026 12:13:35 +0100 Subject: [PATCH 3/7] Update readme with pipx instructions --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 09c374b..e7a6bf8 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,14 @@ pip3 install . 
--upgrade ![install nttt](images/install_nttt.png) +You could also use `pipx` (instructions below for Mac using homebrew): + +```bash +brew install pipx +pipx install /path/to/project/nttt +nttt --help +``` + You can uninstall nttt using: ```bash From 4f01a1a8fa4dd468b11cb91961a2a0f01f6ddfb1 Mon Sep 17 00:00:00 2001 From: Jo Humphrey <31373245+jamdelion@users.noreply.github.com> Date: Tue, 5 May 2026 12:24:56 +0100 Subject: [PATCH 4/7] Revert "Get AI to implement everything" This reverts commit e95e39310e1f7ed3c99ff5c0156e5cb203771ee8. --- README.md | 29 ---- doc/transformations.md | 78 ++-------- nttt/__init__.py | 59 -------- nttt/arguments.py | 83 ++++------- nttt/cleanup_alerts.py | 100 ------------- nttt/cleanup_codeblocks.py | 130 ----------------- nttt/restore.py | 94 ------------- nttt/strip.py | 203 --------------------------- nttt/tidyup.py | 11 -- unit_test/test_cleanup_alerts.py | 39 ----- unit_test/test_cleanup_codeblocks.py | 40 ------ unit_test/test_strip_restore.py | 106 -------------- 12 files changed, 35 insertions(+), 937 deletions(-) delete mode 100644 nttt/cleanup_alerts.py delete mode 100644 nttt/cleanup_codeblocks.py delete mode 100644 nttt/restore.py delete mode 100644 nttt/strip.py delete mode 100644 unit_test/test_cleanup_alerts.py delete mode 100644 unit_test/test_cleanup_codeblocks.py delete mode 100644 unit_test/test_strip_restore.py diff --git a/README.md b/README.md index e7a6bf8..717760d 100644 --- a/README.md +++ b/README.md @@ -114,23 +114,6 @@ You can specify different directories for the input and output folder using the nttt --input c:\path\to\project\de-DE --output c:\path\to\project\de-DE-tidy ``` -### Crowdin strip and restore - -NTTT can also prepare English source for Crowdin and restore non-translatable markers after translated content is downloaded. - -```bash -# Before upload: hide non-translatable markers from Crowdin. 
-nttt strip --input en --output .crowdin-staging/en - -# After download: restore markers into a translated locale. -nttt restore --input fr-FR --english en --output fr-FR - -# Restore and then run the usual tidy-up in one command. -nttt restore --input fr-FR --english en --output fr-FR --then-tidyup --Yes on -``` - -`strip` replaces legacy section markers, modern alert type tokens, codeblock info strings, inline kramdown class metadata, and non-translatable `meta.yml` keys with deterministic placeholders. `restore` regenerates the placeholder map from `en/`, so no sidecar files need to be persisted between workflows. See [doc/transformations.md](doc/transformations.md) for details. - ### Help To bring up full usage information use the `-h`/`--help` option. @@ -139,8 +122,6 @@ To bring up full usage information use the `-h`/`--help` option. nttt -h usage: nttt [-h] [-i INPUT] [-o OUTPUT] [-e ENGLISH] [-l LANGUAGE] [-v VOLUNTEERS] [-f FINAL] - [-D DISABLE] [-L LOGGING] [-Y YES] - {strip,restore} ... Nina's Translation Tidyup Tool @@ -165,16 +146,6 @@ optional arguments: -f FINAL, --final FINAL The number of the final step file, defaults to the step file with the highest number. - -D DISABLE, --Disable DISABLE - The risky features to be disabled, separated by commas. - Options include fix_md, fix_html, fix_sections, - revert_section_translation, fix_alerts, - revert_alert_translation, fix_codeblocks, and - fix_formatting. - -subcommands: - strip Strip non-translatable markers for Crowdin upload. - restore Restore non-translatable markers after Crowdin download. examples of usage: nttt diff --git a/doc/transformations.md b/doc/transformations.md index bbdfd3b..733a174 100644 --- a/doc/transformations.md +++ b/doc/transformations.md @@ -17,14 +17,11 @@ NTTT does **not** process standalone `.html` files. HTML-related steps run on ** For each `.md` file, [`nttt/tidyup.py`](../nttt/tidyup.py) applies, in order: 1. 
**`fix_sections`** — normalise `---` section lines (Crowdin quirks). -2. **`fix_alerts`** — normalise modern `> [!TYPE]` alert markers. -3. **`revert_section_translation`** — optional; restore English section tag lines when structure matches. -4. **`revert_alert_translation`** — optional; restore English alert type tokens when structure matches. -5. **`trim_md_tags`** — strip padding inside paired Markdown delimiters (outside ` ``` ` fences). -6. **`trim_html_tags`** — strip padding inside simple inline HTML tags (outside single `` ` `` spans). -7. **`fix_codeblocks`** — normalise modern fenced-code info strings. -8. **`trim_formatting_tags`** — normalise `{ … }` attribute blocks after a word (Scratch/Pico-style). -9. **URL rewrite:** replace `/en/` with `//` everywhere in the file body. +2. **`revert_section_translation`** — optional; restore English section tag lines when structure matches. +3. **`trim_md_tags`** — strip padding inside paired Markdown delimiters (outside ` ``` ` fences). +4. **`trim_html_tags`** — strip padding inside simple inline HTML tags (outside single `` ` `` spans). +5. **`trim_formatting_tags`** — normalise `{ … }` attribute blocks after a word (Scratch/Pico-style). +6. **URL rewrite:** replace `/en/` with `//` everywhere in the file body. Steps 1–5 can be skipped via **`--disable`** (see [`nttt/arguments.py`](../nttt/arguments.py)). @@ -52,23 +49,7 @@ Steps 1–5 can be skipped via **`--disable`** (see [`nttt/arguments.py`](../ntt --- -## 2. Modern alerts (`nttt/cleanup_alerts.py`) - -**Function:** `fix_alerts` - -- Normalises modern alert markers such as `> [!TASK]`, `> [!HINT]`, `> [!ACCORDION] Title`, and nested markers such as `> > [!HINT]`. -- Fixes spacing and casing around the marker: `>[ ! task ]` → `> [!TASK]`. -- Preserves any title text after the marker because it is translatable. - -**Function:** `revert_alert_translation` (requires English `.md`) - -- Collects alert marker lines in translation and English. 
-- If **counts and nesting depth match**, replaces only the translated alert type token with the English token. -- If structure differs, logs a warning and leaves alert types unchanged for this step. - ---- - -## 3. Markdown delimiters (`nttt/cleanup_markdown.py`) +## 2. Markdown delimiters (`nttt/cleanup_markdown.py`) **Function:** `trim_md_tags` @@ -82,7 +63,7 @@ Logging can record each replacement (`log_replacement`). --- -## 4. Inline HTML (`nttt/cleanup_html.py`) +## 3. Inline HTML (`nttt/cleanup_html.py`) **Function:** `trim_html_tags` @@ -92,20 +73,7 @@ Logging can record each replacement (`log_replacement`). --- -## 5. Codeblock info strings (`nttt/cleanup_codeblocks.py`) - -**Function:** `fix_codeblocks` - -- Normalises opening fenced-code lines such as ```` ```python filename="button.py" ````. -- Lowercases the language token and attribute keys/values. -- Normalises quotes and spacing around `=`. -- Collapses spaces inside `line_highlights`, e.g. `"3, 5-6"` → `"3,5-6"`. -- If the translated language token is not recognised and the English file is available, restores the English language token at the same fence index. -- Does **not** change code inside the block or closing fences. - ---- - -## 6. Formatting braces (`nttt/cleanup_formatting.py`) +## 4. Formatting braces (`nttt/cleanup_formatting.py`) **Function:** `trim_formatting_tags` @@ -116,7 +84,7 @@ Logging can record each replacement (`log_replacement`). --- -## 7. Locale URLs (`nttt/tidyup.py`) +## 5. Locale URLs (`nttt/tidyup.py`) After cleanup: **replace every `/en/` with `//`** in the Markdown file (`language` from resolved CLI args, defaulting from input folder basename). @@ -130,31 +98,6 @@ After cleanup: **replace every `/en/` with `//`** in the Markdown file --- -## Strip / restore workflow - -`nttt strip` prepares English source for Crowdin by replacing non-translatable markers with deterministic placeholders. 
`nttt restore` regenerates the same placeholder map from `en/` and re-injects the markers into translated files after Crowdin download. - -Typical workflow: - -1. Upload side: `nttt strip -i en/ -o .crowdin-staging/en/` -2. Crowdin translates `.crowdin-staging/en/` -3. Download side: `nttt restore -i fr-FR/ -e en/ -o fr-FR/` -4. Existing tidy-up: `nttt -i fr-FR/ -Y YES` - -Markdown placeholders use HTML comments such as ``. Configure Crowdin to treat `` as non-translatable. - -`strip` currently hides: - -- legacy section marker lines such as `--- task ---` and `--- /task ---` -- modern alert type tokens such as `[!TASK]` -- modern fenced-code info strings such as `python filename="button.py"` -- inline kramdown class metadata such as `{:class="block3looks"}` -- non-translatable `meta.yml` keys by dropping anything outside `title`, `description`, `steps`, `meta_title`, and `meta_description` - -`restore` is safe to run on older Crowdin downloads that do not contain placeholders; it is a no-op for those files. It warns, but does not fail, if placeholders are missing or unknown. - ---- - ## Quick code map | Concern | Module | @@ -162,10 +105,7 @@ Markdown placeholders use HTML comments such as ``. 
Conf | Orchestration | `nttt/tidyup.py`, `nttt/__init__.py` | | CLI / disable flags | `nttt/arguments.py` | | Sections | `nttt/cleanup_sections.py` | -| Modern alerts | `nttt/cleanup_alerts.py` | | Markdown emphasis / code delimiters | `nttt/cleanup_markdown.py` | | Inline HTML | `nttt/cleanup_html.py` | -| Codeblock info strings | `nttt/cleanup_codeblocks.py` | | Brace attributes | `nttt/cleanup_formatting.py` | -| Strip / restore | `nttt/strip.py`, `nttt/restore.py` | | Split "every other segment" | `nttt/utilities.py` → `apply_to_every_other_part` | diff --git a/nttt/__init__.py b/nttt/__init__.py index 348532b..9976cfb 100644 --- a/nttt/__init__.py +++ b/nttt/__init__.py @@ -1,69 +1,10 @@ -import os from .arguments import parse_command_line, resolve_arguments, check_arguments, show_arguments -from .arguments import get_absolute_path, get_final_step -from .constants import ArgumentKeyConstants -from .restore import restore_tree -from .strip import strip_tree from .tidyup import tidyup_translations from ._version import __version__ - def main(): command_line_args = parse_command_line(__version__) - command = getattr(command_line_args, "command", None) - - if command == "strip": - strip_tree( - get_absolute_path(command_line_args.input), - get_absolute_path(command_line_args.output), - command_line_args.debug_sidecars) - return - - if command == "restore": - restore_tree( - get_absolute_path(command_line_args.input), - get_absolute_path(command_line_args.english), - get_absolute_path(command_line_args.output)) - - if command_line_args.then_tidyup: - restored_arguments = build_restore_tidyup_arguments(command_line_args) - show_arguments(restored_arguments) - if check_arguments(restored_arguments): - tidyup_translations(restored_arguments) - return - resolved_arguments = resolve_arguments(command_line_args) show_arguments(resolved_arguments) if (check_arguments(resolved_arguments)): tidyup_translations(resolved_arguments) - - -def 
build_restore_tidyup_arguments(command_line_args): - input_folder = get_absolute_path(command_line_args.input) - output_folder = get_absolute_path(command_line_args.output) - english_folder = get_absolute_path(command_line_args.english) - - arguments = {} - arguments[ArgumentKeyConstants.INPUT] = output_folder - arguments[ArgumentKeyConstants.OUTPUT] = output_folder - arguments[ArgumentKeyConstants.ENGLISH] = english_folder - arguments[ArgumentKeyConstants.LANGUAGE] = os.path.basename(input_folder) - arguments[ArgumentKeyConstants.VOLUNTEERS] = [] - arguments[ArgumentKeyConstants.FINAL] = get_final_step(output_folder) - - if getattr(command_line_args, "Disable", False): - arguments[ArgumentKeyConstants.DISABLE] = command_line_args.Disable.split(",") - else: - arguments[ArgumentKeyConstants.DISABLE] = [] - - if getattr(command_line_args, "Logging", False): - arguments[ArgumentKeyConstants.LOGGING] = command_line_args.Logging - else: - arguments[ArgumentKeyConstants.LOGGING] = "off" - - if getattr(command_line_args, "Yes", False): - arguments[ArgumentKeyConstants.YES] = command_line_args.Yes - else: - arguments[ArgumentKeyConstants.YES] = "off" - - return arguments diff --git a/nttt/arguments.py b/nttt/arguments.py index 9ea8138..35f76b0 100644 --- a/nttt/arguments.py +++ b/nttt/arguments.py @@ -4,45 +4,6 @@ from argparse import ArgumentParser -def add_tidyup_arguments(parser): - parser.add_argument("-i", "--input", help="The input directory which contains the content to tidy up, defaults to the current directory.") - parser.add_argument("-o", "--output", help="The output directory where the upgraded content should be written, defaults to the same as INPUT.") - parser.add_argument("-e", "--english", help="The directory which contains the English files and folders, defaults to INPUT/../en.") - parser.add_argument("-l", "--language", help="The language of the content to be tidied up, defaults to basename(INPUT).") - parser.add_argument("-v", "--volunteers", help="The 
list of volunteers as a comma separated list, defaults to an empty list.") - parser.add_argument("-f", "--final", help="The number of the final step file, defaults to the step file with the highest number.") - parser.add_argument("-D", "--Disable", help="The risky features to be disabled, separated by commas. " - "Options are: fix_md (fix common markdown-related issues), " - "fix_html (fix common issues in HTML-like tags (Return)), " - "fix_sections (fix common issues in section tags (--- hint ---)), " - "revert_section_translation (revert translation for section tags), " - "fix_alerts (fix common issues in modern alert tags (> [!TASK])), " - "revert_alert_translation (revert translation for modern alert types), " - "fix_codeblocks (fix common issues in fenced codeblock info strings), " - "fix_formatting (fix common issues in formatting tags ({:class=\"block3motion\"})). " - "Defaults to all risky features to be enabled.") - parser.add_argument("-L", "--Logging", help="Logging of modifications. Options are on and off. Default is off.") - parser.add_argument("-Y", "--Yes", help="Automatic yes to prompts. " - "If enabled assume 'yes' as answer to all prompts and run non-interactively. " - "Options are on and off. 
Default is off.") - - -def add_strip_arguments(parser): - parser.add_argument("-i", "--input", required=True, help="The English source directory to strip.") - parser.add_argument("-o", "--output", required=True, help="The output directory where the Crowdin-ready copy should be written.") - parser.add_argument("--debug-sidecars", action="store_true", help="Write optional .nttt.json sidecars for inspection.") - - -def add_restore_arguments(parser): - parser.add_argument("-i", "--input", required=True, help="The translated directory to restore.") - parser.add_argument("-e", "--english", required=True, help="The English source directory used to regenerate placeholders.") - parser.add_argument("-o", "--output", required=True, help="The output directory where restored files should be written.") - parser.add_argument("--then-tidyup", action="store_true", help="Run the tidy-up pipeline after restoring.") - parser.add_argument("-D", "--Disable", help="The tidy-up features to disable when --then-tidyup is used, separated by commas.") - parser.add_argument("-L", "--Logging", help="Logging of modifications. Options are on and off. Default is off.") - parser.add_argument("-Y", "--Yes", help="Automatic yes to prompts when --then-tidyup is used. Options are on and off. Default is off.") - - def get_absolute_path(folder): ''' Returns the absolute path for the given folder. 
Trailing path separators @@ -84,15 +45,23 @@ def parse_command_line(version): """ parser = ArgumentParser(description="Nina's Translation Tidyup Tool v{}".format(version)) - add_tidyup_arguments(parser) - subparsers = parser.add_subparsers(dest="command") - - strip_parser = subparsers.add_parser("strip", help="Strip non-translatable markers for Crowdin upload.") - add_strip_arguments(strip_parser) - - restore_parser = subparsers.add_parser("restore", help="Restore non-translatable markers after Crowdin download.") - add_restore_arguments(restore_parser) - + parser.add_argument("-i", "--input", help="The input directory which contains the content to tidy up, defaults to the current directory.") + parser.add_argument("-o", "--output", help="The output directory where the upgraded content should be written, defaults to the same as INPUT.") + parser.add_argument("-e", "--english", help="The directory which contains the English files and folders, defaults to INPUT/../en.") + parser.add_argument("-l", "--language", help="The language of the content to be tidied up, defaults to basename(INPUT).") + parser.add_argument("-v", "--volunteers", help="The list of volunteers as a comma separated list, defaults to an empty list.") + parser.add_argument("-f", "--final", help="The number of the final step file, defaults to the step file with the highest number.") + parser.add_argument("-D", "--Disable", help="The risky features to be disabled, separated by commas. " + "Options are: fix_md (fix common markdown-related issues), " + "fix_html (fix common issues in HTML-like tags (Return)), " + "fix_sections (fix common issues in section tags (--- hint ---)), " + "revert_section_translation (revert translation for section tags), " + "fix_formatting (fix common issues in formatting tags ({:class=\"block3motion\"})). " + "Defaults to all risky features to be enabled.") + parser.add_argument("-L", "--Logging", help="Logging of modifications. Options are on and off. 
Default is off.") + parser.add_argument("-Y", "--Yes", help="Automatic yes to prompts. " + "If enabled assume 'yes' as answer to all prompts and run non-interactively. " + "Options are on and off. Default is off.") return parser.parse_args() @@ -106,47 +75,47 @@ def resolve_arguments(command_line_args): arguments = {} - if getattr(command_line_args, "input", False): + if command_line_args.input: arguments[ArgumentKeyConstants.INPUT] = get_absolute_path(command_line_args.input) else: arguments[ArgumentKeyConstants.INPUT] = get_absolute_path('.') - if getattr(command_line_args, "output", False): + if command_line_args.output: arguments[ArgumentKeyConstants.OUTPUT] = get_absolute_path(command_line_args.output) else: arguments[ArgumentKeyConstants.OUTPUT] = arguments[ArgumentKeyConstants.INPUT] - if getattr(command_line_args, "english", False): + if command_line_args.english: arguments[ArgumentKeyConstants.ENGLISH] = get_absolute_path(command_line_args.english) else: arguments[ArgumentKeyConstants.ENGLISH] = Path(dirname(arguments[ArgumentKeyConstants.INPUT]), 'en') - if getattr(command_line_args, "language", False): + if command_line_args.language: arguments[ArgumentKeyConstants.LANGUAGE] = command_line_args.language else: arguments[ArgumentKeyConstants.LANGUAGE] = basename(arguments[ArgumentKeyConstants.INPUT]) - if getattr(command_line_args, "volunteers", False): + if command_line_args.volunteers: arguments[ArgumentKeyConstants.VOLUNTEERS] = [name.strip() for name in command_line_args.volunteers.split(',')] else: arguments[ArgumentKeyConstants.VOLUNTEERS] = [] - if getattr(command_line_args, "final", False): + if command_line_args.final: arguments[ArgumentKeyConstants.FINAL] = int(command_line_args.final) else: arguments[ArgumentKeyConstants.FINAL] = get_final_step(arguments[ArgumentKeyConstants.INPUT]) - if getattr(command_line_args, "Disable", False): + if command_line_args.Disable: arguments[ArgumentKeyConstants.DISABLE] = command_line_args.Disable.split(",") 
else: arguments[ArgumentKeyConstants.DISABLE] = [] - if getattr(command_line_args, "Logging", False): + if command_line_args.Logging: arguments[ArgumentKeyConstants.LOGGING] = command_line_args.Logging else: arguments[ArgumentKeyConstants.LOGGING] = "off" - if getattr(command_line_args, "Yes", False): + if command_line_args.Yes: arguments[ArgumentKeyConstants.YES] = command_line_args.Yes else: arguments[ArgumentKeyConstants.YES] = "off" diff --git a/nttt/cleanup_alerts.py b/nttt/cleanup_alerts.py deleted file mode 100644 index 11a39cd..0000000 --- a/nttt/cleanup_alerts.py +++ /dev/null @@ -1,100 +0,0 @@ -import re -import sys -from .constants import RegexConstants -from .nttt_logging import log_replacement - - -ALERT_LINE_PATTERN = re.compile( - rf'^(?P[{RegexConstants.SPACES}]*)' - rf'(?P(?:>[{RegexConstants.SPACES}]*)+)' - rf'[\[[][{RegexConstants.SPACES}]*![{RegexConstants.SPACES}]*' - rf'(?P[^\]]]+?)' - rf'[{RegexConstants.SPACES}]*[\]]]' - rf'(?P.*)$' -) - - -def fix_alerts(md_file_content, logging): - lines = md_file_content.split('\n') - fixed_lines = [] - - for line in lines: - fixed_lines.append(fix_alert_line(line, logging)) - - return '\n'.join(fixed_lines) - - -def fix_alert_line(line, logging): - match = ALERT_LINE_PATTERN.match(line) - if match is None: - return line - - alert_type = match.group("alert_type").strip().upper() - prefix = normalise_prefix(match.group("prefix")) - replacement_text = "{}{}[!{}]{}".format( - match.group("indent"), - prefix, - alert_type, - match.group("rest")) - - log_replacement(line, replacement_text, logging) - return replacement_text - - -def normalise_prefix(prefix): - depth = prefix.count(">") - return "> " * depth - - -def revert_alert_translation(md_file_name, md_file_content, en_file_content, logging): - md_file_lines = md_file_content.split('\n') - md_alerts = extract_alerts(md_file_lines) - - en_file_lines = en_file_content.split('\n') - en_alerts = extract_alerts(en_file_lines) - - if len(md_alerts) == 
len(en_alerts): - for i in range(len(md_alerts)): - md_alert = md_alerts[i] - en_alert = en_alerts[i] - - if md_alert["depth"] != en_alert["depth"]: - return warn_and_skip(md_file_name, md_file_content) - - replacement_text = "{}{}[!{}]{}".format( - md_alert["indent"], - md_alert["prefix"], - en_alert["alert_type"], - md_alert["rest"]) - log_replacement(md_file_lines[md_alert["line_num"]], replacement_text, logging) - md_file_lines[md_alert["line_num"]] = replacement_text - - return '\n'.join(md_file_lines) - else: - return warn_and_skip(md_file_name, md_file_content) - - -def warn_and_skip(md_file_name, md_file_content): - print("Warning ({}): Different alert structure in the original (en) and the translated pages. " - "Reverting of translated alert types will not be performed".format(md_file_name), file=sys.stderr) - return md_file_content - - -def extract_alerts(md_file_lines): - alerts = [] - - for i in range(len(md_file_lines)): - line = fix_alert_line(md_file_lines[i], "off") - match = ALERT_LINE_PATTERN.match(line) - if match: - prefix = normalise_prefix(match.group("prefix")) - alerts.append({ - "line_num": i, - "indent": match.group("indent"), - "prefix": prefix, - "depth": prefix.count(">"), - "alert_type": match.group("alert_type").strip().upper(), - "rest": match.group("rest"), - }) - - return alerts diff --git a/nttt/cleanup_codeblocks.py b/nttt/cleanup_codeblocks.py deleted file mode 100644 index 9ac69b1..0000000 --- a/nttt/cleanup_codeblocks.py +++ /dev/null @@ -1,130 +0,0 @@ -import re -from .constants import RegexConstants -from .nttt_logging import log_replacement - - -FENCE_PATTERN = re.compile(r'^(?P\s*)```(?P.*)$') -ATTR_PATTERN = re.compile( - rf'(?P[\w-]+)[{RegexConstants.SPACES}]*=[{RegexConstants.SPACES}]*' - rf'(?P[{RegexConstants.QUOTES}])(?P.*?)(?P=quote)' -) - -KNOWN_LANGUAGES = { - "bash", - "c", - "cpp", - "css", - "html", - "javascript", - "js", - "json", - "markdown", - "python", - "scratch3", - "shell", - "text", - "typescript", - 
"yaml", -} - - -def fix_codeblocks(md_file_content, english_file_content, logging): - english_infos = [] - if english_file_content is not None: - english_infos = extract_opening_fence_infos(english_file_content) - - lines = md_file_content.split('\n') - fixed_lines = [] - in_fence = False - opening_fence_index = 0 - - for line in lines: - match = FENCE_PATTERN.match(line) - if match: - if not in_fence: - english_info = None - if opening_fence_index < len(english_infos): - english_info = english_infos[opening_fence_index] - - fixed_lines.append(fix_opening_fence(line, english_info, logging)) - opening_fence_index += 1 - in_fence = True - else: - fixed_lines.append(line) - in_fence = False - else: - fixed_lines.append(line) - - return '\n'.join(fixed_lines) - - -def extract_opening_fence_infos(md_file_content): - infos = [] - in_fence = False - - for line in md_file_content.split('\n'): - match = FENCE_PATTERN.match(line) - if match: - if not in_fence: - infos.append(match.group("info").strip()) - in_fence = True - else: - in_fence = False - - return infos - - -def fix_opening_fence(line, english_info, logging): - match = FENCE_PATTERN.match(line) - if match is None: - return line - - info = normalise_quotes(match.group("info").strip()) - if info == "": - return line - - fixed_info = normalise_info_string(info, english_info) - replacement_text = "{}```{}".format(match.group("indent"), fixed_info) - log_replacement(line, replacement_text, logging) - return replacement_text - - -def normalise_info_string(info, english_info=None): - attr_matches = list(ATTR_PATTERN.finditer(info)) - attr_start = attr_matches[0].start() if attr_matches else len(info) - lang = info[:attr_start].strip().lower() - - if lang not in KNOWN_LANGUAGES and english_info: - english_lang = extract_language(english_info) - if english_lang: - lang = english_lang - - attrs = [] - for match in attr_matches: - key = match.group("key").lower() - value = match.group("value").strip().lower() - if key == 
"line_highlights": - value = re.sub(r'\s+', '', value) - attrs.append('{}="{}"'.format(key, value)) - - if len(attrs) == 0: - return lang - - if lang == "": - return " ".join(attrs) - - return "{} {}".format(lang, " ".join(attrs)) - - -def extract_language(info): - info = normalise_quotes(info.strip()) - attr_match = ATTR_PATTERN.search(info) - if attr_match: - return info[:attr_match.start()].strip().lower() - return info.strip().lower() - - -def normalise_quotes(text): - for quote in RegexConstants.QUOTES: - text = text.replace(quote, '"') - return text diff --git a/nttt/restore.py b/nttt/restore.py deleted file mode 100644 index d9714a2..0000000 --- a/nttt/restore.py +++ /dev/null @@ -1,94 +0,0 @@ -import io -import os -import shutil -import sys -from .constants import GeneralConstants -from .strip import PLACEHOLDER_PATTERN, TRANSLATABLE_META_KEYS, build_token_map, yaml_for_round_trip -from .utilities import get_file, save_file - - -def restore_tree(input_folder, english_folder, output_folder): - for dname, _, files in os.walk(input_folder): - for fname in files: - if fname.endswith(".nttt.json"): - continue - - source_file_path = os.path.join(dname, fname) - relative_file_name = os.path.relpath(source_file_path, input_folder) - english_file_path = os.path.join(english_folder, relative_file_name) - output_file_path = os.path.join(output_folder, relative_file_name) - output_file_folder = os.path.dirname(output_file_path) - - if not os.path.exists(output_file_folder): - os.makedirs(output_file_folder) - - if fname == GeneralConstants.FILE_NAME_META_YML and os.path.isfile(english_file_path): - restored_content, suggested_eol = restore_meta_file(source_file_path, english_file_path) - save_file(output_file_path, restored_content, suggested_eol) - elif os.path.splitext(fname)[1] == ".md" and os.path.isfile(english_file_path): - restored_content, suggested_eol = restore_md_file( - source_file_path, - english_file_path, - relative_file_name) - 
save_file(output_file_path, restored_content, suggested_eol) - elif os.path.abspath(source_file_path) != os.path.abspath(output_file_path): - shutil.copyfile(source_file_path, output_file_path) - - -def restore_md_file(source_file_path, english_file_path, relative_file_name): - content, suggested_eol = get_file(source_file_path) - english_content, _ = get_file(english_file_path) - restored_content = restore_md(content, english_content, relative_file_name, source_file_path) - return restored_content, suggested_eol - - -def restore_md(content, english_content, relative_file_name, md_file_name): - token_map = build_token_map(relative_file_name, english_content) - if "NTTT:" not in content: - return content - - placeholders_in_content = set(match.group() for match in PLACEHOLDER_PATTERN.finditer(content)) - missing_placeholders = sorted(set(token_map) - placeholders_in_content) - unknown_placeholders = sorted(placeholders_in_content - set(token_map)) - - if missing_placeholders: - print("Warning ({}): Missing NTTT placeholders: {}".format( - md_file_name, - ", ".join(missing_placeholders)), file=sys.stderr) - - if unknown_placeholders: - print("Warning ({}): Unknown NTTT placeholders: {}".format( - md_file_name, - ", ".join(unknown_placeholders)), file=sys.stderr) - - restored_content = content - for placeholder in token_map: - restored_content = restored_content.replace(placeholder, token_map[placeholder]["value"]) - - return restored_content - - -def restore_meta_file(source_file_path, english_file_path): - content, suggested_eol = get_file(source_file_path) - english_content, _ = get_file(english_file_path) - return restore_meta_yaml(content, english_content), suggested_eol - - -def restore_meta_yaml(content, english_content): - yaml_parser = yaml_for_round_trip() - parsed_md = yaml_parser.load(content) - english_parsed_md = yaml_parser.load(english_content) - - if parsed_md is None: - return english_content - - if english_parsed_md is None: - return content - - for 
key in TRANSLATABLE_META_KEYS: - if key in parsed_md: - english_parsed_md[key] = parsed_md[key] - - string_buffer = io.StringIO() - yaml_parser.dump(english_parsed_md, string_buffer) - return string_buffer.getvalue() diff --git a/nttt/strip.py b/nttt/strip.py deleted file mode 100644 index 198b0e4..0000000 --- a/nttt/strip.py +++ /dev/null @@ -1,203 +0,0 @@ -import hashlib -import io -import json -import os -import re -import shutil -import ruamel.yaml -from .cleanup_alerts import ALERT_LINE_PATTERN, normalise_prefix -from .cleanup_codeblocks import FENCE_PATTERN -from .constants import GeneralConstants, RegexConstants -from .utilities import get_file, save_file - - -TRANSLATABLE_META_KEYS = ["title", "description", "steps", "meta_title", "meta_description"] -SECTION_PATTERN = re.compile(r'^(?P\s*)--- (?P.+?) ---(?P\s*)$') -KRAMDOWN_CLASS_PATTERN = re.compile(rf'\{{:\s*class\s*=\s*[{RegexConstants.QUOTES}].+?[{RegexConstants.QUOTES}]\s*\}}') -HERO_IMAGE_PATTERN = re.compile(r'^(?P\s*)hero_image:\s+images/.+$') -PLACEHOLDER_PATTERN = re.compile(r'') -YAML_PLACEHOLDER_PATTERN = re.compile(r'__NTTT_(?P[a-f0-9]{6}_\d{3})__') - - -def strip_tree(input_folder, output_folder, debug_sidecars=False): - for dname, _, files in os.walk(input_folder): - for fname in files: - source_file_path = os.path.join(dname, fname) - relative_file_name = os.path.relpath(source_file_path, input_folder) - output_file_path = os.path.join(output_folder, relative_file_name) - output_file_folder = os.path.dirname(output_file_path) - - if not os.path.exists(output_file_folder): - os.makedirs(output_file_folder) - - if fname == GeneralConstants.FILE_NAME_META_YML: - stripped_content, token_map, suggested_eol = strip_meta_file(source_file_path, relative_file_name) - save_file(output_file_path, stripped_content, suggested_eol) - elif os.path.splitext(fname)[1] == ".md": - stripped_content, token_map, suggested_eol = strip_md_file(source_file_path, relative_file_name) - save_file(output_file_path, 
stripped_content, suggested_eol) - else: - shutil.copyfile(source_file_path, output_file_path) - token_map = {} - - if debug_sidecars and token_map: - write_debug_sidecar(output_file_path, source_file_path, token_map) - - -def strip_md_file(source_file_path, relative_file_name): - content, suggested_eol = get_file(source_file_path) - stripped_content, token_map = strip_md(content, relative_file_name) - return stripped_content, token_map, suggested_eol - - -def strip_meta_file(source_file_path, relative_file_name): - content, suggested_eol = get_file(source_file_path) - stripped_content, token_map = strip_meta_yaml(content, relative_file_name) - return stripped_content, token_map, suggested_eol - - -def strip_md(content, relative_file_name): - generator = TokenGenerator(relative_file_name) - token_map = {} - stripped_lines = [] - in_fence = False - - for line in content.split('\n'): - fence_match = FENCE_PATTERN.match(line) - if fence_match: - if not in_fence: - info = fence_match.group("info").strip() - if info: - placeholder = generator.next_markdown_placeholder() - token_map[placeholder] = {"kind": "code_fence_info", "value": info} - line = "{}```{}".format(fence_match.group("indent"), placeholder) - in_fence = True - else: - in_fence = False - - stripped_lines.append(line) - continue - - if in_fence: - stripped_lines.append(line) - continue - - section_match = SECTION_PATTERN.match(line) - if section_match: - placeholder = generator.next_markdown_placeholder() - token_map[placeholder] = {"kind": "section", "value": line} - stripped_lines.append("{}{}{}".format( - section_match.group("indent"), - placeholder, - section_match.group("trailing"))) - continue - - hero_match = HERO_IMAGE_PATTERN.match(line) - if hero_match: - placeholder = generator.next_markdown_placeholder() - token_map[placeholder] = {"kind": "hero_image", "value": line} - stripped_lines.append("{}{}".format(hero_match.group("indent"), placeholder)) - continue - - line = strip_alert_line(line, 
generator, token_map) - line = strip_kramdown_classes(line, generator, token_map) - stripped_lines.append(line) - - return '\n'.join(stripped_lines), token_map - - -def strip_alert_line(line, generator, token_map): - match = ALERT_LINE_PATTERN.match(line) - if match is None: - return line - - placeholder = generator.next_markdown_placeholder() - token_map[placeholder] = { - "kind": "alert_type", - "value": "[!{}]".format(match.group("alert_type").strip().upper()), - } - prefix = normalise_prefix(match.group("prefix")) - return "{}{}{}{}".format(match.group("indent"), prefix, placeholder, match.group("rest")) - - -def strip_kramdown_classes(line, generator, token_map): - def replace(match): - placeholder = generator.next_markdown_placeholder() - token_map[placeholder] = {"kind": "kramdown_class", "value": match.group()} - return placeholder - - return KRAMDOWN_CLASS_PATTERN.sub(replace, line) - - -def strip_meta_yaml(content, relative_file_name): - yaml_parser = yaml_for_round_trip() - parsed_md = yaml_parser.load(content) - if parsed_md is None: - return content, {} - - stripped_md = type(parsed_md)() - for key in parsed_md: - if key in TRANSLATABLE_META_KEYS: - stripped_md[key] = parsed_md[key] - - string_buffer = io.StringIO() - yaml_parser.dump(stripped_md, string_buffer) - return string_buffer.getvalue(), {} - - -def build_token_map(relative_file_name, content): - _, token_map = strip_md(content, relative_file_name) - return token_map - - -def write_debug_sidecar(output_file_path, source_file_path, token_map): - source_content, _ = get_file(source_file_path) - sidecar = { - "version": 1, - "source_sha256": hashlib.sha256(source_content.encode("utf-8")).hexdigest(), - "tokens": normalise_token_map_for_json(token_map), - } - with open(output_file_path + ".nttt.json", encoding="utf-8", mode="w") as f: - json.dump(sidecar, f, indent=2, ensure_ascii=False) - - -def normalise_token_map_for_json(token_map): - tokens = {} - for placeholder in token_map: - token = 
placeholder_to_token(placeholder) - tokens[token[-3:]] = token_map[placeholder] - return tokens - - -def placeholder_to_token(placeholder): - match = PLACEHOLDER_PATTERN.search(placeholder) - if match: - return match.group("token") - match = YAML_PLACEHOLDER_PATTERN.search(placeholder) - if match: - return match.group("token").replace("_", "-") - return placeholder - - -def yaml_for_round_trip(): - yaml_parser = ruamel.yaml.YAML(typ='rt') - yaml_parser.preserve_quotes = True - yaml_parser.constructor.yaml_constructors.pop(u'tag:yaml.org,2002:timestamp', None) - yaml_parser.indent(sequence=4, offset=2) - yaml_parser.explicit_start = True - yaml_parser.width = 1000000 - return yaml_parser - - -class TokenGenerator: - def __init__(self, relative_file_name): - self.salt = hashlib.sha256(relative_file_name.encode("utf-8")).hexdigest()[:6] - self.index = 0 - - def next_markdown_placeholder(self): - self.index += 1 - return "".format(self.salt, self.index) - - def next_yaml_placeholder(self): - self.index += 1 - return "__NTTT_{}_{:03d}__".format(self.salt, self.index) diff --git a/nttt/tidyup.py b/nttt/tidyup.py index b500214..433c569 100644 --- a/nttt/tidyup.py +++ b/nttt/tidyup.py @@ -10,9 +10,6 @@ from .cleanup_formatting import trim_formatting_tags from .cleanup_sections import fix_sections from .cleanup_sections import revert_section_translation -from .cleanup_alerts import fix_alerts -from .cleanup_alerts import revert_alert_translation -from .cleanup_codeblocks import fix_codeblocks def fix_meta(src, english_src, dst): @@ -66,20 +63,12 @@ def fix_md_step(src, lang, english_src, dst, disable, logging): if en_md_content is not None and "revert_section_translation" not in disable: md_content = revert_section_translation(src, md_content, en_md_content, logging) - if "fix_alerts" not in disable: - md_content = fix_alerts(md_content, logging) - if en_md_content is not None and "revert_alert_translation" not in disable: - md_content = revert_alert_translation(src, 
md_content, en_md_content, logging) - if "fix_md" not in disable: md_content = trim_md_tags(md_content, logging) if "fix_html" not in disable: md_content = trim_html_tags(md_content, logging) - if "fix_codeblocks" not in disable: - md_content = fix_codeblocks(md_content, en_md_content, logging) - if "fix_formatting" not in disable: md_content = trim_formatting_tags(md_content, logging) diff --git a/unit_test/test_cleanup_alerts.py b/unit_test/test_cleanup_alerts.py deleted file mode 100644 index 905af50..0000000 --- a/unit_test/test_cleanup_alerts.py +++ /dev/null @@ -1,39 +0,0 @@ -import unittest -from nttt import cleanup_alerts - - -class TestCleanupAlerts(unittest.TestCase): - logging = "off" - - def test_fix_alert_spacing_and_case(self): - c_initial = ">[ ! task ] Complete this step." - c_target = "> [!TASK] Complete this step." - - self.assertEqual(cleanup_alerts.fix_alerts(c_initial, self.logging), c_target) - - def test_fix_nested_alert(self): - c_initial = "> >[!hint]\n> >\n> > Try this." - c_target = "> > [!HINT]\n> >\n> > Try this." 
- - self.assertEqual(cleanup_alerts.fix_alerts(c_initial, self.logging), c_target) - - def test_revert_alert_translation_preserves_title(self): - c_initial = "> [!TAAK] Uitdaging: Verbeter je drum" - c_english = "> [!CHALLENGE] Challenge: Improving your drum" - c_target = "> [!CHALLENGE] Uitdaging: Verbeter je drum" - - self.assertEqual( - cleanup_alerts.revert_alert_translation("step_1.md", c_initial, c_english, self.logging), - c_target) - - def test_revert_alert_translation_skips_when_structure_differs(self): - c_initial = "> [!TAAK]\n\n> [!HINT]" - c_english = "> [!TASK]" - - self.assertEqual( - cleanup_alerts.revert_alert_translation("step_1.md", c_initial, c_english, self.logging), - c_initial) - - -if __name__ == "__main__": - unittest.main() diff --git a/unit_test/test_cleanup_codeblocks.py b/unit_test/test_cleanup_codeblocks.py deleted file mode 100644 index f1c570b..0000000 --- a/unit_test/test_cleanup_codeblocks.py +++ /dev/null @@ -1,40 +0,0 @@ -import unittest -from nttt import cleanup_codeblocks - - -class TestCleanupCodeblocks(unittest.TestCase): - logging = "off" - - def test_fix_codeblock_info_string(self): - c_initial = ('```Python filename = "Button_Press.py" line_numbers = "TRUE" line_highlights = "3, 5-6"\n' - 'print("Hello")\n' - '```') - c_target = ('```python filename="button_press.py" line_numbers="true" line_highlights="3,5-6"\n' - 'print("Hello")\n' - '```') - - self.assertEqual(cleanup_codeblocks.fix_codeblocks(c_initial, None, self.logging), c_target) - - def test_fix_codeblock_uses_english_language_for_unknown_translated_language(self): - c_initial = ('```pythone filename="button.py"\n' - 'print("Hello")\n' - '```') - c_english = ('```python filename="button.py"\n' - 'print("Hello")\n' - '```') - c_target = ('```python filename="button.py"\n' - 'print("Hello")\n' - '```') - - self.assertEqual(cleanup_codeblocks.fix_codeblocks(c_initial, c_english, self.logging), c_target) - - def test_does_not_change_plain_fences(self): - c_initial = 
('```\n' - 'filename = "Button_Press.py"\n' - '```') - - self.assertEqual(cleanup_codeblocks.fix_codeblocks(c_initial, None, self.logging), c_initial) - - -if __name__ == "__main__": - unittest.main() diff --git a/unit_test/test_strip_restore.py b/unit_test/test_strip_restore.py deleted file mode 100644 index d522465..0000000 --- a/unit_test/test_strip_restore.py +++ /dev/null @@ -1,106 +0,0 @@ -import unittest -from nttt import restore -from nttt import strip - - -class TestStripRestore(unittest.TestCase): - def test_strip_md_replaces_non_translatable_markers(self): - c_initial = ('--- task ---\n' - '\n' - 'Complete this step.\n' - '\n' - '> [!HINT]\n' - '>\n' - '> Try this.\n' - '\n' - '```python filename="button.py"\n' - 'print("Hello")\n' - '```\n' - '\n' - 'The `Looks`{:class="block3looks"} category.\n' - '\n' - '--- /task ---') - - stripped, token_map = strip.strip_md(c_initial, "step_1.md") - - self.assertIn("\nTranslated text." - c_english = "English text." - - self.assertEqual(restore.restore_md(c_initial, c_english, "step_1.md", "step_1.md"), c_initial) - - def test_strip_restore_roundtrip_is_identity_for_english(self): - c_english = ('> [!TASK]\n' - '>\n' - '> Complete this step.\n' - '\n' - '```python filename="button.py"\n' - 'print("Hello")\n' - '```') - stripped, _ = strip.strip_md(c_english, "step_1.md") - - self.assertEqual(restore.restore_md(stripped, c_english, "step_1.md", "step_1.md"), c_english) - - def test_strip_meta_yaml_removes_non_translatable_keys(self): - c_initial = ('---\n' - 'title: Test project\n' - 'hero_image: images/banner.png\n' - 'description: A project\n') - - stripped, _ = strip.strip_meta_yaml(c_initial, "meta.yml") - - self.assertIn("title: Test project", stripped) - self.assertIn("description: A project", stripped) - self.assertNotIn("hero_image", stripped) - - def test_restore_meta_yaml_merges_translated_keys_into_english(self): - c_english = ('---\n' - 'title: Test project\n' - 'hero_image: images/banner.png\n' - 
'description: A project\n') - c_translated = ('---\n' - 'title: Testproject\n' - 'description: Een project\n') - - restored = restore.restore_meta_yaml(c_translated, c_english) - - self.assertIn("title: Testproject", restored) - self.assertIn("description: Een project", restored) - self.assertIn("hero_image: images/banner.png", restored) - - -if __name__ == "__main__": - unittest.main() From 26aa04e11d38b016a8ab2bc0dc5fe5903533cad4 Mon Sep 17 00:00:00 2001 From: Jo Humphrey <31373245+jamdelion@users.noreply.github.com> Date: Tue, 5 May 2026 14:48:22 +0100 Subject: [PATCH 5/7] New AI attempt --- README.md | 22 ++++ doc/transformations.md | 35 +++++- nttt/__init__.py | 16 ++- nttt/arguments.py | 13 ++- nttt/constants.py | 7 ++ nttt/markers.py | 72 ++++++++++++ nttt/restore.py | 135 ++++++++++++++++++++++ nttt/strip.py | 72 ++++++++++++ nttt/tidyup.py | 5 + unit_test/test_arguments.py | 4 + unit_test/test_markers.py | 52 +++++++++ unit_test/test_restore.py | 57 +++++++++ unit_test/test_strip.py | 67 +++++++++++ unit_test/test_strip_restore_roundtrip.py | 26 +++++ 14 files changed, 575 insertions(+), 8 deletions(-) create mode 100644 nttt/markers.py create mode 100644 nttt/restore.py create mode 100644 nttt/strip.py create mode 100644 unit_test/test_markers.py create mode 100644 unit_test/test_restore.py create mode 100644 unit_test/test_strip.py create mode 100644 unit_test/test_strip_restore_roundtrip.py diff --git a/README.md b/README.md index 717760d..e8f2214 100644 --- a/README.md +++ b/README.md @@ -114,6 +114,28 @@ You can specify different directories for the input and output folder using the nttt --input c:\path\to\project\de-DE --output c:\path\to\project\de-DE-tidy ``` +### Crowdin marker stripping and restoring + +NTTT has three processing modes: + +- `tidy` (default): restore stripped Markdown markers for non-English locale folders, then run the existing tidy-up transforms. 
+- `strip`: remove non-translatable Markdown markers before uploading English source files to Crowdin. +- `restore`: reinsert stripped Markdown markers into translated files after downloading from Crowdin. + +Use `strip` on the English source folder before Crowdin upload: + +```bash +nttt --mode strip -i en -o en -Y on +``` + +Use `restore` on a translated locale folder after Crowdin download: + +```bash +nttt --mode restore -i de-DE -e en -o de-DE -Y on +``` + +Modern bare markers such as `> [!TASK]` are removed entirely, along with their paired empty `>` line. Modern labelled markers such as `> [!ACCORDION] Where are my voice recordings stored?` keep the label available for translation by becoming `> Where are my voice recordings stored?`; restore reinserts `[!ACCORDION]` before the translated label. Legacy markers such as `--- task ---` and `--- /task ---` are also removed and restored by line alignment against `en/`. + ### Help To bring up full usage information use the `-h`/`--help` option. diff --git a/doc/transformations.md b/doc/transformations.md index 733a174..2573823 100644 --- a/doc/transformations.md +++ b/doc/transformations.md @@ -16,12 +16,13 @@ NTTT does **not** process standalone `.html` files. HTML-related steps run on ** For each `.md` file, [`nttt/tidyup.py`](../nttt/tidyup.py) applies, in order: -1. **`fix_sections`** — normalise `---` section lines (Crowdin quirks). -2. **`revert_section_translation`** — optional; restore English section tag lines when structure matches. -3. **`trim_md_tags`** — strip padding inside paired Markdown delimiters (outside ` ``` ` fences). -4. **`trim_html_tags`** — strip padding inside simple inline HTML tags (outside single `` ` `` spans). -5. **`trim_formatting_tags`** — normalise `{ … }` attribute blocks after a word (Scratch/Pico-style). -6. **URL rewrite:** replace `/en/` with `//` everywhere in the file body. +1. 
**`restore_tree`** — for non-English locale folders, restore Markdown markers stripped before Crowdin upload. +2. **`fix_sections`** — normalise `---` section lines (Crowdin quirks). +3. **`revert_section_translation`** — optional; restore English section tag lines when structure matches. +4. **`trim_md_tags`** — strip padding inside paired Markdown delimiters (outside ` ``` ` fences). +5. **`trim_html_tags`** — strip padding inside simple inline HTML tags (outside single `` ` `` spans). +6. **`trim_formatting_tags`** — normalise `{ … }` attribute blocks after a word (Scratch/Pico-style). +7. **URL rewrite:** replace `/en/` with `//` everywhere in the file body. Steps 1–5 can be skipped via **`--disable`** (see [`nttt/arguments.py`](../nttt/arguments.py)). @@ -29,6 +30,28 @@ Steps 1–5 can be skipped via **`--disable`** (see [`nttt/arguments.py`](../ntt --- +## Crowdin marker strip/restore (`nttt/strip.py`, `nttt/restore.py`) + +**Modes:** `--mode strip`, `--mode restore`, and default `--mode tidy`. + +| Mode | Behaviour | +|------|-----------| +| `strip` | Runs on `en/` before Crowdin upload. Removes structural-only markers and keeps labelled marker text translatable. | +| `restore` | Runs on a locale folder after Crowdin download. Rebuilds markers from the matching English file. | +| `tidy` | For non-English locale folders, runs restore first, then the existing tidy transforms. | + +**Marker classification (`nttt/markers.py`):** + +| Kind | Pattern | Strip output | Restore output | +|------|---------|--------------|----------------| +| Modern bare | `> [!TASK]`, `> [!SAVE]`, nested forms like `> > [!HINT]` | Dropped. A following empty blockquote line (`>`, `> >`) is also dropped. | Copied back from `en/`. | +| Modern labelled | `> [!ACCORDION] Where are my voice recordings stored?` | Rewritten to `> Where are my voice recordings stored?`. | Rewritten to `> [!ACCORDION] `. 
| +| Legacy bare | `--- task ---`, `--- /task ---`, `--- print-only ---`, `--- feedback ---` | Dropped. | Copied back from `en/`. | + +Restore uses line-index alignment against the stripped English file. If the translated file has a different number of lines from the stripped English reference, NTTT logs a warning and leaves that file unchanged for this step. + +Fenced code blocks split by ` ``` ` are not stripped. + ## 1. Section markers (`nttt/cleanup_sections.py`) **Function:** `fix_sections` diff --git a/nttt/__init__.py b/nttt/__init__.py index 9976cfb..546ab30 100644 --- a/nttt/__init__.py +++ b/nttt/__init__.py @@ -1,4 +1,7 @@ from .arguments import parse_command_line, resolve_arguments, check_arguments, show_arguments +from .constants import ArgumentKeyConstants, Modes +from .restore import restore_tree +from .strip import strip_tree from .tidyup import tidyup_translations from ._version import __version__ @@ -7,4 +10,15 @@ def main(): resolved_arguments = resolve_arguments(command_line_args) show_arguments(resolved_arguments) if (check_arguments(resolved_arguments)): - tidyup_translations(resolved_arguments) + mode = resolved_arguments[ArgumentKeyConstants.MODE] + if mode == Modes.STRIP: + strip_tree( + resolved_arguments[ArgumentKeyConstants.INPUT], + resolved_arguments[ArgumentKeyConstants.OUTPUT]) + elif mode == Modes.RESTORE: + restore_tree( + resolved_arguments[ArgumentKeyConstants.INPUT], + resolved_arguments[ArgumentKeyConstants.ENGLISH], + resolved_arguments[ArgumentKeyConstants.OUTPUT]) + else: + tidyup_translations(resolved_arguments) diff --git a/nttt/arguments.py b/nttt/arguments.py index 35f76b0..6e2ca09 100644 --- a/nttt/arguments.py +++ b/nttt/arguments.py @@ -1,4 +1,4 @@ -from .constants import ArgumentKeyConstants +from .constants import ArgumentKeyConstants, Modes import os from pathlib import Path from argparse import ArgumentParser @@ -51,6 +51,11 @@ def parse_command_line(version): parser.add_argument("-l", "--language", help="The 
language of the content to be tidied up, defaults to basename(INPUT).") parser.add_argument("-v", "--volunteers", help="The list of volunteers as a comma separated list, defaults to an empty list.") parser.add_argument("-f", "--final", help="The number of the final step file, defaults to the step file with the highest number.") + parser.add_argument("-m", "--mode", choices=[Modes.TIDY, Modes.STRIP, Modes.RESTORE], + help="The processing mode. Options are: tidy (default cleanup), " + "strip (remove non-translatable structural markers before Crowdin upload), " + "restore (restore stripped structural markers after Crowdin download). " + "Default is tidy.") parser.add_argument("-D", "--Disable", help="The risky features to be disabled, separated by commas. " "Options are: fix_md (fix common markdown-related issues), " "fix_html (fix common issues in HTML-like tags (Return)), " @@ -120,6 +125,11 @@ def resolve_arguments(command_line_args): else: arguments[ArgumentKeyConstants.YES] = "off" + if hasattr(command_line_args, "mode") and command_line_args.mode: + arguments[ArgumentKeyConstants.MODE] = command_line_args.mode + else: + arguments[ArgumentKeyConstants.MODE] = Modes.TIDY + return arguments @@ -138,6 +148,7 @@ def show_arguments(arguments): print("Disabled functions - '{}'".format(arguments[ArgumentKeyConstants.DISABLE])) print("Logging - '{}'".format(arguments[ArgumentKeyConstants.LOGGING])) print("Yes - '{}'".format(arguments[ArgumentKeyConstants.YES])) + print("Mode - '{}'".format(arguments[ArgumentKeyConstants.MODE])) def check_folder(folder): diff --git a/nttt/constants.py b/nttt/constants.py index ce14cee..1b08b17 100644 --- a/nttt/constants.py +++ b/nttt/constants.py @@ -17,6 +17,13 @@ class ArgumentKeyConstants: DISABLE = 'DISABLE' LOGGING = 'LOGGING' YES = 'YES' + MODE = 'MODE' + + +class Modes: + TIDY = "tidy" + STRIP = "strip" + RESTORE = "restore" class RegexConstants: diff --git a/nttt/markers.py b/nttt/markers.py new file mode 100644 index 
0000000..9fc1fd8 --- /dev/null +++ b/nttt/markers.py @@ -0,0 +1,72 @@ +import re + + +LINE_KIND_BARE_MARKER = "bare" +LINE_KIND_LABELLED_MARKER = "labelled" +LINE_KIND_PAIRED_EMPTY_BLOCKQUOTE = "paired_empty_blockquote" +LINE_KIND_REGULAR = "regular" + + +MODERN_BARE_MARKER_PATTERN = re.compile( + r'^(?P\s*(?:>\s*)+)\[!(?P[A-Z][A-Z0-9_-]*)\]\s*$' +) + +MODERN_LABELLED_MARKER_PATTERN = re.compile( + r'^(?P\s*(?:>\s*)+)\[!(?P[A-Z][A-Z0-9_-]*)\]\s+(?P