|
43 | 43 | Phase 1: tokenize title + body separately for every article in the wiki. |
44 | 44 | Per-article record: url@@@title@@@cat@@@|title_toks|@@@|body_toks| |
45 | 45 | Article separator: ### |
| 46 | + Per-article tokenization is factored into _includes/see-also-tokenize.html |
| 47 | + so the same logic runs for both regular `cat.children` entries and the rare |
| 48 | + parent-as-page cat (e.g. "Robotics Project Guide" → master-guide.md). |
46 | 49 | ============================================================ {%- endcomment -%} |
47 | 50 | {%- assign blob = "" -%} |
48 | 51 | {%- for cat in site.data.navigation.wiki -%} |
49 | 52 | {%- if cat.title == "Overview" -%}{%- continue -%}{%- endif -%} |
50 | | - {%- if cat.children -%} |
51 | | - {%- for child in cat.children -%} |
52 | | - {%- assign norm_url = child.url | append: "/" | replace: "//", "/" -%} |
53 | | - {%- assign page = site.pages | where: "url", norm_url | first -%} |
54 | | - |
55 | | - {%- assign title_text = child.title | downcase -%} |
56 | | - {%- assign title_text = title_text | replace: ".", " " | replace: ",", " " | replace: ";", " " | replace: ":", " " | replace: "(", " " | replace: ")", " " | replace: "[", " " | replace: "]", " " | replace: "{", " " | replace: "}", " " | replace: "/", " " | replace: "-", " " | replace: "_", " " | replace: "&", " " | replace: "?", " " | replace: "!", " " | replace: "=", " " | replace: "+", " " | replace: "*", " " | replace: "'", " " | replace: '"', " " | replace: "\\", " " | replace: "<", " " | replace: ">", " " | replace: "#", " " -%} |
57 | | - {%- assign title_wrapped = "|" -%} |
58 | | - {%- assign title_raw = title_text | split: " " -%} |
59 | | - {%- for tok in title_raw -%} |
60 | | - {%- assign t = tok | strip -%} |
61 | | - {%- if t.size < 3 or t.size > 24 -%}{%- continue -%}{%- endif -%} |
62 | | - {%- if STOP contains t -%}{%- continue -%}{%- endif -%} |
63 | | - {%- assign first_char = t | slice: 0, 1 -%} |
64 | | - {%- if first_char == "0" or first_char == "1" or first_char == "2" or first_char == "3" or first_char == "4" or first_char == "5" or first_char == "6" or first_char == "7" or first_char == "8" or first_char == "9" -%}{%- continue -%}{%- endif -%} |
65 | | - {%- comment -%} Light stemming: strip trailing 's' to unify plurals |
66 | | - (apriltags->apriltag, sensors->sensor). Skip if "ss" (address) or |
67 | | - short (toss). Naive but effective on technical English. {%- endcomment -%} |
68 | | - {%- assign last2 = t | slice: -2, 2 -%} |
69 | | - {%- if t.size > 4 and last2 != "ss" -%} |
70 | | - {%- assign last1 = t | slice: -1, 1 -%} |
71 | | - {%- if last1 == "s" -%} |
72 | | - {%- assign sz_m = t.size | minus: 1 -%} |
73 | | - {%- assign t = t | slice: 0, sz_m -%} |
74 | | - {%- endif -%} |
75 | | - {%- endif -%} |
76 | | - {%- assign needle = "|" | append: t | append: "|" -%} |
77 | | - {%- if title_wrapped contains needle -%}{%- continue -%}{%- endif -%} |
78 | | - {%- assign title_wrapped = title_wrapped | append: t | append: "|" -%} |
79 | | - {%- endfor -%} |
80 | | - |
81 | | - {%- assign body_wrapped = "|" -%} |
82 | | - {%- if page and page.content -%} |
83 | | - {%- assign body_text = page.content | strip_html | strip_newlines | truncate: BODY_LEAD_CHARS, "" | downcase -%} |
84 | | - {%- assign body_text = body_text | replace: ".", " " | replace: ",", " " | replace: ";", " " | replace: ":", " " | replace: "(", " " | replace: ")", " " | replace: "[", " " | replace: "]", " " | replace: "{", " " | replace: "}", " " | replace: "/", " " | replace: "-", " " | replace: "_", " " | replace: "&", " " | replace: "?", " " | replace: "!", " " | replace: "=", " " | replace: "+", " " | replace: "*", " " | replace: "'", " " | replace: '"', " " | replace: "\\", " " | replace: "<", " " | replace: ">", " " | replace: "#", " " -%} |
85 | | - {%- comment -%} Body dedup is seeded with title_wrapped so we never |
86 | | - double-count tokens that appeared in the title. {%- endcomment -%} |
87 | | - {%- assign b_seen = title_wrapped -%} |
88 | | - {%- assign body_raw = body_text | split: " " -%} |
89 | | - {%- for tok in body_raw -%} |
90 | | - {%- assign t = tok | strip -%} |
91 | | - {%- if t.size < 4 or t.size > 24 -%}{%- continue -%}{%- endif -%} |
92 | | - {%- if STOP contains t -%}{%- continue -%}{%- endif -%} |
93 | | - {%- assign first_char = t | slice: 0, 1 -%} |
94 | | - {%- if first_char == "0" or first_char == "1" or first_char == "2" or first_char == "3" or first_char == "4" or first_char == "5" or first_char == "6" or first_char == "7" or first_char == "8" or first_char == "9" -%}{%- continue -%}{%- endif -%} |
95 | | - {%- assign last2 = t | slice: -2, 2 -%} |
96 | | - {%- if t.size > 4 and last2 != "ss" -%} |
97 | | - {%- assign last1 = t | slice: -1, 1 -%} |
98 | | - {%- if last1 == "s" -%} |
99 | | - {%- assign sz_m = t.size | minus: 1 -%} |
100 | | - {%- assign t = t | slice: 0, sz_m -%} |
101 | | - {%- endif -%} |
102 | | - {%- endif -%} |
103 | | - {%- assign needle = "|" | append: t | append: "|" -%} |
104 | | - {%- if b_seen contains needle -%}{%- continue -%}{%- endif -%} |
105 | | - {%- assign b_seen = b_seen | append: t | append: "|" -%} |
106 | | - {%- assign body_wrapped = body_wrapped | append: t | append: "|" -%} |
107 | | - {%- endfor -%} |
| 53 | + |
| 54 | + {%- comment -%} Parent-as-page nav entry (e.g. Robotics Project Guide → master-guide):
| 55 | + treat the cat itself as an article only when its URL has a page slug after the
| 56 | + category segment (/wiki/foo/bar/). A bare category landing (/wiki/foo/) is skipped,
| 57 | + because it resolves to an auto-generated index page whose generic title pollutes the recommender. {%- endcomment -%} |
| 58 | + {%- if cat.url -%} |
| 59 | + {%- assign cat_norm_url = cat.url | append: "/" | replace: "//", "/" -%} |
| 60 | + {%- assign cat_suffix = cat_norm_url | remove_first: "/wiki/" | replace: "/", " " | strip -%} |
| 61 | + {%- if cat_suffix contains " " -%} |
| 62 | + {%- assign cat_page = site.pages | where: "url", cat_norm_url | first -%} |
| 63 | + {%- if cat_page and cat_page.content -%} |
| 64 | + {%- include see-also-tokenize.html article=cat cat_title=cat.title -%} |
108 | 65 | {%- endif -%} |
| 66 | + {%- endif -%} |
| 67 | + {%- endif -%} |
109 | 68 |
|
110 | | - {%- assign blob = blob | append: child.url | append: "@@@" | append: child.title | append: "@@@" | append: cat.title | append: "@@@" | append: title_wrapped | append: "@@@" | append: body_wrapped | append: "###" -%} |
| 69 | + {%- if cat.children -%} |
| 70 | + {%- for child in cat.children -%} |
| 71 | + {%- include see-also-tokenize.html article=child cat_title=cat.title -%} |
111 | 72 | {%- endfor -%} |
112 | 73 | {%- endif -%} |
113 | 74 | {%- endfor -%} |
|
119 | 80 | is a Liquid-friendly stand-in for IDF weighting (no log() in Liquid). The |
120 | 81 | Lucene MoreLikeThis paper and the BM25 reproducibility study both find |
121 | 82 | binned IDF nearly indistinguishable from continuous IDF in practice. |
122 | | - Title tokens skip this stage — they get a uniform TITLE_WEIGHT in scoring. |
| 83 | + Title tokens are not iterated here — they get a uniform TITLE_WEIGHT in |
| 84 | + scoring. Note that tok_freq counts |tok| occurrences across the whole blob |
| 85 | + (title + body segments), so a body token that also appears in many titles |
| 86 | + has those title occurrences counted toward its DF bucket. Hyperparameters |
| 87 | + were tuned against this exact count, not against a body-only DF. |
123 | 88 | {%- endcomment -%} |
124 | 89 | {%- assign rare_set = "|" -%} |
125 | 90 | {%- assign medium_set = "|" -%} |
|
0 commit comments