Skip to content

Commit 86daddd

Browse files
committed
Fix see-also: URL normalization, parent-as-page indexing, DF comment
1 parent e4d70a2 commit 86daddd

2 files changed

Lines changed: 92 additions & 60 deletions

File tree

_includes/see-also-tokenize.html

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
{%- comment -%}
2+
Per-article tokenization helper for assets/see-also.html.
3+
Parameters:
4+
include.article — hash with .title and .url (a child or parent-as-page cat).
5+
include.cat_title — string, the parent category's title.
6+
Side effect: appends one `url@@@title@@@cat@@@|title_toks|@@@|body_toks|###`
7+
record to the outer `blob` variable. Reads STOP and BODY_LEAD_CHARS from the caller's scope; the page lookup is kept in `article_page` (never `page`) so Jekyll's global `page` is not shadowed for the rest of the render.
8+
{%- endcomment -%}
9+
{%- assign norm_url = include.article.url | append: "/" | replace: "//", "/" -%}
10+
{%- assign article_page = site.pages | where: "url", norm_url | first -%}
11+
12+
{%- assign title_text = include.article.title | downcase -%}
13+
{%- assign title_text = title_text | replace: ".", " " | replace: ",", " " | replace: ";", " " | replace: ":", " " | replace: "(", " " | replace: ")", " " | replace: "[", " " | replace: "]", " " | replace: "{", " " | replace: "}", " " | replace: "/", " " | replace: "-", " " | replace: "_", " " | replace: "&", " " | replace: "?", " " | replace: "!", " " | replace: "=", " " | replace: "+", " " | replace: "*", " " | replace: "'", " " | replace: '"', " " | replace: "\\", " " | replace: "<", " " | replace: ">", " " | replace: "#", " " -%}
14+
{%- assign title_wrapped = "|" -%}
15+
{%- assign title_raw = title_text | split: " " -%}
16+
{%- for tok in title_raw -%}
17+
{%- assign t = tok | strip -%}
18+
{%- if t.size < 3 or t.size > 24 -%}{%- continue -%}{%- endif -%}
19+
{%- if STOP contains t -%}{%- continue -%}{%- endif -%}
20+
{%- assign first_char = t | slice: 0, 1 -%}
21+
{%- if first_char == "0" or first_char == "1" or first_char == "2" or first_char == "3" or first_char == "4" or first_char == "5" or first_char == "6" or first_char == "7" or first_char == "8" or first_char == "9" -%}{%- continue -%}{%- endif -%}
22+
{%- comment -%} Light stemming: strip trailing 's' to unify plurals
23+
(apriltags->apriltag, sensors->sensor). Skip if "ss" (address) or
24+
short (toss). Naive but effective on technical English. {%- endcomment -%}
25+
{%- assign last2 = t | slice: -2, 2 -%}
26+
{%- if t.size > 4 and last2 != "ss" -%}
27+
{%- assign last1 = t | slice: -1, 1 -%}
28+
{%- if last1 == "s" -%}
29+
{%- assign sz_m = t.size | minus: 1 -%}
30+
{%- assign t = t | slice: 0, sz_m -%}
31+
{%- endif -%}
32+
{%- endif -%}
33+
{%- assign needle = "|" | append: t | append: "|" -%}
34+
{%- if title_wrapped contains needle -%}{%- continue -%}{%- endif -%}
35+
{%- assign title_wrapped = title_wrapped | append: t | append: "|" -%}
36+
{%- endfor -%}
37+
38+
{%- assign body_wrapped = "|" -%}
39+
{%- if article_page and article_page.content -%}
40+
{%- assign body_text = article_page.content | strip_html | strip_newlines | truncate: BODY_LEAD_CHARS, "" | downcase -%}
41+
{%- assign body_text = body_text | replace: ".", " " | replace: ",", " " | replace: ";", " " | replace: ":", " " | replace: "(", " " | replace: ")", " " | replace: "[", " " | replace: "]", " " | replace: "{", " " | replace: "}", " " | replace: "/", " " | replace: "-", " " | replace: "_", " " | replace: "&", " " | replace: "?", " " | replace: "!", " " | replace: "=", " " | replace: "+", " " | replace: "*", " " | replace: "'", " " | replace: '"', " " | replace: "\\", " " | replace: "<", " " | replace: ">", " " | replace: "#", " " -%}
42+
{%- comment -%} Body dedup is seeded with title_wrapped so we never
43+
double-count tokens that appeared in the title. {%- endcomment -%}
44+
{%- assign b_seen = title_wrapped -%}
45+
{%- assign body_raw = body_text | split: " " -%}
46+
{%- for tok in body_raw -%}
47+
{%- assign t = tok | strip -%}
48+
{%- if t.size < 4 or t.size > 24 -%}{%- continue -%}{%- endif -%}
49+
{%- if STOP contains t -%}{%- continue -%}{%- endif -%}
50+
{%- assign first_char = t | slice: 0, 1 -%}
51+
{%- if first_char == "0" or first_char == "1" or first_char == "2" or first_char == "3" or first_char == "4" or first_char == "5" or first_char == "6" or first_char == "7" or first_char == "8" or first_char == "9" -%}{%- continue -%}{%- endif -%}
52+
{%- assign last2 = t | slice: -2, 2 -%}
53+
{%- if t.size > 4 and last2 != "ss" -%}
54+
{%- assign last1 = t | slice: -1, 1 -%}
55+
{%- if last1 == "s" -%}
56+
{%- assign sz_m = t.size | minus: 1 -%}
57+
{%- assign t = t | slice: 0, sz_m -%}
58+
{%- endif -%}
59+
{%- endif -%}
60+
{%- assign needle = "|" | append: t | append: "|" -%}
61+
{%- if b_seen contains needle -%}{%- continue -%}{%- endif -%}
62+
{%- assign b_seen = b_seen | append: t | append: "|" -%}
63+
{%- assign body_wrapped = body_wrapped | append: t | append: "|" -%}
64+
{%- endfor -%}
65+
{%- endif -%}
66+
67+
{%- assign blob = blob | append: norm_url | append: "@@@" | append: include.article.title | append: "@@@" | append: include.cat_title | append: "@@@" | append: title_wrapped | append: "@@@" | append: body_wrapped | append: "###" -%}

assets/see-also.html

Lines changed: 25 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -43,71 +43,32 @@
4343
Phase 1: tokenize title + body separately for every article in the wiki.
4444
Per-article record: url@@@title@@@cat@@@|title_toks|@@@|body_toks|
4545
Article separator: ###
46+
Per-article tokenization is factored into _includes/see-also-tokenize.html
47+
so the same logic runs for both regular `cat.children` entries and the rare
48+
parent-as-page cat (e.g. "Robotics Project Guide" → master-guide.md).
4649
============================================================ {%- endcomment -%}
4750
{%- assign blob = "" -%}
4851
{%- for cat in site.data.navigation.wiki -%}
4952
{%- if cat.title == "Overview" -%}{%- continue -%}{%- endif -%}
50-
{%- if cat.children -%}
51-
{%- for child in cat.children -%}
52-
{%- assign norm_url = child.url | append: "/" | replace: "//", "/" -%}
53-
{%- assign page = site.pages | where: "url", norm_url | first -%}
54-
55-
{%- assign title_text = child.title | downcase -%}
56-
{%- assign title_text = title_text | replace: ".", " " | replace: ",", " " | replace: ";", " " | replace: ":", " " | replace: "(", " " | replace: ")", " " | replace: "[", " " | replace: "]", " " | replace: "{", " " | replace: "}", " " | replace: "/", " " | replace: "-", " " | replace: "_", " " | replace: "&", " " | replace: "?", " " | replace: "!", " " | replace: "=", " " | replace: "+", " " | replace: "*", " " | replace: "'", " " | replace: '"', " " | replace: "\\", " " | replace: "<", " " | replace: ">", " " | replace: "#", " " -%}
57-
{%- assign title_wrapped = "|" -%}
58-
{%- assign title_raw = title_text | split: " " -%}
59-
{%- for tok in title_raw -%}
60-
{%- assign t = tok | strip -%}
61-
{%- if t.size < 3 or t.size > 24 -%}{%- continue -%}{%- endif -%}
62-
{%- if STOP contains t -%}{%- continue -%}{%- endif -%}
63-
{%- assign first_char = t | slice: 0, 1 -%}
64-
{%- if first_char == "0" or first_char == "1" or first_char == "2" or first_char == "3" or first_char == "4" or first_char == "5" or first_char == "6" or first_char == "7" or first_char == "8" or first_char == "9" -%}{%- continue -%}{%- endif -%}
65-
{%- comment -%} Light stemming: strip trailing 's' to unify plurals
66-
(apriltags->apriltag, sensors->sensor). Skip if "ss" (address) or
67-
short (toss). Naive but effective on technical English. {%- endcomment -%}
68-
{%- assign last2 = t | slice: -2, 2 -%}
69-
{%- if t.size > 4 and last2 != "ss" -%}
70-
{%- assign last1 = t | slice: -1, 1 -%}
71-
{%- if last1 == "s" -%}
72-
{%- assign sz_m = t.size | minus: 1 -%}
73-
{%- assign t = t | slice: 0, sz_m -%}
74-
{%- endif -%}
75-
{%- endif -%}
76-
{%- assign needle = "|" | append: t | append: "|" -%}
77-
{%- if title_wrapped contains needle -%}{%- continue -%}{%- endif -%}
78-
{%- assign title_wrapped = title_wrapped | append: t | append: "|" -%}
79-
{%- endfor -%}
80-
81-
{%- assign body_wrapped = "|" -%}
82-
{%- if page and page.content -%}
83-
{%- assign body_text = page.content | strip_html | strip_newlines | truncate: BODY_LEAD_CHARS, "" | downcase -%}
84-
{%- assign body_text = body_text | replace: ".", " " | replace: ",", " " | replace: ";", " " | replace: ":", " " | replace: "(", " " | replace: ")", " " | replace: "[", " " | replace: "]", " " | replace: "{", " " | replace: "}", " " | replace: "/", " " | replace: "-", " " | replace: "_", " " | replace: "&", " " | replace: "?", " " | replace: "!", " " | replace: "=", " " | replace: "+", " " | replace: "*", " " | replace: "'", " " | replace: '"', " " | replace: "\\", " " | replace: "<", " " | replace: ">", " " | replace: "#", " " -%}
85-
{%- comment -%} Body dedup is seeded with title_wrapped so we never
86-
double-count tokens that appeared in the title. {%- endcomment -%}
87-
{%- assign b_seen = title_wrapped -%}
88-
{%- assign body_raw = body_text | split: " " -%}
89-
{%- for tok in body_raw -%}
90-
{%- assign t = tok | strip -%}
91-
{%- if t.size < 4 or t.size > 24 -%}{%- continue -%}{%- endif -%}
92-
{%- if STOP contains t -%}{%- continue -%}{%- endif -%}
93-
{%- assign first_char = t | slice: 0, 1 -%}
94-
{%- if first_char == "0" or first_char == "1" or first_char == "2" or first_char == "3" or first_char == "4" or first_char == "5" or first_char == "6" or first_char == "7" or first_char == "8" or first_char == "9" -%}{%- continue -%}{%- endif -%}
95-
{%- assign last2 = t | slice: -2, 2 -%}
96-
{%- if t.size > 4 and last2 != "ss" -%}
97-
{%- assign last1 = t | slice: -1, 1 -%}
98-
{%- if last1 == "s" -%}
99-
{%- assign sz_m = t.size | minus: 1 -%}
100-
{%- assign t = t | slice: 0, sz_m -%}
101-
{%- endif -%}
102-
{%- endif -%}
103-
{%- assign needle = "|" | append: t | append: "|" -%}
104-
{%- if b_seen contains needle -%}{%- continue -%}{%- endif -%}
105-
{%- assign b_seen = b_seen | append: t | append: "|" -%}
106-
{%- assign body_wrapped = body_wrapped | append: t | append: "|" -%}
107-
{%- endfor -%}
53+
54+
{%- comment -%} Parent-as-page nav entry (e.g. Robotics Project Guide → master-guide):
55+
include the cat as an article only when its URL has a slug after the category
56+
(/wiki/foo/bar/), not a bare category landing (/wiki/foo/) — those resolve to
57+
auto-generated index pages with generic titles that pollute the recommender. {%- endcomment -%}
58+
{%- if cat.url -%}
59+
{%- assign cat_norm_url = cat.url | append: "/" | replace: "//", "/" -%}
60+
{%- assign cat_suffix = cat_norm_url | remove_first: "/wiki/" | replace: "/", " " | strip -%}
61+
{%- if cat_suffix contains " " -%}
62+
{%- assign cat_page = site.pages | where: "url", cat_norm_url | first -%}
63+
{%- if cat_page and cat_page.content -%}
64+
{%- include see-also-tokenize.html article=cat cat_title=cat.title -%}
10865
{%- endif -%}
66+
{%- endif -%}
67+
{%- endif -%}
10968

110-
{%- assign blob = blob | append: child.url | append: "@@@" | append: child.title | append: "@@@" | append: cat.title | append: "@@@" | append: title_wrapped | append: "@@@" | append: body_wrapped | append: "###" -%}
69+
{%- if cat.children -%}
70+
{%- for child in cat.children -%}
71+
{%- include see-also-tokenize.html article=child cat_title=cat.title -%}
11172
{%- endfor -%}
11273
{%- endif -%}
11374
{%- endfor -%}
@@ -119,7 +80,11 @@
11980
is a Liquid-friendly stand-in for IDF weighting (no log() in Liquid). The
12081
Lucene MoreLikeThis paper and the BM25 reproducibility study both find
12182
binned IDF nearly indistinguishable from continuous IDF in practice.
122-
Title tokens skip this stage — they get a uniform TITLE_WEIGHT in scoring.
83+
Title tokens are not iterated here — they get a uniform TITLE_WEIGHT in
84+
scoring. Note tok_freq counts |tok| occurrences across the whole blob
85+
(title + body segments), so a body token whose word also appears in many
86+
titles has those title occurrences counted toward its DF bucket. Hyperparameters were tuned
87+
against this exact count, not against a body-only DF.
12388
{%- endcomment -%}
12489
{%- assign rare_set = "|" -%}
12590
{%- assign medium_set = "|" -%}

0 commit comments

Comments
 (0)