Skip to content

Commit 602f434

Browse files
authored
Merge pull request #179 from julwrites/migrate-client-html-parsing-7338505798009705932
Refactor passage HTML parsing for Telegram compatibility
2 parents 1136d03 + a1d1f38 commit 602f434

2 files changed

Lines changed: 93 additions & 54 deletions

File tree

pkg/app/passage.go

Lines changed: 54 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"log"
99
"net/url"
1010
"strings"
11+
stdhtml "html"
1112

1213
"golang.org/x/net/html"
1314

@@ -59,20 +60,30 @@ func isNextSiblingBr(node *html.Node) bool {
5960
}
6061

6162
func ParseNodesForPassage(node *html.Node) string {
62-
var text string
6363
var parts []string
6464

6565
for child := node.FirstChild; child != nil; child = child.NextSibling {
66-
parts = append(parts, text)
66+
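// Intended to skip footnote/cross-ref containers (any element whose class contains "footnotes" or "cross-refs"). NOTE(review): the `continue` below only advances the inner attribute loop, so the child is still processed by the switch.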
67+
if child.Type == html.ElementNode {
68+
for _, attr := range child.Attr {
69+
if attr.Key == "class" {
70+
if strings.Contains(attr.Val, "footnotes") || strings.Contains(attr.Val, "cross-refs") {
71+
continue
72+
}
73+
}
74+
}
75+
}
6776

6877
switch tag := child.Data; tag {
6978
case "span":
79+
// Keep existing logic for span (likely poetry lines in legacy/scraped HTML)
7080
childText := ParseNodesForPassage(child)
7181
parts = append(parts, childText)
7282
if len(strings.TrimSpace(childText)) > 0 && !isNextSiblingBr(child) {
7383
parts = append(parts, "\n")
7484
}
7585
case "sup":
86+
// Handle superscripts (verse numbers/footnotes)
7687
isFootnote := func(node *html.Node) bool {
7788
for _, attr := range node.Attr {
7889
if attr.Key == "class" && attr.Val == "footnote" {
@@ -85,67 +96,62 @@ func ParseNodesForPassage(node *html.Node) string {
8596
break
8697
}
8798
childText := ParseNodesForPassage(child)
99+
// Use TelegramSuperscript for unicode conversion
88100
if len(childText) > 0 {
89-
parts = append(parts, fmt.Sprintf("^%s^", childText))
101+
parts = append(parts, platform.TelegramSuperscript(childText))
90102
}
91103
break
92104
case "p":
93105
parts = append(parts, ParseNodesForPassage(child))
94-
break
95-
case "b":
96-
parts = append(parts, platform.TelegramBold(ParseNodesForPassage(child)))
97-
case "i":
98-
parts = append(parts, platform.TelegramItalics(ParseNodesForPassage(child)))
99-
break
106+
parts = append(parts, "\n\n")
107+
case "b", "strong":
108+
parts = append(parts, fmt.Sprintf("<b>%s</b>", ParseNodesForPassage(child)))
109+
case "i", "em":
110+
parts = append(parts, fmt.Sprintf("<i>%s</i>", ParseNodesForPassage(child)))
111+
case "h1", "h2", "h3", "h4", "h5", "h6":
112+
// Ignore "Footnotes" or "Cross references" headers
113+
headerText := ParseNodesForPassage(child)
114+
if headerText == "Footnotes" || headerText == "Cross references" {
115+
continue
116+
}
117+
parts = append(parts, fmt.Sprintf("\n\n<b>%s</b>\n", headerText))
118+
case "ul", "ol":
119+
parts = append(parts, ParseNodesForPassage(child))
120+
case "li":
121+
parts = append(parts, fmt.Sprintf("• %s\n", ParseNodesForPassage(child)))
100122
case "br":
101123
parts = append(parts, "\n")
102-
break
124+
case "div":
125+
parts = append(parts, ParseNodesForPassage(child))
103126
default:
104-
parts = append(parts, child.Data)
127+
if child.Type == html.TextNode {
128+
parts = append(parts, stdhtml.EscapeString(child.Data))
129+
} else if child.Type == html.ElementNode {
130+
// Recurse for unknown elements to preserve content
131+
parts = append(parts, ParseNodesForPassage(child))
132+
}
105133
}
106134
}
107135

108-
text = strings.Join(parts, "")
109-
110-
if node.Data == "h1" || node.Data == "h2" || node.Data == "h3" || node.Data == "h4" {
111-
text = fmt.Sprintf("*%s*", text)
112-
}
113-
return text
136+
return strings.Join(parts, "")
114137
}
115138

116139
func GetPassage(ref string, doc *html.Node, version string) string {
117-
filtNodes := utils.FilterTree(doc, func(child *html.Node) bool {
118-
switch tag := child.Data; tag {
119-
case "h1":
120-
fallthrough
121-
case "h2":
122-
fallthrough
123-
case "h3":
124-
fallthrough
125-
case "h4":
126-
if child.FirstChild.Data == "Footnotes" || child.FirstChild.Data == "Cross references" {
127-
return false
128-
}
129-
fallthrough
130-
case "p":
131-
return true
132-
}
133-
return false
134-
})
140+
// Replaced FilterTree with direct parsing of the root node
141+
// This allows handling arbitrary structure (divs, lists) returned by the API
135142

136-
textBlocks := utils.MapNodeListToString(filtNodes, ParseNodesForPassage)
143+
text := ParseNodesForPassage(doc)
137144

138145
var passage strings.Builder
139146

140147
if len(ref) > 0 {
141-
refString := fmt.Sprintf("_%s_ (%s)", ref, version)
148+
// Use HTML formatting for reference
149+
refString := fmt.Sprintf("<i>%s</i> (%s)", ref, version)
142150
passage.WriteString(refString)
143151
}
144152

145-
for _, block := range textBlocks {
146-
passage.WriteString("\n")
147-
passage.WriteString(block)
148-
}
153+
passage.WriteString("\n")
154+
passage.WriteString(strings.TrimSpace(text))
149155

150156
return passage.String()
151157
}
@@ -158,6 +164,11 @@ func ParsePassageFromHtml(ref string, rawHtml string, version string) string {
158164
return rawHtml
159165
}
160166

167+
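// html.Parse returns a document node wrapping an html -> head/body structure.
168+
// GetPassage hands that node to ParseNodesForPassage, which walks it recursively.
169+
// The body element is not looked up explicitly, so anything under head is traversed too.
170+
// Wrapper elements (html, head, body) fall into the default case of the switch,
171+
// which recurses into their children, so the passage content is still reached.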
161172
return strings.TrimSpace(GetPassage(ref, doc, version))
162173
}
163174

@@ -181,6 +192,7 @@ func GetBiblePassageFallback(env def.SessionData) def.SessionData {
181192

182193
// Attempt to get the passage
183194
env.Res.Message = GetPassage(ref, passageNode, config.Version)
195+
env.Res.ParseMode = def.TELEGRAM_PARSE_MODE_HTML
184196

185197
return env
186198
}
@@ -224,6 +236,7 @@ func GetBiblePassage(env def.SessionData) def.SessionData {
224236

225237
if len(resp.Verse) > 0 {
226238
env.Res.Message = ParsePassageFromHtml(env.Msg.Message, resp.Verse, config.Version)
239+
env.Res.ParseMode = def.TELEGRAM_PARSE_MODE_HTML
227240
return env
228241
}
229242
}

pkg/app/passage_test.go

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,10 @@ func TestGetBiblePassage(t *testing.T) {
112112
if len(env.Res.Message) < 10 {
113113
t.Errorf("Expected passage text, got '%s'", env.Res.Message)
114114
}
115+
// Verify ParseMode is set
116+
if env.Res.ParseMode != "HTML" {
117+
t.Errorf("Expected ParseMode 'HTML', got '%s'", env.Res.ParseMode)
118+
}
115119
})
116120

117121
t.Run("Empty", func(t *testing.T) {
@@ -166,29 +170,34 @@ func TestGetBiblePassage(t *testing.T) {
166170
if !strings.Contains(env.Res.Message, "In the beginning") {
167171
t.Errorf("Expected fallback passage content, got '%s'", env.Res.Message)
168172
}
173+
// Fallback should also use HTML mode
174+
if env.Res.ParseMode != "HTML" {
175+
t.Errorf("Expected ParseMode 'HTML' in fallback, got '%s'", env.Res.ParseMode)
176+
}
169177
})
170178
}
171179

172180
func TestParsePassageFromHtml(t *testing.T) {
173181
t.Run("Valid HTML with superscript", func(t *testing.T) {
174182
html := `<p><span><sup>12 </sup>But to all who did receive him, who believed in his name, he gave the right to become children of God,</span></p>`
175-
expected := `^12 ^But to all who did receive him, who believed in his name, he gave the right to become children of God,`
183+
// Updated expectation: the verse number is rendered as Unicode superscript digits instead of ^...^ markers
184+
expected := `¹²But to all who did receive him, who believed in his name, he gave the right to become children of God,`
176185
if got := ParsePassageFromHtml("", html, ""); got != expected {
177-
t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected)
186+
t.Errorf("ParsePassageFromHtml() = %s, want %s", got, expected)
178187
}
179188
})
180189

181190
t.Run("HTML with italics", func(t *testing.T) {
182191
html := `<p><i>This is italic.</i></p>`
183-
expected := `_This is italic._`
192+
expected := `<i>This is italic.</i>`
184193
if got := ParsePassageFromHtml("", html, ""); got != expected {
185194
t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected)
186195
}
187196
})
188197

189198
t.Run("HTML with bold", func(t *testing.T) {
190199
html := `<p><b>This is bold.</b></p>`
191-
expected := `*This is bold.*`
200+
expected := `<b>This is bold.</b>`
192201
if got := ParsePassageFromHtml("", html, ""); got != expected {
193202
t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected)
194203
}
@@ -228,21 +237,38 @@ func TestParsePassageFromHtml(t *testing.T) {
228237

229238
t.Run("Nested HTML tags", func(t *testing.T) {
230239
html := `<p><b>This is bold, <i>and this is italic.</i></b></p>`
231-
expected := `*This is bold, _and this is italic._*`
240+
expected := `<b>This is bold, <i>and this is italic.</i></b>`
232241
if got := ParsePassageFromHtml("", html, ""); got != expected {
233242
t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected)
234243
}
235244
})
236245

237-
t.Run("MarkdownV2 escaping", func(t *testing.T) {
238-
// Note: We no longer escape explicitly in ParsePassageFromHtml as we rely on the platform
239-
// to handle it later (via PostTelegram).
240-
// However, returning raw characters like * might cause issues if not handled by platform.
241-
// For now, we expect them to be returned raw.
242-
html := `<p>This has special characters: *_. [hello](world)!</p>`
243-
expected := `This has special characters: *_. [hello](world)!`
246+
t.Run("Lists", func(t *testing.T) {
247+
html := `<ul><li>Item 1</li><li>Item 2</li></ul>`
248+
// Note: ParseNodesForPassage renders each <li> as "• " + text + "\n".
249+
// strings.TrimSpace then removes the trailing newline:
250+
// "• Item 1\n• Item 2\n" -> "• Item 1\n• Item 2"
251+
expected := "• Item 1\n• Item 2"
244252
if got := ParsePassageFromHtml("", html, ""); got != expected {
245-
t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected)
253+
t.Errorf("ParsePassageFromHtml() = %q, want %q", got, expected)
254+
}
255+
})
256+
257+
t.Run("Headers", func(t *testing.T) {
258+
html := `<h1>Header</h1>`
259+
// ParseNodesForPassage emits "\n\n<b>Header</b>\n" for a header element;
260+
// strings.TrimSpace reduces that to "<b>Header</b>".
261+
expected := "<b>Header</b>"
262+
if got := ParsePassageFromHtml("", html, ""); got != expected {
263+
t.Errorf("ParsePassageFromHtml() = %q, want %q", got, expected)
264+
}
265+
})
266+
267+
t.Run("Divs and escaping", func(t *testing.T) {
268+
html := `<div>Text &lt;with&gt; symbols</div>`
269+
expected := "Text &lt;with&gt; symbols"
270+
if got := ParsePassageFromHtml("", html, ""); got != expected {
271+
t.Errorf("ParsePassageFromHtml() = %q, want %q", got, expected)
246272
}
247273
})
248274
}

0 commit comments

Comments
 (0)