Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
243 changes: 243 additions & 0 deletions TeXmacs/plugins/html/progs/data/html.scm
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,222 @@
;; Html
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define html-detected-limit 1000)

;; 按行分割文本
(define (html-string-split-lines s)
(let ((len (if (>= (string-length s) html-detected-limit) html-detected-limit (string-length s))))
(let loop ((i 0)
(start 0)
(result '()))
(cond ((>= i len)
(reverse (cons (substring s start i) result)))
((char=? (string-ref s i) #\newline)
(loop (+ i 1)
(+ i 1)
(cons (substring s start i) result)))
(else (loop (+ i 1) start result))))))

;; 某个字符在文本中的含量
(define (character-from-string s ch)
(if (not (string-null? s))
(let* ((len (string-length s))
(limit (if (>= len html-detected-limit) html-detected-limit len)))
(let loop ((ref 0)
(count 0))
(if (>= ref limit)
(/ count len)
Copy link

Copilot AI Jan 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Logic error in density calculation. The function counts characters in the first 1000 characters (limit) but divides by the total string length (len). This produces incorrect density values for strings longer than 1000 characters. The division should use 'limit' instead of 'len' to accurately represent the density of the sampled portion.

Suggested change
(/ count len)
(/ count limit)

Copilot uses AI. Check for mistakes.
(loop (+ ref 1)
(if (char=? (string-ref s ref) ch)
(+ count 1)
count)))))
#f))

;; 计算一个子串在文本中的含量,计算的是子串的字符数,而不是个数
(define (html-string-count-substring s sub)
(let ((sub-len (string-length sub)))
(if (zero? sub-len)
0
(let loop ((i 0)
(count 0))
(if (>= i (- (string-length s) sub-len -1))
count
(if (string=? (substring s i (+ i sub-len)) sub)
(loop (+ i sub-len) (+ count 1))
(loop (+ i 1) count)))))))

;; < 和 > 的含量
(define (html-angle-bracket-density s)
(if (string-null? s)
0
(let* ((len (string-length s))
(limit (if (>= len html-detected-limit) html-detected-limit len))
(substr (substring s 0 limit)))
(/ (+ (character-from-string substr #\<)
(character-from-string substr #\>))
len))))

;; 完整的tag子串在文本中的字符含量
(define (html-tag-density s)
(if (string-null? s)
0
(let* ((len (string-length s))
(limit (if (>= len html-detected-limit) html-detected-limit len))
(substr (substring s 0 limit))
(lc-substr (string-downcase substr)))
(let ((count (+ (html-string-count-substring lc-substr "<div")
(html-string-count-substring lc-substr "<span")
(html-string-count-substring lc-substr "<p")
(html-string-count-substring lc-substr "<a")
(html-string-count-substring lc-substr "<img")
(html-string-count-substring lc-substr "<ul")
(html-string-count-substring lc-substr "<ol")
(html-string-count-substring lc-substr "<li")
(html-string-count-substring lc-substr "<table")
(html-string-count-substring lc-substr "<tr")
(html-string-count-substring lc-substr "<td")
(html-string-count-substring lc-substr "<th")
(html-string-count-substring lc-substr "<h1")
(html-string-count-substring lc-substr "<h2")
(html-string-count-substring lc-substr "<h3")
(html-string-count-substring lc-substr "<h4")
(html-string-count-substring lc-substr "<h5")
(html-string-count-substring lc-substr "<h6")
(html-string-count-substring lc-substr "<form")
(html-string-count-substring lc-substr "<input")
(html-string-count-substring lc-substr "<button")
(html-string-count-substring lc-substr "<textarea")
(html-string-count-substring lc-substr "<select")
(html-string-count-substring lc-substr "<option")
(html-string-count-substring lc-substr "<style")
(html-string-count-substring lc-substr "<script")
(html-string-count-substring lc-substr "<meta")
(html-string-count-substring lc-substr "<link")
(html-string-count-substring lc-substr "</div")
(html-string-count-substring lc-substr "</ul")
(html-string-count-substring lc-substr "</ol")
(html-string-count-substring lc-substr "</table")
(html-string-count-substring lc-substr "</tr")
(html-string-count-substring lc-substr "</form")
(html-string-count-substring lc-substr "</style")
(html-string-count-substring lc-substr "</script"))))
(/ count len)))))
Comment on lines +83 to +119
Copy link

Copilot AI Jan 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Performance concern: This function performs multiple linear scans of the same string, calling html-string-count-substring 34 times. Each call scans the entire substring. For better performance, consider combining these checks into a single pass through the string, using a state machine or regex pattern matching to identify all tag types in one scan.

Suggested change
(let ((count (+ (html-string-count-substring lc-substr "<div")
(html-string-count-substring lc-substr "<span")
(html-string-count-substring lc-substr "<p")
(html-string-count-substring lc-substr "<a")
(html-string-count-substring lc-substr "<img")
(html-string-count-substring lc-substr "<ul")
(html-string-count-substring lc-substr "<ol")
(html-string-count-substring lc-substr "<li")
(html-string-count-substring lc-substr "<table")
(html-string-count-substring lc-substr "<tr")
(html-string-count-substring lc-substr "<td")
(html-string-count-substring lc-substr "<th")
(html-string-count-substring lc-substr "<h1")
(html-string-count-substring lc-substr "<h2")
(html-string-count-substring lc-substr "<h3")
(html-string-count-substring lc-substr "<h4")
(html-string-count-substring lc-substr "<h5")
(html-string-count-substring lc-substr "<h6")
(html-string-count-substring lc-substr "<form")
(html-string-count-substring lc-substr "<input")
(html-string-count-substring lc-substr "<button")
(html-string-count-substring lc-substr "<textarea")
(html-string-count-substring lc-substr "<select")
(html-string-count-substring lc-substr "<option")
(html-string-count-substring lc-substr "<style")
(html-string-count-substring lc-substr "<script")
(html-string-count-substring lc-substr "<meta")
(html-string-count-substring lc-substr "<link")
(html-string-count-substring lc-substr "</div")
(html-string-count-substring lc-substr "</ul")
(html-string-count-substring lc-substr "</ol")
(html-string-count-substring lc-substr "</table")
(html-string-count-substring lc-substr "</tr")
(html-string-count-substring lc-substr "</form")
(html-string-count-substring lc-substr "</style")
(html-string-count-substring lc-substr "</script"))))
(/ count len)))))
(letrec* ((string-prefix-at?
(lambda (s prefix idx)
(let* ((s-len (string-length s))
(p-len (string-length prefix)))
(if (> (+ idx p-len) s-len)
#f
(let loop ((j 0))
(if (= j p-len)
#t
(if (char=? (string-ref s (+ idx j))
(string-ref prefix j))
(loop (+ j 1))
#f)))))))
(tags '("<div"
"<span"
"<p"
"<a"
"<img"
"<ul"
"<ol"
"<li"
"<table"
"<tr"
"<td"
"<th"
"<h1"
"<h2"
"<h3"
"<h4"
"<h5"
"<h6"
"<form"
"<input"
"<button"
"<textarea"
"<select"
"<option"
"<style"
"<script"
"<meta"
"<link"
"</div"
"</ul"
"</ol"
"</table"
"</tr"
"</form"
"</style"
"</script")))
(substr-len (string-length lc-substr)))
(let loop ((i 0) (count 0))
(if (>= i substr-len)
(/ count len)
(let ((new-count
(let tag-loop ((ts tags) (c count))
(if (null? ts)
c
(if (string-prefix-at? lc-substr (car ts) i)
(tag-loop (cdr ts) (+ c 1))
(tag-loop (cdr ts) c))))))
(loop (+ i 1) new-count)))))))

Copilot uses AI. Check for mistakes.

;; = 和 " 的含量
(define (html-attribute-density s)
(if (string-null? s)
0
(let* ((len (string-length s))
(limit (if (>= len html-detected-limit) html-detected-limit len))
(substr (substring s 0 limit)))
(/ (+ (character-from-string substr #\=)
(character-from-string substr #\"))
len))))

;; 这一行文本是否包含html标签
(define (html-line-contains-features? line)
(let ((lc-line (string-downcase line)))
(or
(> (html-string-count-substring lc-line "<div") 0)
(> (html-string-count-substring lc-line "<span") 0)
(> (html-string-count-substring lc-line "<p") 0)
(> (html-string-count-substring lc-line "<a") 0)
(> (html-string-count-substring lc-line "<img") 0)
(> (html-string-count-substring lc-line "<ul") 0)
(> (html-string-count-substring lc-line "<ol") 0)
(> (html-string-count-substring lc-line "<li") 0)
(> (html-string-count-substring lc-line "<table") 0)
(> (html-string-count-substring lc-line "<tr") 0)
(> (html-string-count-substring lc-line "<td") 0)
(> (html-string-count-substring lc-line "<th") 0)
(> (html-string-count-substring lc-line "<h1") 0)
(> (html-string-count-substring lc-line "<h2") 0)
(> (html-string-count-substring lc-line "<h3") 0)
(> (html-string-count-substring lc-line "<h4") 0)
(> (html-string-count-substring lc-line "<h5") 0)
(> (html-string-count-substring lc-line "<h6") 0)
(> (html-string-count-substring lc-line "</div") 0)
(> (html-string-count-substring lc-line "</span") 0)
(> (html-string-count-substring lc-line "</p") 0)
(> (html-string-count-substring lc-line "</a") 0)
(> (html-string-count-substring lc-line "/>") 0)
(> (html-string-count-substring lc-line "<!doctype") 0)
(> (html-string-count-substring lc-line "<?xml") 0))))
Comment on lines +132 to +160
Copy link

Copilot AI Jan 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Performance concern: This function performs multiple linear scans of the same string, calling html-string-count-substring 25 times. Each call scans the entire line. For better performance, consider combining these checks into a single pass through the line, or using a more efficient pattern matching approach.

Suggested change
;; 这一行文本是否包含html标签
(define (html-line-contains-features? line)
(let ((lc-line (string-downcase line)))
(or
(> (html-string-count-substring lc-line "<div") 0)
(> (html-string-count-substring lc-line "<span") 0)
(> (html-string-count-substring lc-line "<p") 0)
(> (html-string-count-substring lc-line "<a") 0)
(> (html-string-count-substring lc-line "<img") 0)
(> (html-string-count-substring lc-line "<ul") 0)
(> (html-string-count-substring lc-line "<ol") 0)
(> (html-string-count-substring lc-line "<li") 0)
(> (html-string-count-substring lc-line "<table") 0)
(> (html-string-count-substring lc-line "<tr") 0)
(> (html-string-count-substring lc-line "<td") 0)
(> (html-string-count-substring lc-line "<th") 0)
(> (html-string-count-substring lc-line "<h1") 0)
(> (html-string-count-substring lc-line "<h2") 0)
(> (html-string-count-substring lc-line "<h3") 0)
(> (html-string-count-substring lc-line "<h4") 0)
(> (html-string-count-substring lc-line "<h5") 0)
(> (html-string-count-substring lc-line "<h6") 0)
(> (html-string-count-substring lc-line "</div") 0)
(> (html-string-count-substring lc-line "</span") 0)
(> (html-string-count-substring lc-line "</p") 0)
(> (html-string-count-substring lc-line "</a") 0)
(> (html-string-count-substring lc-line "/>") 0)
(> (html-string-count-substring lc-line "<!doctype") 0)
(> (html-string-count-substring lc-line "<?xml") 0))))
;; Helper: check whether STR has PREFIX starting at position POS
(define (string-prefix-at? str prefix pos)
(let* ((len-str (string-length str))
(len-pre (string-length prefix))
(end (+ pos len-pre)))
(and (<= end len-str)
(string=? (substring str pos end) prefix))))
;; 这一行文本是否包含html标签
(define (html-line-contains-features? line)
(let* ((lc-line (string-downcase line))
(len (string-length lc-line)))
(let loop ((i 0))
(cond
((>= i len) #f)
(else
(let ((c (string-ref lc-line i)))
(cond
;; Check for patterns starting with '<'
((char=? c #\<)
(if (or (string-prefix-at? lc-line "<div" i)
(string-prefix-at? lc-line "<span" i)
(string-prefix-at? lc-line "<p" i)
(string-prefix-at? lc-line "<a" i)
(string-prefix-at? lc-line "<img" i)
(string-prefix-at? lc-line "<ul" i)
(string-prefix-at? lc-line "<ol" i)
(string-prefix-at? lc-line "<li" i)
(string-prefix-at? lc-line "<table" i)
(string-prefix-at? lc-line "<tr" i)
(string-prefix-at? lc-line "<td" i)
(string-prefix-at? lc-line "<th" i)
(string-prefix-at? lc-line "<h1" i)
(string-prefix-at? lc-line "<h2" i)
(string-prefix-at? lc-line "<h3" i)
(string-prefix-at? lc-line "<h4" i)
(string-prefix-at? lc-line "<h5" i)
(string-prefix-at? lc-line "<h6" i)
(string-prefix-at? lc-line "</div" i)
(string-prefix-at? lc-line "</span" i)
(string-prefix-at? lc-line "</p" i)
(string-prefix-at? lc-line "</a" i)
(string-prefix-at? lc-line "<!doctype" i)
(string-prefix-at? lc-line "<?xml" i))
#t
(loop (+ i 1))))
;; Preserve detection of "/>" anywhere in the line
((and (char=? c #\/)
(< (+ i 1) len)
(char=? (string-ref lc-line (+ i 1)) #\>))
#t)
(else
(loop (+ i 1)))))))))

Copilot uses AI. Check for mistakes.

;; 计算存在html特征的行的含量
(define (html-feature-line-density s)
(let ((lines (html-string-split-lines s)))
(if (null? lines)
0
(let loop ((remaining lines)
(count 0)
(total 0))
(if (null? remaining)
(if (> total 0) (/ count total) 0)
(let ((line (car remaining)))
(loop (cdr remaining)
(if (html-line-contains-features? line) (+ count 1) count)
(+ total 1))))))))

;; 计算div标签的平衡性
(define (html-structure-balanced? s)
(let* ((lc-s (string-downcase s))
(open-tags (html-string-count-substring lc-s "<div"))
(close-tags (html-string-count-substring lc-s "</div")))
;; div 的开标签与闭标签数量差小于2
(and (> open-tags 0) (> close-tags 0) (<= (abs (- open-tags close-tags)) 2))))

;; 短字符串的特殊检测
(define (determine-short-html-string s)
(let* ((len (string-length s)))
(cond
((or
(and (> (character-from-string s #\<) 0)
(> (character-from-string s #\>) 0)
(> (html-string-count-substring s "</") 0))
(> (html-string-count-substring (string-downcase s) "class=") 0)
(> (html-string-count-substring (string-downcase s) "id=") 0)
(> (html-string-count-substring (string-downcase s) "style=") 0)
(> (html-string-count-substring (string-downcase s) "href=") 0)
(> (html-string-count-substring (string-downcase s) "src=") 0))
#t)
((>= (html-angle-bracket-density s) 0.03) #t)
(else #f))))

(define (is-short-html-string? s)
(if (<= (string-length s) 100)
(determine-short-html-string s)
#f))

(define (is-html-string? s)
Copy link

Copilot AI Jan 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Inconsistent indentation. This line has a leading space before the opening parenthesis, while all other function definitions in the file start at column 1. Remove the leading space for consistency.

Suggested change
(define (is-html-string? s)
(define (is-html-string? s)

Copilot uses AI. Check for mistakes.
(let* ((angle-density (html-angle-bracket-density s))
(tag-density (html-tag-density s))
(attr-density (html-attribute-density s))
(feature-line-density (html-feature-line-density s))
(balanced? (html-structure-balanced? s)))
(cond
;; High confidence: clear HTML structure
;; < > 含量,标签含量,特征行含量
((and (>= angle-density 0.02)
(>= tag-density 0.01)
(>= feature-line-density 0.25))
#t)
;; Medium confidence: good angle bracket density with either tags or attributes
;;
((and (>= angle-density 0.015)
(or (>= tag-density 0.005)
(>= attr-density 0.01))
(>= feature-line-density 0.15))
#t)
;; Lower confidence: balanced structure with some HTML features
((and balanced?
(>= angle-density 0.01)
(>= feature-line-density 0.10))
#t)
;; Very high angle bracket density (likely HTML/XML)
((>= angle-density 0.03) #t)
(else #f))))

(define (html-recognizes-at? s pos)
(set! pos (format-skip-spaces s pos))
(cond ((format-test? s pos "<html") #t)
Expand All @@ -26,10 +242,37 @@
((format-test? s pos "<!doctype html") #t)
((format-test? s pos "<math") #t)
((format-test? s pos "<table") #t)
((format-test? s pos "<p>") #t)
((format-test? s pos "<div") #t)
((format-test? s pos "<span") #t)
((format-test? s pos "<a ") #t)
((format-test? s pos "<img") #t)
((format-test? s pos "<ul") #t)
((format-test? s pos "<ol") #t)
((format-test? s pos "<li") #t)
((format-test? s pos "<h1") #t)
((format-test? s pos "<h2") #t)
((format-test? s pos "<h3") #t)
((format-test? s pos "<h4") #t)
((format-test? s pos "<h5") #t)
((format-test? s pos "<h6") #t)
((format-test? s pos "<form") #t)
((format-test? s pos "<input") #t)
((format-test? s pos "<button") #t)
((format-test? s pos "<textarea") #t)
((format-test? s pos "<select") #t)
((format-test? s pos "<option") #t)
((format-test? s pos "<style") #t)
((format-test? s pos "<script") #t)
((format-test? s pos "<meta") #t)
((format-test? s pos "<link") #t)
((format-test? s pos "<!--") #t)
((format-test? s pos "<?xml ")
(html-recognizes-at? s (format-skip-line s pos)))
((format-test? s pos "<!doctype ")
(html-recognizes-at? s (format-skip-line s pos)))
((is-short-html-string? s) #t)
((is-html-string? s) #t)
(else #f)))

(define (html-recognizes? s)
Expand Down
Loading
Loading