From 43aa17f03cacd8ef68745f9d369e6ac1c000131e Mon Sep 17 00:00:00 2001 From: Jody Ivan Lumbantoruan Date: Sat, 21 Mar 2026 03:47:01 +0700 Subject: [PATCH 1/3] chore(lab): search proptyping --- .gitignore | 6 +- cmd/lab/search/main.go | 401 ++++++++++++++++++++++++++++++++++ cmd/lab/search/query/main.go | 70 ++++++ cmd/lab/search/usage/usage.go | 209 ++++++++++++++++++ files/index/.gitkeep | 1 + go.mod | 36 ++- go.sum | 76 ++++++- 7 files changed, 787 insertions(+), 12 deletions(-) create mode 100644 cmd/lab/search/main.go create mode 100644 cmd/lab/search/query/main.go create mode 100644 cmd/lab/search/usage/usage.go create mode 100644 files/index/.gitkeep diff --git a/.gitignore b/.gitignore index 69faa4d..2b71101 100644 --- a/.gitignore +++ b/.gitignore @@ -40,4 +40,8 @@ mscz/ # db database *.db -*.zip \ No newline at end of file +*.zip + + +# Output of bleve indexing +**/*.bleve/ \ No newline at end of file diff --git a/cmd/lab/search/main.go b/cmd/lab/search/main.go new file mode 100644 index 0000000..c22950d --- /dev/null +++ b/cmd/lab/search/main.go @@ -0,0 +1,401 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "log" + "regexp" + "strconv" + "strings" + "unicode" + + "github.com/blevesearch/bleve/v2" + "github.com/jodi-ivan/numbered-notation-xml/cmd/lab/search/usage" + "github.com/jodi-ivan/numbered-notation-xml/internal/lyric" + "github.com/jodi-ivan/numbered-notation-xml/internal/musicxml" + "github.com/jodi-ivan/numbered-notation-xml/svc/repository" + "github.com/jodi-ivan/numbered-notation-xml/utils/storage" +) + +var originalTitleExp *regexp.Regexp + +func init() { + if originalTitleExp == nil { + originalTitleExp = regexp.MustCompile(".*") + } +} + +type VerseDoc struct { + No int `json:"no"` + Content string `json:"content"` + Start int `json:"start"` + End int `json:"end"` +} +type Document struct { + ID string `json:"id"` + Title string `json:"title"` + Content string `json:"content"` + Catergory []usage.Entry 
`json:"categories,omitempty"` + HymnNo int `json:"hymn_no"` + Variant string `json:"variant,omitempty"` + Verses []VerseDoc `json:"verses"` + OriginalTitle []string `json:"original_title"` + BE int `json:"be_num,omitempty"` + NR int `json:"nr_num,omitempty"` + ForKids bool `json:"for_kids"` + Copyright string `json:"copyright,omitempty"` + MusicCredit string `json:"music"` +} + +var categories = map[string]map[string][2]int{ + "Menghadap Allah": map[string][2]int{ + "Puji-pujian dan Pembukaan Ibadah": [2]int{1, 22}, + "Pengakuan dan Pengampunan Dosa": [2]int{23, 41}, + "Kyrie dan Gloria": [2]int{42, 48}, + }, + "Pelayanan Firman": map[string][2]int{ + "Pembacaan Alkitab": [2]int{49, 59}, + "Penciptaan dan Pemeliharaan": [2]int{60, 69}, + "Perjanjian Lama": [2]int{70, 75}, + "Penantian Mesias dan Masa Adven": [2]int{76, 91}, + "Kelahiran Yesus dan Masa Natal": [2]int{92, 127}, + "Akhir Masa Natal dan Epifania": [2]int{128, 143}, + "Kisah Pelayanan Yesus": [2]int{144, 154}, + "Masa Prapaskah": [2]int{155, 163}, + "Sengsara Yesus dan Jumat Agung": [2]int{164, 186}, + "Kebangkitan Yesus dan Masa Paskah": [2]int{187, 217}, + "Hari Kenaikan": [2]int{218, 227}, + "Roh Kudus dan Hari Pentakosta": [2]int{228, 241}, + "Allah Tritunggal dan Hari Trinitatis": [2]int{242, 246}, + "Gereja dan Kerajaan Allah": [2]int{247, 261}, + "Kehidupan Sorgawi": [2]int{262, 271}, + "Akhir Zaman dan Penggenapan": [2]int{272, 279}, + }, + "Respons Terhadap Pelayanan Firman": map[string][2]int{ + "Pernyataan Keyakinan Iman": [2]int{280, 285}, + "Pengucapan Syukur dan Persembahan": [2]int{286, 303}, + }, + "Pelayanan Khusus": map[string][2]int{ + "Baptisan Kudus dan Peneguhan Sidi": [2]int{304, 309}, + "Perjamuan Kudus": [2]int{310, 315}, + "Pernikahan": [2]int{316, 318}, + "Peristiwa Istimewa Gerejawi": [2]int{319, 320}, + }, + "Waktu dan Musim": map[string][2]int{ + "Pagi dan Siang": [2]int{321, 323}, + "Petang dan Malam": [2]int{324, 329}, // starts at 324: 323 already belongs to "Pagi dan Siang", and GetCategory ranges over a map so overlapping ranges resolve randomly + "Pergantian Tahun": [2]int{330, 332}, + "Musim dan
Panen": [2]int{333, 335}, + "Bangsa dan Negara": [2]int{336, 337}, + }, + "Penutupan Ibadah": map[string][2]int{ + "Pengutusan": [2]int{338, 344}, + "Berkat": [2]int{345, 350}, + }, + "Hidup Beriman Sehari-hari": map[string][2]int{ + "Panggilan Juruselamat": [2]int{351, 360}, + "Penyerahan Diri": [2]int{361, 376}, + "Kebesaran Rahmat Tuhan": [2]int{377, 390}, + "Sukacita dalam Tuhan": [2]int{391, 399}, + "Hidup Bersama Tuhan": [2]int{400, 405}, + "Tuntunan Tuhan": [2]int{406, 421}, + "Tanggung Jawab Pengikut Kristus": [2]int{422, 437}, + "Kemenangan dalam Perjuangan": [2]int{438, 446}, + "Keluarga dan Persekutuan": [2]int{447, 451}, + "Doa dan Setiap Waktu": [2]int{452, 471}, + }, + "Haleluya, Amin dan Lain-lain": map[string][2]int{ + "": [2]int{472, 478}, + }, +} + +func GetCategory(num int) (string, string) { + + for cat, subs := range categories { + for subcat, ranges := range subs { + if num >= ranges[0] && num <= ranges[1] { + return cat, subcat + } + } + } + + return "", "" + +} + +func GetOriginalTitle(metadata *repository.HymnMetadata) []string { + if strings.Contains(metadata.Lyric, "") { + submatch := originalTitleExp.FindStringSubmatch(metadata.Lyric) + if len(submatch) > 0 && len(submatch[0]) > 6 { + stripped := submatch[0][3 : len(submatch[0])-4] + stripped = strings.TrimSuffix(stripped, ",") // strings are immutable: keep the trimmed result, the bare call discarded it + return strings.Split(stripped, "/") + } + } + + return []string{metadata.Title} +} + +func BuildContent(repo repository.Repository, metadata *repository.HymnMetadata, verse int) string { + if verse == 1 { + path := fmt.Sprintf("files/scores/musicxml/kj-%03d.musicxml", metadata.Number) + if metadata.Variant.String != "" { + path = fmt.Sprintf("files/scores/musicxml/kj-%03d%s.musicxml", metadata.Number, metadata.Variant.String) + } + xmls, err := repo.GetMusicXML(context.Background(), path) + if err != nil { + log.Println("failed to get the xml data, err:", err.Error()) + return "" + } + + lMapper := map[int]string{} + prevTotalLyric := -1 + for _, measure := range
xmls.Part.Measures { + measure.Build() + for _, note := range measure.Notes { + if len(note.Lyric) == 0 { + continue + } + + if prevTotalLyric != len(note.Lyric) && prevTotalLyric != -1 { + lMapper[1] += lMapper[2] + lMapper[2] = "" + } + + for _, l := range note.Lyric { + syl := "" + + for _, s := range l.Text { + syl += s.Value + } + // drop a two-byte leading digit prefix (presumably a verse-number marker); the length guard keeps empty or one-byte syllables from panicking + if len(syl) >= 2 && unicode.IsDigit(rune(syl[0])) { + syl = syl[2:] + } + lMapper[l.Number] += syl + + if l.Syllabic == musicxml.LyricSyllabicTypeEnd || l.Syllabic == musicxml.LyricSyllabicTypeSingle { + lMapper[l.Number] += " " + } + + } + + prevTotalLyric = len(note.Lyric) + } + } + + for part := 2; part <= 4; part++ { + if lMapper[part] != "" { + lMapper[1] += lMapper[part] + " " + } + } + + return lMapper[1] + + } + + result := "" + + if _, ok := metadata.Verse[verse]; !ok { + return "" + } + whole := [][]lyric.LyricWordVerse{} + + err := json.Unmarshal([]byte(metadata.Verse[verse].Content.String), &whole) + if err != nil { + log.Println("[RenderVerse] failed to unmarshal, err ", err) + } + + for _, line := range whole { + for _, word := range line { + result += word.Word + " " + } + } + + return result +} + +func buildContentWithOffsets(verses []VerseDoc) (string, []VerseDoc) { + var sb strings.Builder + for i := range verses { + verses[i].Start = sb.Len() + sb.WriteString(verses[i].Content) + verses[i].End = sb.Len() + // if i < len(verses)-1 { + // sb.WriteString(" ") + // } + } + return sb.String(), verses +} + +func BuildDocument(repo repository.Repository, num int, vaiant ...string) (*Document, error) { + // get hymn meta data + + metadata, err := repo.GetHymnMetaData(context.Background(), num, vaiant...)
+ if err != nil { + return nil, err + } + + cats := map[string][]string{} + + doc := &Document{ + ID: strconv.Itoa(metadata.HymnID), + Title: metadata.Title, + HymnNo: metadata.Number, + BE: int(metadata.RefBE.Int16), + NR: int(metadata.RefNR.Int16), + Copyright: metadata.Copyright.String, + MusicCredit: metadata.Music, + OriginalTitle: GetOriginalTitle(metadata), + Catergory: []usage.Entry{}, + } + + if len(vaiant) > 0 { + doc.Variant = vaiant[0] + doc.ID += vaiant[0] + } + verses := make([]VerseDoc, len(metadata.Verse)+1) + for verse := 1; verse <= len(metadata.Verse)+1; verse++ { + currCats := usage.Lookup(num, verse) + for _, curr := range currCats { + if cats[curr.H1] == nil { + cats[curr.H1] = []string{} + } + + cats[curr.H1] = append(cats[curr.H1], curr.H2) + } + verses[verse-1] = VerseDoc{No: verse, Content: BuildContent(repo, metadata, verse)} + } + + h2s := map[string]bool{} + for h1, cat := range cats { + for _, h2 := range cat { + if h2s[h2] { + continue + } + h2s[h2] = true + doc.Catergory = append(doc.Catergory, usage.Entry{H1: h1, H2: h2}) + } + } + + doc.Content, verses = buildContentWithOffsets(verses) + doc.Verses = verses + + return doc, nil + +} + +func main() { + + db, err := storage.NewStorage(context.Background(), "files/database/kidung-jemaat.db") + if err != nil { + log.Fatalf("Failed to connect to storage: %s", err.Error()) + return + } + + repo := repository.New(context.Background(), db) + + indexPath := "files/index/kj.bleve" + // Create a new index + indexMapping := bleve.NewIndexMapping() + index, err := bleve.New(indexPath, indexMapping) + if err != nil { + log.Fatal(err) + } + defer index.Close() + + docMapping := bleve.NewDocumentMapping() + + contentMapping := bleve.NewTextFieldMapping() + contentMapping.Store = true + contentMapping.IncludeTermVectors = true + docMapping.AddFieldMappingsAt("content", contentMapping) + + versesMapping := bleve.NewDocumentMapping() + + noMapping := bleve.NewNumericFieldMapping() + noMapping.Store = 
true + + verseContentMapping := bleve.NewTextFieldMapping() + verseContentMapping.Store = true + + startMapping := bleve.NewNumericFieldMapping() + startMapping.Store = true + + endMapping := bleve.NewNumericFieldMapping() + endMapping.Store = true + + variantMapping := bleve.NewTextFieldMapping() + variantMapping.Store = true + + hymnMap := bleve.NewNumericFieldMapping() + hymnMap.Store = true + + titleMap := bleve.NewTextFieldMapping() // Document.Title is a string, so map it as text rather than numeric + titleMap.Store = true + + versesMapping.AddFieldMappingsAt("no", noMapping) + versesMapping.AddFieldMappingsAt("content", verseContentMapping) + versesMapping.AddFieldMappingsAt("start", startMapping) + versesMapping.AddFieldMappingsAt("end", endMapping) + docMapping.AddSubDocumentMapping("verses", versesMapping) + docMapping.AddFieldMappingsAt("variant", variantMapping) + docMapping.AddFieldMappingsAt("hymn_no", hymnMap) + docMapping.AddFieldMappingsAt("title", titleMap) + + indexMapping.DefaultMapping = docMapping + + var variants = map[int][]string{ + 24: []string{"a", "b"}, + 30: []string{"a", "b"}, + 31: []string{"a", "b"}, + 37: []string{"a", "b"}, + 50: []string{"a", "b"}, + 95: []string{"a", "b"}, + 144: []string{"a", "b"}, + 146: []string{"a", "b"}, + 168: []string{"a", "b", "c"}, + 174: []string{"a", "b"}, + } + // Add documents + documents := []*Document{} + + for no := 1; no <= 25; no++ { + var doc *Document + var err error + if variant, ok := variants[no]; ok { + for _, vs := range variant { + doc, err = BuildDocument(repo, no, vs) + if err != nil { + log.Fatalf("Failed to build docs: %s", err.Error()) + return + } + + raw, _ := json.MarshalIndent(doc, "", " ") + log.Println(string(raw)) + documents = append(documents, doc) // collect every variant; appending after the loop kept only the last one + } + continue + } + + doc, err = BuildDocument(repo, no) + if err != nil { + log.Fatalf("Failed to build docs: %s", err.Error()) + return + } + + documents = append(documents, doc) + } + + // Iterate and index the documents + batch := index.NewBatch() + for _, doc := range documents { + batch.Index(doc.ID,
doc) + log.Println("indexing", doc.ID) + } + if err := index.Batch(batch); err != nil { + log.Fatal(err) + } + +} diff --git a/cmd/lab/search/query/main.go b/cmd/lab/search/query/main.go new file mode 100644 index 0000000..0515506 --- /dev/null +++ b/cmd/lab/search/query/main.go @@ -0,0 +1,70 @@ +package main + +import ( + "fmt" + "log" + + "github.com/blevesearch/bleve/v2" +) + +func main() { + + indexPath := "files/index/kj.bleve" + + // Load the existing index + index, err := bleve.Open(indexPath) + if err != nil { + log.Fatalf("opening index %q: %v", indexPath, err) // stop here: index is unusable and the deferred Close below would dereference nil + } + defer index.Close() + + // Search the created index + query := bleve.NewMatchQuery("sembah") + query.Fuzziness = 1 + query.Prefix = 1 + searchRequest := bleve.NewSearchRequest(query) + searchRequest.Fields = []string{"hymn_no", "title", "verses.no", "verses.content", "verses.start", "verses.end", "variant"} + searchRequest.Highlight = bleve.NewHighlight() + searchRequest.Explain = true + searchResult, err := index.Search(searchRequest) + if err != nil { + log.Fatal(err) + } + + result := map[int][]int{} + for _, hit := range searchResult.Hits { + + // starts := hit.Fields["verses.start"].([]interface{}) + // ends := hit.Fields["verses.end"].([]interface{}) + + // for i, s := range starts { + // start := int(s.(float64)) + // end := int(ends[i].(float64)) + // } + + // hit.Fields contains your stored values + // hit.Locations contains the match positions per field + log.Println("===================", hit.Fields["hymn_no"], hit.Fields["variant"], "|", hit.Fields["verses.start"], hit.Fields["verses.end"], hit.Fields["verses.no"]) + // get match positions in "content" + if locations, ok := hit.Locations["content"]; ok { + for term, termLocations := range locations { + for _, loc := range termLocations { + + fmt.Printf("term %q matched at byte offset %d\n", term, loc.Start) + hymnNo, ok := hit.Fields["hymn_no"].(float64) + if !ok { + continue // hymn_no missing or not numeric: skip the hit instead of panicking; append below works on a nil slice + } + +
result[int(hymnNo)] = append(result[int(hymnNo)], int(loc.Start)) + + log.Println("document ID", hit.ID) + + } + } + } + + log.Println(result) + } + +} diff --git a/cmd/lab/search/usage/usage.go b/cmd/lab/search/usage/usage.go new file mode 100644 index 0000000..f1aa00f --- /dev/null +++ b/cmd/lab/search/usage/usage.go @@ -0,0 +1,209 @@ +package usage + +import ( + "fmt" + "strconv" + "strings" +) + +// Entry represents a single hymn number (with optional verse) mapped to its categories. +type Entry struct { + H1 string `json:"category"` // top-level section (all caps) + H2 string `json:"sub_category"` // sub-category + Num int `json:"-"` + Verse int `json:"verse,omitempty"` // 0 means "any verse" / not specified +} + +// rawData holds the index as [h1, h2, comma-separated token list]. +// Tokens may be: +// - "287" → single number +// - "1-22" → inclusive range +// - "287:3" → number with specific verse +var rawData = [][3]string{ + {"MENGHADAP ALLAH", "Puji-pujian dan Pembukaan Ibadah", "1-22,47,57,60,62,64,191,242-244,246,287-288,290-292,294,295,391,452,454,456,459,464"}, + {"MENGHADAP ALLAH", "Pengakuan dan Pengampunan Dosa", "23-41,169,174,179,185,213,286,293,297,300,305,309,351-359,361,362,380,381,386-388,392,394,398,404,411,434,435,453,459,467"}, + {"MENGHADAP ALLAH", "Kyrie dan Gloria", "42-48,5,7,13,242-244,287:3,303,347"}, + {"PELAYANAN FIRMAN", "Pembacaan Alkitab", "49-59,73,145,150,228-231,233,236-238,240,423,464,473,474"}, + {"PELAYANAN FIRMAN", "Penciptaan dan Pemeliharaan", "60-69,1,3,4,10,19,243,244,287,289,291,298,334,337,385,389,449,461,467"}, + {"PELAYANAN FIRMAN", "Perjanjian Lama", "70-75,4,9,24,46,52,53,69,146,283,285,288,291,292,330,333,334,377,412"}, + {"PELAYANAN FIRMAN", "Penantian Mesias dan Masa Adven", "76-91,73,139,162,260,273,284"}, + {"PELAYANAN FIRMAN", "Kelahiran Yesus dan Masa Natal", "92-127,19,77,87,90,91,136,137,297"}, + {"PELAYANAN FIRMAN", "Akhir Masa Natal dan Epifania", 
"128-143,19,92,107,110,113-116,118,119,121-123,127,248,281,284,286,293,297,386"}, + {"PELAYANAN FIRMAN", "Kisah Pelayanan Yesus", "144-154,30,74,138,140,283,285,294,298,351-360,370,377,385,398,407,415,418,422,428,431,434,440,451,459,464,468"}, + {"PELAYANAN FIRMAN", "Masa Prapaskah", "155-163,24,28,32,33,46,87,91,152,174,179,286,309,314,372,375,376,381,412,430,443,463"}, + {"PELAYANAN FIRMAN", "Sengsara Yesus dan Jumat Agung", "164-186,32-34,152,156-158,160,286,311-313,368,382,394,404,435,460"}, + {"PELAYANAN FIRMAN", "Kebangkitan Yesus dan Masa Paskah", "187-217,1,5,7,19,65,72,139,152,181,222,226,242,243,246-248,250,281,283,285,286,291,292,295,323,370,373,383,386,394,397,398,404,407,415,435,443"}, + {"PELAYANAN FIRMAN", "Hari Kenaikan", "218-227,5,7,19,41,181,194,202,242,244,247,248,281,284,293,308,383,345,435"}, + {"PELAYANAN FIRMAN", "Roh Kudus dan Hari Pentakosta", "228-241,7,8,16,45,55,56,58,74,242-244,246,257,403"}, + {"PELAYANAN FIRMAN", "Allah Tritunggal dan Hari Trinitatis", "242-246,8,13,16,45,47,48,287:3,303,347,348"}, + {"PELAYANAN FIRMAN", "Gereja dan Kerajaan Allah", "247-261,1,4-7,46,72,74,88,194,213,220,222,224,226,242,243,269,282,330,339,340,345,372,391,434"}, + {"PELAYANAN FIRMAN", "Kehidupan Sorgawi", "262-271,2,5,7,72,219,221,222,224,252,282,283,285,330,355,377,398"}, + {"PELAYANAN FIRMAN", "Akhir Zaman dan Penggenapan", "272-279,5,7,72,74,220,224,225,247,248,260-270,281,282,293,323,340"}, + {"RESPONS TERHADAP PELAYANAN FIRMAN", "Pernyataan Keyakinan Iman", "280-285,5,7,38,46,72,77,194,222,247,248,250,252,305,306,308,309,314,356,367,370,374,376,377,380,383,386-388,392,394,396,405,415"}, + {"RESPONS TERHADAP PELAYANAN FIRMAN", "Pengucapan Syukur dan Persembahan", "286-303,1-12,19,60,62,65,72,77,194,220,249,258,259,291,309,341,361-365,367,393,405,424,433,437,444,450"}, + {"PELAYANAN KHUSUS", "Baptisan Kudus dan Peneguhan Sidi", 
"304-309,6,19,32,33,36,38,40,41,55,56,58,72,141,146,152,154,228,229,233,236-240,247-261,269,280-285,287,293,298,300,314,330,339,340,344,351,352,356,360,361-376,377,380,381,382,388,392,396,398,402,421,423,430,434,437,446,453,457,466"}, + {"PELAYANAN KHUSUS", "Perjamuan Kudus", "310-315,2,6,9,17,18,32-41,55,56,58,72,74,77,88,128,139,146,153,156,157,160,163,169,174,194,207,210,215,220,222,224,226-240,247-261,262,263,272,273,275,276,279,282,283,285,287-289,291,293,298,305,306,309,323,358,359,361-376,377,380-382,386,388,392,396,398,404,406-412,434,443,464,466,470"}, + {"PELAYANAN KHUSUS", "Pernikahan", "316-318,1,3,9,10,14,16,18,55,56,60,62,65,233,237,239,283,285,287,288,291,295,298,314,330,350,367,370,377,399,407,415,419,444,447,450,461,466"}, + {"PELAYANAN KHUSUS", "Peristiwa Istimewa Gerejawi", "319-320,1-18,22,49-56,60,72,194,213,220,222,229,233,237,239,240,242,243,247-260,272,284,287,291,292,303,314,330,338-343,367,372,376,391,405,422-437,446"}, + {"PELAYANAN KHUSUS", "Pemakaman", "32,37,46,72,128,194,202,207-210,222,261-270,274-279,282,283,285,291,300,329-331,370,372,376,377,380,388,394,412,417,438,445,453,457"}, + {"WAKTU DAN MUSIM", "Pagi dan Siang", "321-323,1-7,19,60-66,194,213,227,237,239,243,245,248,291,298,309,337,344,356,365,384,385,389,390,393,405,407,414,420,421,423,424,437,444,446-448,450,452-470"}, + {"WAKTU DAN MUSIM", "Petang dan Malam", "324-329,8,23,25,29,41,51,60,68,86,148,227,245,282,286,290,291,309,345,383,384,388,389,393,398,405,406,410,411,417,420,422,440-442,445,451-470"}, + {"WAKTU DAN MUSIM", "Pergantian Tahun", "330-335,1-16,23,47,60,72,121,136,138,220,242-246,248,250,255,260,283,285,286-291,298,305,321,328,329,337,343,345,365,370,377,379,383,393,405,406-421,440-442,445,453,457,461,466"}, + {"WAKTU DAN MUSIM", "Musim dan Panen", "333-335,1,3,4,10,19,60-66,74,243,244,287,289-291,295,298,299,302,303,322,337,444,449,461,469,470"}, + {"WAKTU DAN MUSIM", "Bangsa dan Negara", 
"336-337,1,3,9,15,60,67,260,287,289,291,292,295,298,299,322,330,334,399,444,461,470"}, + {"PENUTUPAN IBADAH", "Pengutusan", "338-344,49,50,128,163,231,243,244,247-260,281,287:3,303,329,345,346,365,372,374-376,402,405-408,414,421-438,440,448,450,457,458,462,466,470"}, + {"PENUTUPAN IBADAH", "Berkat", "345-350,5:7,7:4,478"}, + {"PENUTUPAN IBADAH", "Haleluya, Amin dan Lain-lain", "417-477,1,46,193,205,243,349"}, +} + +var masterIndex []Entry + +func init() { + if masterIndex == nil { + masterIndex = buildIndex() + } +} + +// buildIndex parses rawData into a slice of Entry. +func buildIndex() []Entry { + var entries []Entry + for _, row := range rawData { + h1, h2, tokens := row[0], row[1], row[2] + for _, tok := range strings.Split(tokens, ",") { + tok = strings.TrimSpace(tok) + if tok == "" { + continue + } + if strings.Contains(tok, ":") { + // e.g. "287:3" + parts := strings.SplitN(tok, ":", 2) + num, err1 := strconv.Atoi(parts[0]) + verse, err2 := strconv.Atoi(parts[1]) + if err1 == nil && err2 == nil { + entries = append(entries, Entry{h1, h2, num, verse}) + } + } else if strings.Contains(tok, "-") { + // e.g. "1-22" + parts := strings.SplitN(tok, "-", 2) + a, err1 := strconv.Atoi(parts[0]) + b, err2 := strconv.Atoi(parts[1]) + if err1 == nil && err2 == nil { + for n := a; n <= b; n++ { + entries = append(entries, Entry{h1, h2, n, 0}) + } + } + } else { + num, err := strconv.Atoi(tok) + if err == nil { + entries = append(entries, Entry{h1, h2, num, 0}) + } + } + } + } + return entries +} + +// lookup finds all categories for a given hymn number and optional verse. +// verse == 0 means "no filter" — returns all matches regardless of verse. 
+func Lookup(num, verse int) []Entry { + seen := map[string]bool{} + var results []Entry + for _, e := range masterIndex { + // number must match + if e.Num != num { + continue + } + // if entry has a specific verse AND caller specified a verse, they must match + if e.Verse != 0 && verse != 0 && e.Verse != verse { + continue + } + key := fmt.Sprintf("%s|%s|%d", e.H1, e.H2, e.Verse) + if !seen[key] { + seen[key] = true + results = append(results, e) + } + } + return results +} + +// func printResults(results []Entry, num, verse int) { +// label := strconv.Itoa(num) +// if verse != 0 { +// label += fmt.Sprintf(":%d", verse) +// } +// if len(results) == 0 { +// fmt.Printf("No categories found for hymn %s.\n", label) +// return +// } +// fmt.Printf("Hymn %s appears in %d category(ies):\n\n", label, len(results)) +// prevH1 := "" +// for _, e := range results { +// if e.H1 != prevH1 { +// fmt.Printf(" [ %s ]\n", e.H1) +// prevH1 = e.H1 +// } +// verseNote := "" +// if e.Verse != 0 { +// verseNote = fmt.Sprintf(" (verse %d)", e.Verse) +// } +// fmt.Printf(" • %s%s\n", e.H2, verseNote) +// } +// fmt.Println() +// } + +// func parseArg(s string) (num, verse int, err error) { +// if strings.Contains(s, ":") { +// parts := strings.SplitN(s, ":", 2) +// num, err = strconv.Atoi(parts[0]) +// if err != nil { +// return +// } +// verse, err = strconv.Atoi(parts[1]) +// return +// } +// num, err = strconv.Atoi(s) +// return +// } + +// func interactive(index []Entry) { +// scanner := bufio.NewScanner(os.Stdin) +// fmt.Println("Hymn Index Lookup — type a number (e.g. 287 or 287:3), or 'q' to quit.") +// for { +// fmt.Print("> ") +// if !scanner.Scan() { +// break +// } +// input := strings.TrimSpace(scanner.Text()) +// if input == "q" || input == "quit" || input == "exit" { +// break +// } +// if input == "" { +// continue +// } +// num, verse, err := parseArg(input) +// if err != nil { +// fmt.Println("Invalid input. 
Enter a number like 287 or 287:3.") +// continue +// } +// printResults(lookup(index, num, verse), num, verse) +// } +// } + +// func main() { +// index := buildIndex() + +// if len(os.Args) > 1 { +// // Non-interactive: hymn [287] or [287:3] +// for _, arg := range os.Args[1:] { +// num, verse, err := parseArg(arg) +// if err != nil { +// fmt.Fprintf(os.Stderr, "Invalid argument %q: %v\n", arg, err) +// continue +// } +// printResults(lookup(index, num, verse), num, verse) +// } +// return +// } + +// interactive(index) +// } diff --git a/files/index/.gitkeep b/files/index/.gitkeep new file mode 100644 index 0000000..4187ff3 --- /dev/null +++ b/files/index/.gitkeep @@ -0,0 +1 @@ +for index of bleve \ No newline at end of file diff --git a/go.mod b/go.mod index 61dbce3..763889b 100644 --- a/go.mod +++ b/go.mod @@ -1,25 +1,53 @@ module github.com/jodi-ivan/numbered-notation-xml -go 1.18 +go 1.23 + +toolchain go1.23.4 require ( + github.com/JoshVarga/svgparser v0.0.0-20200804023048-5eaba627a7d1 github.com/ajstarks/svgo v0.0.0-20211024235047-1546f124cd8b + github.com/blevesearch/bleve/v2 v2.5.7 github.com/golang-collections/collections v0.0.0-20130729185459-604e922904d3 github.com/golang/mock v1.6.0 github.com/jarcoal/httpmock v1.3.1 github.com/jmoiron/sqlx v1.3.5 github.com/julienschmidt/httprouter v1.3.0 github.com/mattn/go-sqlite3 v1.14.18 - github.com/stretchr/testify v1.8.1 + github.com/stretchr/testify v1.10.0 gopkg.in/gcfg.v1 v1.2.3 ) require ( - github.com/JoshVarga/svgparser v0.0.0-20200804023048-5eaba627a7d1 // indirect + github.com/RoaringBitmap/roaring/v2 v2.4.5 // indirect + github.com/bits-and-blooms/bitset v1.22.0 // indirect + github.com/blevesearch/bleve_index_api v1.2.11 // indirect + github.com/blevesearch/geo v0.2.4 // indirect + github.com/blevesearch/go-faiss v1.0.26 // indirect + github.com/blevesearch/go-porterstemmer v1.0.3 // indirect + github.com/blevesearch/gtreap v0.1.1 // indirect + github.com/blevesearch/mmap-go v1.0.4 // indirect + 
github.com/blevesearch/scorch_segment_api/v2 v2.3.13 // indirect + github.com/blevesearch/segment v0.9.1 // indirect + github.com/blevesearch/snowballstem v0.9.0 // indirect + github.com/blevesearch/upsidedown_store_api v1.0.2 // indirect + github.com/blevesearch/vellum v1.1.0 // indirect + github.com/blevesearch/zapx/v11 v11.4.2 // indirect + github.com/blevesearch/zapx/v12 v12.4.2 // indirect + github.com/blevesearch/zapx/v13 v13.4.2 // indirect + github.com/blevesearch/zapx/v14 v14.4.2 // indirect + github.com/blevesearch/zapx/v15 v15.4.2 // indirect + github.com/blevesearch/zapx/v16 v16.2.8 // indirect github.com/davecgh/go-spew v1.1.1 // indirect + github.com/golang/snappy v0.0.4 // indirect + github.com/json-iterator/go v0.0.0-20171115153421-f7279a603ede // indirect + github.com/mschoch/smat v0.2.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect + go.etcd.io/bbolt v1.4.0 // indirect golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4 // indirect - golang.org/x/text v0.3.3 // indirect + golang.org/x/sys v0.29.0 // indirect + golang.org/x/text v0.8.0 // indirect + google.golang.org/protobuf v1.36.6 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 4d19f71..abc42fe 100644 --- a/go.sum +++ b/go.sum @@ -1,10 +1,51 @@ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/JoshVarga/svgparser v0.0.0-20200804023048-5eaba627a7d1 h1:RAQocNl+YQYGPt5yh4SR5zFUIHKrXnLhjIGhHO4Vwnc= github.com/JoshVarga/svgparser v0.0.0-20200804023048-5eaba627a7d1/go.mod h1:tMmgUTWcco9d1ZmK7zjxuTv7XWZhyutXIsgu0uJ3gDw= +github.com/RoaringBitmap/roaring/v2 v2.4.5 h1:uGrrMreGjvAtTBobc0g5IrW1D5ldxDQYe2JW2gggRdg= +github.com/RoaringBitmap/roaring/v2 v2.4.5/go.mod h1:FiJcsfkGje/nZBZgCu0ZxCPOKD/hVXDS2dXi7/eUFE0= github.com/ajstarks/deck v0.0.0-20200831202436-30c9fc6549a9/go.mod h1:JynElWSGnm/4RlzPXRlREEwqTHAN3T56Bv2ITsFT3gY= github.com/ajstarks/deck/generate 
v0.0.0-20210309230005-c3f852c02e19/go.mod h1:T13YZdzov6OU0A1+RfKZiZN9ca6VeKdBdyDV+BY97Tk= github.com/ajstarks/svgo v0.0.0-20211024235047-1546f124cd8b h1:slYM766cy2nI3BwyRiyQj/Ud48djTMtMebDqepE95rw= github.com/ajstarks/svgo v0.0.0-20211024235047-1546f124cd8b/go.mod h1:1KcenG0jGWcpt8ov532z81sp/kMMUG485J2InIOyADM= +github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4= +github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/blevesearch/bleve/v2 v2.5.7 h1:2d9YrL5zrX5EBBW++GOaEKjE+NPWeZGaX77IM26m1Z8= +github.com/blevesearch/bleve/v2 v2.5.7/go.mod h1:yj0NlS7ocGC4VOSAedqDDMktdh2935v2CSWOCDMHdSA= +github.com/blevesearch/bleve_index_api v1.2.11 h1:bXQ54kVuwP8hdrXUSOnvTQfgK0KI1+f9A0ITJT8tX1s= +github.com/blevesearch/bleve_index_api v1.2.11/go.mod h1:rKQDl4u51uwafZxFrPD1R7xFOwKnzZW7s/LSeK4lgo0= +github.com/blevesearch/geo v0.2.4 h1:ECIGQhw+QALCZaDcogRTNSJYQXRtC8/m8IKiA706cqk= +github.com/blevesearch/geo v0.2.4/go.mod h1:K56Q33AzXt2YExVHGObtmRSFYZKYGv0JEN5mdacJJR8= +github.com/blevesearch/go-faiss v1.0.26 h1:4dRLolFgjPyjkaXwff4NfbZFdE/dfywbzDqporeQvXI= +github.com/blevesearch/go-faiss v1.0.26/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk= +github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo= +github.com/blevesearch/go-porterstemmer v1.0.3/go.mod h1:angGc5Ht+k2xhJdZi511LtmxuEf0OVpvUUNrwmM1P7M= +github.com/blevesearch/gtreap v0.1.1 h1:2JWigFrzDMR+42WGIN/V2p0cUvn4UP3C4Q5nmaZGW8Y= +github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgYICSZ3w0tYk= +github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc= +github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs= +github.com/blevesearch/scorch_segment_api/v2 v2.3.13 
h1:ZPjv/4VwWvHJZKeMSgScCapOy8+DdmsmRyLmSB88UoY= +github.com/blevesearch/scorch_segment_api/v2 v2.3.13/go.mod h1:ENk2LClTehOuMS8XzN3UxBEErYmtwkE7MAArFTXs9Vc= +github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU= +github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw= +github.com/blevesearch/snowballstem v0.9.0 h1:lMQ189YspGP6sXvZQ4WZ+MLawfV8wOmPoD/iWeNXm8s= +github.com/blevesearch/snowballstem v0.9.0/go.mod h1:PivSj3JMc8WuaFkTSRDW2SlrulNWPl4ABg1tC/hlgLs= +github.com/blevesearch/upsidedown_store_api v1.0.2 h1:U53Q6YoWEARVLd1OYNc9kvhBMGZzVrdmaozG2MfoB+A= +github.com/blevesearch/upsidedown_store_api v1.0.2/go.mod h1:M01mh3Gpfy56Ps/UXHjEO/knbqyQ1Oamg8If49gRwrQ= +github.com/blevesearch/vellum v1.1.0 h1:CinkGyIsgVlYf8Y2LUQHvdelgXr6PYuvoDIajq6yR9w= +github.com/blevesearch/vellum v1.1.0/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y= +github.com/blevesearch/zapx/v11 v11.4.2 h1:l46SV+b0gFN+Rw3wUI1YdMWdSAVhskYuvxlcgpQFljs= +github.com/blevesearch/zapx/v11 v11.4.2/go.mod h1:4gdeyy9oGa/lLa6D34R9daXNUvfMPZqUYjPwiLmekwc= +github.com/blevesearch/zapx/v12 v12.4.2 h1:fzRbhllQmEMUuAQ7zBuMvKRlcPA5ESTgWlDEoB9uQNE= +github.com/blevesearch/zapx/v12 v12.4.2/go.mod h1:TdFmr7afSz1hFh/SIBCCZvcLfzYvievIH6aEISCte58= +github.com/blevesearch/zapx/v13 v13.4.2 h1:46PIZCO/ZuKZYgxI8Y7lOJqX3Irkc3N8W82QTK3MVks= +github.com/blevesearch/zapx/v13 v13.4.2/go.mod h1:knK8z2NdQHlb5ot/uj8wuvOq5PhDGjNYQQy0QDnopZk= +github.com/blevesearch/zapx/v14 v14.4.2 h1:2SGHakVKd+TrtEqpfeq8X+So5PShQ5nW6GNxT7fWYz0= +github.com/blevesearch/zapx/v14 v14.4.2/go.mod h1:rz0XNb/OZSMjNorufDGSpFpjoFKhXmppH9Hi7a877D8= +github.com/blevesearch/zapx/v15 v15.4.2 h1:sWxpDE0QQOTjyxYbAVjt3+0ieu8NCE0fDRaFxEsp31k= +github.com/blevesearch/zapx/v15 v15.4.2/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw= +github.com/blevesearch/zapx/v16 v16.2.8 h1:SlnzF0YGtSlrsOE3oE7EgEX6BIepGpeqxs1IjMbHLQI= +github.com/blevesearch/zapx/v16 v16.2.8/go.mod 
h1:murSoCJPCk25MqURrcJaBQ1RekuqSCSfMjXH4rHyA14= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -14,10 +55,20 @@ github.com/golang-collections/collections v0.0.0-20130729185459-604e922904d3 h1: github.com/golang-collections/collections v0.0.0-20130729185459-604e922904d3/go.mod h1:nPpo7qLxd6XL3hWJG/O60sR8ZKfMCiIoNap5GvD12KU= github.com/golang/mock v1.6.0 h1:ErTB+efbowRARo13NNdxyJji2egdxLGQhRaY+DUumQc= github.com/golang/mock v1.6.0/go.mod h1:p6yTPP+5HYm5mzsMV8JkE6ZKdX+/wYM6Hr+LicevLPs= +github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= +github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= +github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/jarcoal/httpmock v1.3.1 h1:iUx3whfZWVf3jT01hQTO/Eo5sAYtB2/rqaUuOtpInww= github.com/jarcoal/httpmock v1.3.1/go.mod h1:3yb8rc4BI7TCBhFY8ng0gjuLKJNquuDNiPaZjnENuYg= github.com/jmoiron/sqlx v1.3.5 h1:vFFPA71p1o5gAeqtEAwLU4dnX2napprKtHr7PYIcN3g= github.com/jmoiron/sqlx v1.3.5/go.mod h1:nRVWtLre0KfCLJvgxzCsLVMogSvQ1zNJtpYr2Ccp0mQ= +github.com/json-iterator/go v0.0.0-20171115153421-f7279a603ede h1:YrgBGwxMRK0Vq0WSCWFaZUnTsrA/PZE/xs1QZh+/edg= +github.com/json-iterator/go v0.0.0-20171115153421-f7279a603ede/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/julienschmidt/httprouter v1.3.0 
h1:U0609e9tgbseu3rBINet9P48AI/D3oJs4dN7jwJOQ1U= github.com/julienschmidt/httprouter v1.3.0/go.mod h1:JR6WtHb+2LUe8TCKY3cZOxFyyO8IZAc4RVcycCCAKdM= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= @@ -27,17 +78,19 @@ github.com/mattn/go-sqlite3 v1.14.6/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A github.com/mattn/go-sqlite3 v1.14.18 h1:JL0eqdCOq6DJVNPSvArO/bIV9/P7fbGrV00LZHc+5aI= github.com/mattn/go-sqlite3 v1.14.18/go.mod h1:2eHXhiwb8IkHr+BDWZGa96P6+rkvnG63S2DGjv9HUNg= github.com/maxatome/go-testdeep v1.12.0 h1:Ql7Go8Tg0C1D/uMMX59LAoYK7LffeJQ6X2T04nTH68g= +github.com/maxatome/go-testdeep v1.12.0/go.mod h1:lPZc/HAcJMP92l7yI6TRz1aZN5URwUBUAfUNvrclaNM= +github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM= +github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= -github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= -github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= -github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= +github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/yuin/goldmark v1.2.1/go.mod 
h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= +go.etcd.io/bbolt v1.4.0 h1:TU77id3TnN/zKr7CO/uk+fBCwF2jGcMuw2B/FMAzYIk= +go.etcd.io/bbolt v1.4.0/go.mod h1:AsD+OCi/qPN1giOX1aiLAha3o1U8rAz65bvN4j0sRuk= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= @@ -51,6 +104,8 @@ golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96b golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -58,10 +113,14 @@ golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys 
v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.29.0 h1:TPYlXGxvx1MGTn2GiZDhnjPA9wZzZeGKHHmKhHYvgaU= +golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.8.0 h1:57P1ETyNKtuIjB4SRd15iJxuhj8Gc416Y78H3qgMh68= +golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0= @@ -69,6 +128,8 @@ golang.org/x/tools v0.1.1/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= +google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/gcfg.v1 v1.2.3 h1:m8OOJ4ccYHnx2f4gQwpno8nAX5OGOh7RLaaz0pj3Ogs= @@ -76,6 +137,7 @@ gopkg.in/gcfg.v1 v1.2.3/go.mod 
h1:yesOnuUOFQAhST5vPY4nbZsb/huCgGGXlipJsBn0b3o= gopkg.in/warnings.v0 v0.1.2 h1:wFXVbFY8DY5/xOe1ECiWdKCzZlxgshcYVNkBHstARME= gopkg.in/warnings.v0 v0.1.2/go.mod h1:jksf8JmL6Qr/oQM2OXTHunEvvTAsrWBLb6OOjuVWRNI= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.0/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= honnef.co/go/tools v0.1.3/go.mod h1:NgwopIslSNH47DimFoV78dnkksY2EFtX0ajyb3K/las= From 4c4da930c0e1bc4c2a68feff3cb6133bed8a6725 Mon Sep 17 00:00:00 2001 From: Jody Ivan Lumbantoruan Date: Sat, 21 Mar 2026 10:35:04 +0700 Subject: [PATCH 2/3] feat(search): dedicated ngram field --- cmd/lab/search/main.go | 147 +++++++++++-------------- cmd/lab/search/query/main.go | 204 +++++++++++++++++++++++++++++------ 2 files changed, 235 insertions(+), 116 deletions(-) diff --git a/cmd/lab/search/main.go b/cmd/lab/search/main.go index c22950d..a74bb3c 100644 --- a/cmd/lab/search/main.go +++ b/cmd/lab/search/main.go @@ -10,7 +10,13 @@ import ( "strings" "unicode" + _ "github.com/blevesearch/bleve/v2/analysis/analyzer/custom" + _ "github.com/blevesearch/bleve/v2/analysis/token/lowercase" + _ "github.com/blevesearch/bleve/v2/analysis/token/ngram" + _ "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode" + "github.com/blevesearch/bleve/v2" + "github.com/blevesearch/bleve/v2/analysis/token/lowercase" "github.com/jodi-ivan/numbered-notation-xml/cmd/lab/search/usage" "github.com/jodi-ivan/numbered-notation-xml/internal/lyric" "github.com/jodi-ivan/numbered-notation-xml/internal/musicxml" @@ -24,6 +30,11 @@ func init() { if originalTitleExp == nil { originalTitleExp = regexp.MustCompile(".*") } + + // registry.RegisterTokenFilter("ngram", func(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + // return 
ngram.NgramFilterConstructor(config, cache) + // }) + } type VerseDoc struct { @@ -46,95 +57,26 @@ type Document struct { ForKids bool `json:"for_kids"` Copyright string `json:"copyright,omitempty"` MusicCredit string `json:"music"` + Title_ngram string `json:"title_ngram"` } -var categories = map[string]map[string][2]int{ - "Menghadap Allah": map[string][2]int{ - "Puji-pujian dan Pembukaan Ibadah": [2]int{1, 22}, - "Pengakuan dan Pengampunan Dosa": [2]int{23, 41}, - "Kyrie dan Gloria": [2]int{42, 48}, - }, - "Pelayanan Firman": map[string][2]int{ - "Pembacaan Alkitab": [2]int{49, 59}, - "Penciptaan dan Pemeliharaan": [2]int{60, 69}, - "Pejanjian Lama": [2]int{70, 75}, - "Penantian Mesias dan Masa Adven": [2]int{76, 91}, - "Kelahiran Yesus dan Masa Natal": [2]int{92, 127}, - "Akhir Masa Natal dan Epifania": [2]int{128, 143}, - "Kisah Pelayanan Yesus": [2]int{144, 154}, - "Masa Prapaskah": [2]int{155, 163}, - "Sengsara Yesus dan Jumat Agung": [2]int{164, 186}, - "Kebangkitan Yesus dan Masa Paskah": [2]int{187, 217}, - "Hari Kenaikan": [2]int{218, 227}, - "Roh Kudus dan Hari Pentakosta": [2]int{228, 241}, - "Allah Tritunggal dan Hari Trinitatis": [2]int{242, 246}, - "Gereja dan Kerajaan Allah": [2]int{247, 261}, - "Kehidupan Sorgawi": [2]int{262, 271}, - "Akhir Zaman dan Penggenapan": [2]int{272, 279}, - }, - "Respons Terhadap Pelayanan Firman": map[string][2]int{ - "Pernyataan Keyakinan Iman": [2]int{280, 285}, - "Pengucapan Syukur dan Persembahan": [2]int{286, 303}, - }, - "Pelayanan Khusus": map[string][2]int{ - "Baptisan Kudus dan Peneguhan Sidi": [2]int{304, 309}, - "Perjamuan Kudus": [2]int{310, 315}, - "Pernikahan": [2]int{316, 318}, - "Peristiwa Isimewa Gerejawi": [2]int{319, 320}, - }, - "Waktu dan Musim": map[string][2]int{ - "Pagi dan Siang": [2]int{321, 323}, - "Petang dan Malam": [2]int{323, 329}, - "Pergantian Tahun": [2]int{330, 332}, - "Musim dan Panen": [2]int{333, 335}, - "Bangsa dan Negara": [2]int{336, 337}, - }, - "Penutupan Ibadah": 
map[string][2]int{ - "Pungutusan": [2]int{338, 344}, - "Berkat": [2]int{345, 350}, - }, - "Hidup Beriman Sehari-hari": map[string][2]int{ - "Panggilan Juruselamat": [2]int{351, 360}, - "Penyerahan Diri": [2]int{361, 376}, - "Kebesaran Rahmat Tuhan": [2]int{377, 390}, - "Sukacita dalam Tuhan": [2]int{391, 399}, - "Hidup Bersama Tuhan": [2]int{400, 405}, - "Tuntunan Tuhan": [2]int{406, 421}, - "Tanggung Jawab Pengikut Kristus": [2]int{422, 437}, - "Kemenangan dalam Perjuangan": [2]int{438, 446}, - "Keluarga dan Persekutuan": [2]int{447, 451}, - "Doa dan Setiap Waktu": [2]int{452, 471}, - }, - "Haleluya, Amin dan Lain-lain": map[string][2]int{ - "": [2]int{472, 478}, - }, -} - -func GetCategory(num int) (string, string) { - - for cat, subs := range categories { - for subcat, ranges := range subs { - if num >= ranges[0] && num <= ranges[1] { - return cat, subcat - } - } +func GetOriginalTitle(metadata *repository.HymnMetadata) []string { + result := []string{} + if strings.HasPrefix(metadata.Lyric, "") { + index := strings.Index(metadata.Lyric, "") + lyric := metadata.Lyric[3:index] + lyric = strings.TrimSpace(lyric) + lyric = strings.TrimSuffix(lyric, ",") + result = strings.Split(lyric, "/") + } else { + result = []string{metadata.Title} } - return "", "" - -} - -func GetOriginalTitle(metadata *repository.HymnMetadata) []string { - if strings.Contains(metadata.Lyric, "") { - submatch := originalTitleExp.FindStringSubmatch(metadata.Lyric) - if len(submatch) > 0 && len(submatch[0]) > 6 { - stripped := submatch[0][3 : len(submatch[0])-4] - strings.TrimSuffix(stripped, ",") - return strings.Split(stripped, "/") - } + for i, v := range result { + result[i] = strings.TrimSpace(v) } - return []string{metadata.Title} + return result } func BuildContent(repo repository.Repository, metadata *repository.HymnMetadata, verse int) string { @@ -242,6 +184,7 @@ func BuildDocument(repo repository.Repository, num int, vaiant ...string) (*Docu doc := &Document{ ID: 
strconv.Itoa(metadata.HymnID), Title: metadata.Title, + Title_ngram: metadata.Title, HymnNo: metadata.Number, BE: int(metadata.RefBE.Int16), NR: int(metadata.RefNR.Int16), @@ -305,6 +248,30 @@ func main() { } defer index.Close() + // 1. Create a custom token filter for n-grams (3 to 6 characters is usually sweet spot) + err = indexMapping.AddCustomTokenFilter("my_ngram", + map[string]interface{}{ + "type": "ngram", + "min": 3, + "max": 6, + }) + if err != nil { + log.Fatal(err) + } + // 2. Create an analyzer using that filter + err = indexMapping.AddCustomAnalyzer("part_word_analyzer", + map[string]interface{}{ + "type": "custom", + "tokenizer": "unicode", + "token_filters": []string{ + lowercase.Name, + "my_ngram", + }, + }) + if err != nil { + log.Fatal(err) + } + docMapping := bleve.NewDocumentMapping() contentMapping := bleve.NewTextFieldMapping() @@ -332,9 +299,16 @@ func main() { hymnMap := bleve.NewNumericFieldMapping() hymnMap.Store = true - titleMap := bleve.NewNumericFieldMapping() + titleMap := bleve.NewTextFieldMapping() titleMap.Store = true + titleNgram := bleve.NewTextFieldMapping() + titleNgram.Store = true + titleNgram.Analyzer = "part_word_analyzer" + + ogMap := bleve.NewTextFieldMapping() + ogMap.Store = true + versesMapping.AddFieldMappingsAt("no", noMapping) versesMapping.AddFieldMappingsAt("content", verseContentMapping) versesMapping.AddFieldMappingsAt("start", startMapping) @@ -343,6 +317,8 @@ func main() { docMapping.AddFieldMappingsAt("variant", variantMapping) docMapping.AddFieldMappingsAt("hymn_no", hymnMap) docMapping.AddFieldMappingsAt("title", titleMap) + docMapping.AddFieldMappingsAt("original_title", ogMap) + docMapping.AddFieldMappingsAt("title_ngram", titleNgram) indexMapping.DefaultMapping = docMapping @@ -361,7 +337,7 @@ func main() { // Add documents documents := []*Document{} - for no := 1; no <= 25; no++ { + for no := 1; no <= 214; no++ { var doc *Document var err error if variant, ok := variants[no]; ok { @@ -392,7 +368,6 @@ 
func main() { batch := index.NewBatch() for _, doc := range documents { batch.Index(doc.ID, doc) - log.Println("indexing", doc.ID) } if err := index.Batch(batch); err != nil { log.Fatal(err) diff --git a/cmd/lab/search/query/main.go b/cmd/lab/search/query/main.go index 0515506..b788044 100644 --- a/cmd/lab/search/query/main.go +++ b/cmd/lab/search/query/main.go @@ -5,10 +5,39 @@ import ( "log" "github.com/blevesearch/bleve/v2" + _ "github.com/blevesearch/bleve/v2/analysis/lang/id" ) +type VerseDoc struct { + No int `json:"no"` + Content string `json:"content"` + Start int `json:"start"` + End int `json:"end"` +} + +func toFloat64Slice(v interface{}) []float64 { + switch val := v.(type) { + case []interface{}: + result := make([]float64, len(val)) + for i, item := range val { + result[i] = item.(float64) + } + return result + case float64: + return []float64{val} // wrap scalar in slice + } + return nil +} + func main() { + // db, err := storage.NewStorage(context.Background(), "files/database/kidung-jemaat.db") + // if err != nil { + // log.Fatalf("Failed to connect to storage: %s", err.Error()) + // return + // } + // repo := repository.New(context.Background(), db) + indexPath := "files/index/kj.bleve" // Load the existing index @@ -18,53 +47,168 @@ func main() { } defer index.Close() - // Search the created index - query := bleve.NewMatchQuery("sembah") - query.Fuzziness = 1 - query.Prefix = 1 - searchRequest := bleve.NewSearchRequest(query) - searchRequest.Fields = []string{"hymn_no", "title", "verses.no", "verses.content", "verses.start", "verses.end", "variant"} + // q := bleve.NewWildcardQuery("*dalam*") + // q.SetField("content") + + // q3 := bleve.NewWildcardQuery("*dalam*") + // q3.SetBoost(3) + // q3.SetField("title") + + // combined1 := bleve.NewDisjunctionQuery(q, q3) + + // searchRequest := bleve.NewSearchRequest(combined1) + // searchRequest.IncludeLocations = true + // searchRequest.Fields = []string{"hymn_no", "content", "title", "verses.no", 
"verses.content", "verses.start", "verses.end", "variant", "original_title"} + // searchRequest.Highlight = bleve.NewHighlight() + // searchRequest.Explain = true + // searchResult, err := index.Search(searchRequest) + + // if err != nil { + // log.Fatal(err) + // } + + // log.Println("taken:", searchResult.Took) + + // // result := map[int][]int{} + // for _, hit := range searchResult.Hits { + + // matchedVerseNos := map[int][]string{} + + // if locations, ok := hit.Locations["content"]; ok { + + // rawStarts := toFloat64Slice(hit.Fields["verses.start"]) + // rawEnds := toFloat64Slice(hit.Fields["verses.end"]) + // rawNos := toFloat64Slice(hit.Fields["verses.no"]) + // log.Println("Score", hit.Fields["hymn_no"].(float64), ": ", hit.Score) + // for term, termLocations := range locations { + // for _, loc := range termLocations { + // // find which verse this offset falls in + // for i := range rawStarts { + // start := rawStarts[i] + // end := rawEnds[i] + // if loc.Start >= uint64(start) && loc.Start < uint64(end) { + // if matchedVerseNos[int(rawNos[i])] == nil { + // matchedVerseNos[int(rawNos[i])] = []string{} + // } + // matchedVerseNos[int(rawNos[i])] = append(matchedVerseNos[int(rawNos[i])], term) + // break + // } + // } + // } + // } + // var verseNos []int + // for no := range matchedVerseNos { + // verseNos = append(verseNos, no) + // } + // sort.Ints(verseNos) + + // for _, no := range verseNos { + // fmt.Printf("Found: %s. 
hymn %.0f — matched verses: %v\n", matchedVerseNos[no], hit.Fields["hymn_no"].(float64), no) + // } + // fmt.Println("") + // } + + // if locations, ok := hit.Locations["title"]; ok { + // for term, termLocations := range locations { + // for _, loc := range termLocations { + // // find which verse this offset falls in + // fmt.Printf("[%.0f] Title term %q matched at byte offset %d: %s\n", hit.Fields["hymn_no"].(float64), term, loc.Start, hit.Fields["title"]) + + // } + // } + // } + + // } + + // q1 := bleve.NewWildcardQuery("*dalam*") + // q1.SetField("title") + // q1.SetBoost(3) + + // q2 := bleve.NewWildcardQuery("*dalam*") + // q2.SetField("original_title") + + // combined := bleve.NewDisjunctionQuery(q1, q2) + + // searchRequest1 := bleve.NewSearchRequest(combined) + // searchRequest1.IncludeLocations = true + // searchRequest1.Fields = []string{"hymn_no", "content", "title", "verses.no", "verses.content", "verses.start", "verses.end", "variant"} + // searchRequest1.Highlight = bleve.NewHighlight() + // searchRequest1.Explain = true + // searchResult1, err := index.Search(searchRequest1) + + // if err != nil { + // log.Fatal(err) + // } + + // log.Println("taken:", searchResult1.Took) + + // // result := map[int][]int{} + // for _, hit := range searchResult1.Hits { + + // log.Println("===================", hit.Fields["hymn_no"], hit.Fields["variant"]) + + // log.Println("Score", hit.Fields["hymn_no"].(float64), ": ", hit.Score) + + // if locations, ok := hit.Locations["original_title"]; ok { + // for term, termLocations := range locations { + // for _, loc := range termLocations { + // // find which verse this offset falls in + // fmt.Printf("Original title term %q matched at byte offset %d: %s\n", term, loc.Start, hit.Fields["original_title"]) + // } + // } + // } + + // if locations, ok := hit.Locations["title"]; ok { + // for term, termLocations := range locations { + // for _, loc := range termLocations { + // // find which verse this offset falls in + // 
fmt.Printf("Title term %q matched at byte offset %d: %s\n", term, loc.Start, hit.Fields["title"]) + // } + // } + // } + + // fmt.Println("") + + // } + + q := bleve.NewPrefixQuery("hai") + q.SetField("title_ngram") + + // q1 := bleve.NewMatchQuery("t'lah") + // q1.SetField("title_ngram") + + // combined := bleve.NewDisjunctionQuery(q1, q) + + searchRequest := bleve.NewSearchRequest(q) + searchRequest.IncludeLocations = true + searchRequest.Fields = []string{"hymn_no", "variant", "title", "title_ngram"} searchRequest.Highlight = bleve.NewHighlight() searchRequest.Explain = true searchResult, err := index.Search(searchRequest) + if err != nil { log.Fatal(err) } - result := map[int][]int{} + log.Println("taken:", searchResult.Took) + for _, hit := range searchResult.Hits { - // starts := hit.Fields["verses.start"].([]interface{}) - // ends := hit.Fields["verses.end"].([]interface{}) + log.Println("===================", hit.Fields["hymn_no"], hit.Fields["variant"]) - // for i, s := range starts { - // start := int(s.(float64)) - // end := int(ends[i].(float64)) - // } + log.Println("Score", hit.Fields["hymn_no"].(float64), ": ", hit.Score) - // hit.Fields contains your stored values - // hit.Locations contains the match positions per field - log.Println("===================", hit.Fields["hymn_no"], hit.Fields["variant"], "|", hit.Fields["verses.start"], hit.Fields["verses.end"], hit.Fields["verses.no"]) - // get match positions in "content" - if locations, ok := hit.Locations["content"]; ok { + if locations, ok := hit.Locations["title_ngram"]; ok { for term, termLocations := range locations { for _, loc := range termLocations { - - fmt.Printf("term %q matched at byte offset %d\n", term, loc.Start) - hymnNo := hit.Fields["hymn_no"].(float64) - if result[int(hymnNo)] == nil { - result[int(hymnNo)] = []int{} - } - - result[int(hymnNo)] = append(result[int(hymnNo)], int(loc.Start)) - - log.Println("document ID", hit.ID) - + // find which verse this offset falls in + 
fmt.Printf("Original title term %q matched at byte offset %d: %s\n", term, loc.Start, hit.Fields["title_ngram"]) } } } - log.Println(result) + fmt.Println("") + } } From 1881d33e186dd8d013f601c451faa8224ec0ff19 Mon Sep 17 00:00:00 2001 From: Jody Ivan Lumbantoruan Date: Sat, 21 Mar 2026 17:01:30 +0700 Subject: [PATCH 3/3] style(cleanup): cleanup --- .../{usage/usage.go => category/category.go} | 84 +------ cmd/lab/search/main.go | 53 ++--- cmd/lab/verses-generator/main.go | 211 ++++++++++++++++-- 3 files changed, 210 insertions(+), 138 deletions(-) rename cmd/lab/search/{usage/usage.go => category/category.go} (78%) diff --git a/cmd/lab/search/usage/usage.go b/cmd/lab/search/category/category.go similarity index 78% rename from cmd/lab/search/usage/usage.go rename to cmd/lab/search/category/category.go index f1aa00f..4313a24 100644 --- a/cmd/lab/search/usage/usage.go +++ b/cmd/lab/search/category/category.go @@ -1,4 +1,4 @@ -package usage +package category import ( "fmt" @@ -125,85 +125,3 @@ func Lookup(num, verse int) []Entry { } return results } - -// func printResults(results []Entry, num, verse int) { -// label := strconv.Itoa(num) -// if verse != 0 { -// label += fmt.Sprintf(":%d", verse) -// } -// if len(results) == 0 { -// fmt.Printf("No categories found for hymn %s.\n", label) -// return -// } -// fmt.Printf("Hymn %s appears in %d category(ies):\n\n", label, len(results)) -// prevH1 := "" -// for _, e := range results { -// if e.H1 != prevH1 { -// fmt.Printf(" [ %s ]\n", e.H1) -// prevH1 = e.H1 -// } -// verseNote := "" -// if e.Verse != 0 { -// verseNote = fmt.Sprintf(" (verse %d)", e.Verse) -// } -// fmt.Printf(" • %s%s\n", e.H2, verseNote) -// } -// fmt.Println() -// } - -// func parseArg(s string) (num, verse int, err error) { -// if strings.Contains(s, ":") { -// parts := strings.SplitN(s, ":", 2) -// num, err = strconv.Atoi(parts[0]) -// if err != nil { -// return -// } -// verse, err = strconv.Atoi(parts[1]) -// return -// } -// num, err = 
strconv.Atoi(s) -// return -// } - -// func interactive(index []Entry) { -// scanner := bufio.NewScanner(os.Stdin) -// fmt.Println("Hymn Index Lookup — type a number (e.g. 287 or 287:3), or 'q' to quit.") -// for { -// fmt.Print("> ") -// if !scanner.Scan() { -// break -// } -// input := strings.TrimSpace(scanner.Text()) -// if input == "q" || input == "quit" || input == "exit" { -// break -// } -// if input == "" { -// continue -// } -// num, verse, err := parseArg(input) -// if err != nil { -// fmt.Println("Invalid input. Enter a number like 287 or 287:3.") -// continue -// } -// printResults(lookup(index, num, verse), num, verse) -// } -// } - -// func main() { -// index := buildIndex() - -// if len(os.Args) > 1 { -// // Non-interactive: hymn [287] or [287:3] -// for _, arg := range os.Args[1:] { -// num, verse, err := parseArg(arg) -// if err != nil { -// fmt.Fprintf(os.Stderr, "Invalid argument %q: %v\n", arg, err) -// continue -// } -// printResults(lookup(index, num, verse), num, verse) -// } -// return -// } - -// interactive(index) -// } diff --git a/cmd/lab/search/main.go b/cmd/lab/search/main.go index a74bb3c..9baa7d3 100644 --- a/cmd/lab/search/main.go +++ b/cmd/lab/search/main.go @@ -5,7 +5,6 @@ import ( "encoding/json" "fmt" "log" - "regexp" "strconv" "strings" "unicode" @@ -17,26 +16,13 @@ import ( "github.com/blevesearch/bleve/v2" "github.com/blevesearch/bleve/v2/analysis/token/lowercase" - "github.com/jodi-ivan/numbered-notation-xml/cmd/lab/search/usage" + "github.com/jodi-ivan/numbered-notation-xml/cmd/lab/search/category" "github.com/jodi-ivan/numbered-notation-xml/internal/lyric" "github.com/jodi-ivan/numbered-notation-xml/internal/musicxml" "github.com/jodi-ivan/numbered-notation-xml/svc/repository" "github.com/jodi-ivan/numbered-notation-xml/utils/storage" ) -var originalTitleExp *regexp.Regexp - -func init() { - if originalTitleExp == nil { - originalTitleExp = regexp.MustCompile(".*") - } - - // registry.RegisterTokenFilter("ngram", 
func(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { - // return ngram.NgramFilterConstructor(config, cache) - // }) - -} - type VerseDoc struct { No int `json:"no"` Content string `json:"content"` @@ -44,20 +30,20 @@ type VerseDoc struct { End int `json:"end"` } type Document struct { - ID string `json:"id"` - Title string `json:"title"` - Content string `json:"content"` - Catergory []usage.Entry `json:"categories,omitempty"` - HymnNo int `json:"hymn_no"` - Variant string `json:"variant,omitempty"` - Verses []VerseDoc `json:"verses"` - OriginalTitle []string `json:"original_title"` - BE int `json:"be_num,omitempty"` - NR int `json:"nr_num,omitempty"` - ForKids bool `json:"for_kids"` - Copyright string `json:"copyright,omitempty"` - MusicCredit string `json:"music"` - Title_ngram string `json:"title_ngram"` + ID string `json:"id"` + Title string `json:"title"` + Content string `json:"content"` + Catergory []category.Entry `json:"categories,omitempty"` + HymnNo int `json:"hymn_no"` + Variant string `json:"variant,omitempty"` + Verses []VerseDoc `json:"verses"` + OriginalTitle []string `json:"original_title"` + BE int `json:"be_num,omitempty"` + NR int `json:"nr_num,omitempty"` + ForKids bool `json:"for_kids"` + Copyright string `json:"copyright,omitempty"` + MusicCredit string `json:"music"` + Title_ngram string `json:"title_ngram"` } func GetOriginalTitle(metadata *repository.HymnMetadata) []string { @@ -164,9 +150,6 @@ func buildContentWithOffsets(verses []VerseDoc) (string, []VerseDoc) { verses[i].Start = sb.Len() sb.WriteString(verses[i].Content) verses[i].End = sb.Len() - // if i < len(verses)-1 { - // sb.WriteString(" ") - // } } return sb.String(), verses } @@ -191,7 +174,7 @@ func BuildDocument(repo repository.Repository, num int, vaiant ...string) (*Docu Copyright: metadata.Copyright.String, MusicCredit: metadata.Music, OriginalTitle: GetOriginalTitle(metadata), - Catergory: []usage.Entry{}, + Catergory: 
[]category.Entry{}, } if len(vaiant) > 0 { @@ -200,7 +183,7 @@ func BuildDocument(repo repository.Repository, num int, vaiant ...string) (*Docu } verses := make([]VerseDoc, len(metadata.Verse)+1) for verse := 1; verse <= len(metadata.Verse)+1; verse++ { - currCats := usage.Lookup(num, verse) + currCats := category.Lookup(num, verse) for _, curr := range currCats { if cats[curr.H1] == nil { cats[curr.H1] = []string{} @@ -218,7 +201,7 @@ func BuildDocument(repo repository.Repository, num int, vaiant ...string) (*Docu continue } h2s[h2] = true - doc.Catergory = append(doc.Catergory, usage.Entry{H1: h1, H2: h2}) + doc.Catergory = append(doc.Catergory, category.Entry{H1: h1, H2: h2}) } } diff --git a/cmd/lab/verses-generator/main.go b/cmd/lab/verses-generator/main.go index 89481b9..fc1fbdb 100644 --- a/cmd/lab/verses-generator/main.go +++ b/cmd/lab/verses-generator/main.go @@ -1,11 +1,16 @@ package main import ( + "context" + "database/sql" "encoding/json" "log" "strings" + "unicode" - "github.com/jodi-ivan/numbered-notation-xml/internal/lyric" + "github.com/jmoiron/sqlx" + "github.com/jodi-ivan/numbered-notation-xml/cmd/lab/verse" + "github.com/jodi-ivan/numbered-notation-xml/utils/storage" ) type WordBreakdown struct { @@ -13,39 +18,205 @@ type WordBreakdown struct { Breakdown []string } +// Result holds the full processing output for a sentence. +type Result struct { + Breakdown string // e.g. "ma-(lai)-kat se-la-lu" + NotInDB []string // clean words that were not found in DB +} + +// WordResult holds per-word processing output. 
+type WordResult struct { + Breakdown string + InDB bool +} + type Line []WordBreakdown func main() { + dbPath := `/home/jodiivan/go/src/github.com/jodi-ivan/numbered-notation-xml/files/database/kidung-jemaat.db` + db, err := storage.NewStorage(context.Background(), dbPath) + if err != nil { + log.Fatalf("Failed to connect to storage: %s", err.Error()) + return + } - result := []Line{} + defer db.Close() verses := ` - Kerubim dan serafim - memuliakan Yang Trisuci; - para rasul dan nabi, - martir yang berjubah putih - G'reja yang kudus, esa, - kepadaMu menyembah. + _malaikat _kekuatan _Daud _Tahu ` lines := strings.Split(verses, "\n") for _, l := range lines { - line := []WordBreakdown{} - words := strings.Fields(l) - if len(words) == 0 { + res, _ := ProcessSentence(db, l) + raw, _ := json.MarshalIndent(res, "", " ") + log.Println(string(raw)) + } + +} + +// splitTrailing splits a word into its alpha core and trailing symbols. +// Trailing = punctuation (.,!?;:) and quotes (" ' ") +func splitTrailing(word string) (core, trail string) { + i := len(word) + for i > 0 { + r := rune(word[i-1]) + if unicode.IsPunct(r) || r == '"' { + i-- + } else { + break + } + } + return word[:i], word[i:] +} + +// isElided returns true and strips the underscore prefix. +func isElided(word string) (string, bool) { + if strings.HasPrefix(word, "_") { + return word[1:], true + } + return word, false +} + +// applyElision wraps the elision_index-th syllable (1-based) in parentheses. +// e.g. breakdown="ma-lai-kat", elisionIndex=1 → "ma-(lai)-kat" +func applyElision(breakdown string, elisionIndex int) string { + parts := strings.Split(breakdown, "-") + if elisionIndex+1 > len(parts) { + return breakdown + } + + log.Println(elisionIndex, parts) + parts[elisionIndex] = "_" + parts[elisionIndex] + return strings.Join(parts, "-") +} + +// fallbackBreakdown is your own algorithm for words not in DB. +// Replace the body with your real implementation. 
+func fallbackBreakdown(word string) string { + return strings.Join(verse.SplitSyllable(word), "-") +} + +// ProcessSentence looks up each word in the DB, applies elision if marked, +// falls back to your algorithm for unknown words, and preserves casing/trailing symbols. +func ProcessSentence(db *sqlx.DB, sentence string) (Result, error) { + tokens := strings.Fields(sentence) + var breakdownParts []string + var notInDB []string + + for _, token := range tokens { + // 1. detect elision marker + raw, elided := isElided(token) + + // 2. split trailing punctuation/quotes + core, trail := splitTrailing(raw) + + pretrail := "" + if core[0] == '"' { + core = core[1:] + pretrail = "" + } + + // 3. preserve original casing for output; query in lowercase + lookup := strings.ToLower(core) + + // 4. query DB + bd, elisionIdx, found, err := queryWord(db, lookup, elided) + if err != nil { + return Result{}, err + } + + var wordBreakdown string + if found { + if elided && elisionIdx.Valid { + wordBreakdown = applyElision(bd, int(elisionIdx.Int64)) + } else { + wordBreakdown = bd + } + } else { + // not in DB — use fallback algorithm + wordBreakdown = fallbackBreakdown(lookup) + if elided { + wordBreakdown = "_" + wordBreakdown + lookup = "_" + lookup + } + notInDB = append(notInDB, lookup) // original casing, no trail + } + + res := syncCasing(core, wordBreakdown) + + log.Println(core, wordBreakdown) + + // 5. 
re-attach trailing symbols + breakdownParts = append(breakdownParts, pretrail+res+trail) + } + + return Result{ + Breakdown: strings.Join(breakdownParts, " "), + NotInDB: notInDB, + }, nil +} + +func syncCasing(original, hyphenated string) string { + var result strings.Builder + origRunes := []rune(original) + origIdx := 0 + + prefixes := map[int]string{} + + for _, char := range hyphenated { + if char == '-' { + result.WriteRune('-') + continue + } + + if char == '_' { + prefixes[origIdx] = "_" continue } - for _, w := range words { - syllable := lyric.SplitSyllable(w) - line = append(line, WordBreakdown{ - Word: w, - Breakdown: syllable, - }) + + if p, ok := prefixes[origIdx]; ok { + result.WriteRune(rune(p[0])) + } + // log.Println("char should be", string(origRunes[origIdx])) + // If the original character was uppercase, make this one uppercase + if origIdx < len(origRunes) && unicode.IsUpper(origRunes[origIdx]) { + result.WriteRune(unicode.ToUpper(origRunes[origIdx])) + } else { + result.WriteRune(unicode.ToLower(char)) } - result = append(result, line) + origIdx++ } - raw, _ := json.MarshalIndent(result, "", " ") - log.Println(string(raw)) + return result.String() +} + +// queryWord fetches breakdown and elision_index from DB. +// When elided=true it looks for the row WHERE elision_index IS NOT NULL, +// otherwise WHERE elision_index IS NULL. +func queryWord(db *sqlx.DB, word string, elided bool) (breakdown string, elisionIdx sql.NullInt64, found bool, err error) { + var query string + if elided { + query = ` + SELECT breakdown, elision_index + FROM syllable_breakdown + WHERE whole = ? AND elision_index IS NOT NULL + LIMIT 1` + } else { + query = ` + SELECT breakdown, elision_index + FROM syllable_breakdown + WHERE whole = ? 
AND elision_index IS NULL + LIMIT 1` + } + row := db.QueryRow(query, strings.ToLower(word)) + err = row.Scan(&breakdown, &elisionIdx) + if err == sql.ErrNoRows { + return "", sql.NullInt64{}, false, nil + } + if err != nil { + return "", sql.NullInt64{}, false, err + } + return breakdown, elisionIdx, true, nil }