Skip to content

Commit d257674

Browse files
Merge branch 'claude/japanese-vocab-matcher-011CUv6yn8vCn3LYkdUKvqVr'
2 parents 9e7eb03 + a8d4963 commit d257674

7 files changed

Lines changed: 1326 additions & 48 deletions

File tree

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Find and group synonym/alternative writing collocation pairs.
4+
5+
This script identifies noun pairs that are synonyms or alternative writings, such as:
6+
- 晩ご飯 vs 夕飯 (dinner)
7+
- 弁当 vs お弁当 (lunch box, honorific vs plain)
8+
- Alternative kanji/hiragana writings
9+
"""
10+
11+
import json
12+
from pathlib import Path
13+
from difflib import SequenceMatcher
14+
15+
def load_vocabulary():
    """Read the vocabulary list from the project's JSON data file.

    Returns:
        list[dict]: the entries stored under the top-level 'vocabulary' key.
    """
    vocab_path = Path("public/data/vocabulary.json")
    with vocab_path.open(encoding='utf-8') as f:
        return json.load(f)['vocabulary']
def calculate_similarity(str1, str2):
    """Return the difflib similarity ratio of two strings, compared case-insensitively."""
    left = str1.lower()
    right = str2.lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()
def remove_honorific_prefix(word):
    """Strip a leading honorific お or ご; return the word unchanged otherwise."""
    return word[1:] if word.startswith(('お', 'ご')) else word
def _strip_honorific(word):
    """Strip a leading honorific お or ご; return the word unchanged otherwise."""
    return word[1:] if word.startswith(('お', 'ご')) else word


def _gloss_similarity(a, b):
    """Case-insensitive difflib similarity ratio of two English glosses."""
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()


def _is_related(word1, word2):
    """Return True when two noun entries look like synonyms or alternative writings.

    Three rules, checked in order:
      1. honorific vs plain (お弁当 vs 弁当, or お弁当 vs ご弁当 sharing a base)
      2. near-identical English glosses (ratio >= 0.85)
      3. same reading, different kanji, with moderately similar glosses (>= 0.6)
    """
    w1_base = _strip_honorific(word1['japanese'])
    w2_base = _strip_honorific(word2['japanese'])

    # Rule 1a: both carry a prefix and share the same base word.
    if (w1_base == w2_base
            and w1_base != word1['japanese']
            and w2_base != word2['japanese']):
        return True
    # Rule 1b: one word is exactly the other's de-prefixed form.
    if w1_base == word2['japanese'] or w2_base == word1['japanese']:
        return True

    # Rule 2: near-identical English meanings.
    if _gloss_similarity(word1['english'], word2['english']) >= 0.85:
        return True

    # Rule 3: homophones written differently, with reasonably close glosses.
    if (word1['reading'] == word2['reading']
            and word1['japanese'] != word2['japanese']
            and _gloss_similarity(word1['english'], word2['english']) >= 0.6):
        return True

    return False


def _build_group(group):
    """Package a list of related word dicts into the output group record."""
    # All pairwise English-gloss similarities within the group.
    similarities = [
        _gloss_similarity(a['english'], b['english'])
        for j, a in enumerate(group)
        for b in group[j + 1:]
    ]
    avg_similarity = sum(similarities) / len(similarities) if similarities else 0
    return {
        'words': group,
        'count': len(group),
        'average_similarity': avg_similarity,
        'examples': [
            {
                'japanese': w['japanese'],
                'reading': w['reading'],
                'english': w['english'],
            }
            for w in group
        ],
    }


def find_synonym_groups(vocabulary):
    """
    Find groups of words that are synonyms or alternative writings.

    Only entries with type 'noun' are considered (collocations involve nouns).
    Pairs are linked by the rules documented on _is_related; each word is
    assigned to at most one group.

    Args:
        vocabulary: list of word dicts with 'japanese', 'reading',
            'english' and 'type' keys.

    Returns:
        List of group dicts with 'words', 'count', 'average_similarity'
        and 'examples' keys. Only groups of two or more words are reported.
    """
    # Filter only nouns (since collocations involve nouns)
    nouns = [w for w in vocabulary if w['type'] == 'noun']

    print(f"Analyzing {len(nouns)} nouns for synonym groups...")

    groups = []
    used_words = set()

    for i, word1 in enumerate(nouns):
        if word1['japanese'] in used_words:
            continue

        group = [word1]
        for word2 in nouns[i + 1:]:
            if word2['japanese'] in used_words:
                continue
            if _is_related(word1, word2):
                group.append(word2)
                used_words.add(word2['japanese'])

        if len(group) > 1:
            groups.append(_build_group(group))

        # Mark word1 handled unconditionally so a duplicate entry with the
        # same surface form is never processed twice.
        used_words.add(word1['japanese'])

    return groups
def main():
    """Find synonym groups, persist them as JSON, and print a short report."""
    vocabulary = load_vocabulary()
    groups = find_synonym_groups(vocabulary)

    # Largest groups first.
    groups.sort(key=lambda g: g['count'], reverse=True)

    total_in_groups = sum(g['count'] for g in groups)
    output = {
        "total_groups": len(groups),
        "total_words_in_groups": total_in_groups,
        "groups": groups,
    }

    # Persist the full result set for downstream scripts.
    output_path = Path("data-preparation/synonym_groups.json")
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    print(f"\n[OK] Found {len(groups)} synonym groups")
    print(f" Total words involved: {output['total_words_in_groups']}")
    print(f" Output saved to: {output_path}")

    # Show the ten biggest groups as a sanity check.
    print("\n=== Top 10 Synonym Groups ===")
    for rank, grp in enumerate(groups[:10], 1):
        print(f"\n{rank}. Group of {grp['count']} words (similarity: {grp['average_similarity']:.2%}):")
        for ex in grp['examples']:
            print(f" - {ex['japanese']} ({ex['reading']}) = {ex['english']}")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)