|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Find and group synonym/alternative writing collocation pairs. |
| 4 | +
|
| 5 | +This script identifies noun pairs that are synonyms or alternative writings, such as: |
| 6 | +- 晩ご飯 vs 夕飯 (dinner) |
| 7 | +- 弁当 vs お弁当 (lunch box, honorific vs plain) |
| 8 | +- Alternative kanji/hiragana writings |
| 9 | +""" |
| 10 | + |
| 11 | +import json |
| 12 | +from pathlib import Path |
| 13 | +from difflib import SequenceMatcher |
| 14 | + |
def load_vocabulary(vocab_path="public/data/vocabulary.json"):
    """Load the vocabulary word list from a JSON file.

    Args:
        vocab_path: Path (str or Path) to the vocabulary JSON file.
            Defaults to the project's standard location, so existing
            callers are unaffected.

    Returns:
        The list of word entries stored under the file's top-level
        'vocabulary' key.

    Raises:
        FileNotFoundError: If the file does not exist.
        KeyError: If the JSON lacks a 'vocabulary' key.
    """
    path = Path(vocab_path)
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data['vocabulary']
| 21 | + |
def calculate_similarity(str1, str2):
    """Return the case-insensitive similarity of two strings as a ratio in [0.0, 1.0]."""
    left = str1.lower()
    right = str2.lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()
| 25 | + |
def remove_honorific_prefix(word):
    """Strip a leading honorific prefix (お or ご) from a word, if present.

    Args:
        word: A Japanese word as a string.

    Returns:
        The word without its leading お/ご, or the word unchanged when it
        has no honorific prefix (including the empty string).
    """
    # Both honorific prefixes are exactly one character, so the two
    # original branches collapse into a single tuple-startswith check.
    if word.startswith(('お', 'ご')):
        return word[1:]
    return word
| 33 | + |
def _classify_relationship(word1, word2):
    """Classify how two noun entries are related, or return None.

    Checks, in priority order (matching the original inline logic):
      1. Honorific vs plain form (e.g. お弁当 vs 弁当).
      2. Highly similar English glosses (threshold 0.85).
      3. Same reading with different kanji, backed by a moderately
         similar English gloss (threshold 0.6).

    Args:
        word1, word2: Vocabulary entry dicts with at least the keys
            'japanese', 'reading', and 'english'.

    Returns:
        A relationship-type string ('honorific_vs_plain',
        'english_synonym', 'same_reading_different_kanji') or None when
        the two entries are unrelated.
    """
    w1_no_hon = remove_honorific_prefix(word1['japanese'])
    w2_no_hon = remove_honorific_prefix(word2['japanese'])

    # 1. Both words carry an honorific prefix over the same stem...
    if (w1_no_hon == w2_no_hon
            and w1_no_hon != word1['japanese']
            and w2_no_hon != word2['japanese']):
        return "honorific_vs_plain"
    # ...or one word is exactly the other minus its honorific prefix.
    if w1_no_hon == word2['japanese'] or w2_no_hon == word1['japanese']:
        return "honorific_vs_plain"

    # 2. Very similar English meanings (high similarity threshold).
    if calculate_similarity(word1['english'], word2['english']) >= 0.85:
        return "english_synonym"

    # 3. Same reading, different kanji — require a moderately similar
    # English gloss to avoid pairing unrelated homophones.
    if (word1['reading'] == word2['reading']
            and word1['japanese'] != word2['japanese']
            and calculate_similarity(word1['english'], word2['english']) >= 0.6):
        return "same_reading_different_kanji"

    return None


def find_synonym_groups(vocabulary):
    """Find groups of nouns that are synonyms or alternative writings.

    Greedy single pass: each unused noun seeds a group and absorbs every
    later unused noun related to it (see _classify_relationship). A noun
    placed in a group never seeds or joins another one.

    Args:
        vocabulary: List of word-entry dicts; only entries with
            type == 'noun' are considered.

    Returns:
        List of group dicts with keys 'words', 'count',
        'average_similarity' (mean pairwise English-gloss similarity
        within the group), and 'examples' (japanese/reading/english
        triples for each member).
    """
    # Filter only nouns (since collocations involve nouns).
    nouns = [w for w in vocabulary if w['type'] == 'noun']

    print(f"Analyzing {len(nouns)} nouns for synonym groups...")

    groups = []
    used_words = set()

    for i, word1 in enumerate(nouns):
        if word1['japanese'] in used_words:
            continue

        group = [word1]

        # Only look ahead: earlier nouns have already had their chance
        # to claim word1 into their own group.
        for word2 in nouns[i + 1:]:
            if word2['japanese'] in used_words:
                continue
            if _classify_relationship(word1, word2):
                group.append(word2)
                used_words.add(word2['japanese'])

        if len(group) > 1:
            # Mean of all pairwise English-gloss similarities in the group.
            similarities = [
                calculate_similarity(w1['english'], w2['english'])
                for j, w1 in enumerate(group)
                for w2 in group[j + 1:]
            ]
            avg_similarity = sum(similarities) / len(similarities) if similarities else 0

            groups.append({
                'words': group,
                'count': len(group),
                'average_similarity': avg_similarity,
                'examples': [
                    {
                        'japanese': w['japanese'],
                        'reading': w['reading'],
                        'english': w['english']
                    } for w in group
                ]
            })
        used_words.add(word1['japanese'])

    return groups
| 120 | + |
def main():
    """Load the vocabulary, detect synonym groups, and write them to JSON.

    Side effects: reads the vocabulary file, writes
    data-preparation/synonym_groups.json, and prints a summary report.
    """
    vocabulary = load_vocabulary()

    groups = find_synonym_groups(vocabulary)

    # Sort by group size (largest first) so the report leads with the
    # most significant clusters.
    groups.sort(key=lambda g: g['count'], reverse=True)

    # Prepare output
    output = {
        "total_groups": len(groups),
        "total_words_in_groups": sum(g['count'] for g in groups),
        "groups": groups
    }

    # Save to JSON; create the output directory first so a fresh
    # checkout without data-preparation/ does not crash on open().
    output_path = Path("data-preparation/synonym_groups.json")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    print(f"\n[OK] Found {len(groups)} synonym groups")
    print(f"  Total words involved: {output['total_words_in_groups']}")
    print(f"  Output saved to: {output_path}")

    # Print some examples
    print("\n=== Top 10 Synonym Groups ===")
    for i, group in enumerate(groups[:10], 1):
        print(f"\n{i}. Group of {group['count']} words (similarity: {group['average_similarity']:.2%}):")
        for example in group['examples']:
            print(f"  - {example['japanese']} ({example['reading']}) = {example['english']}")
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
0 commit comments