Skip to content

Commit 1d4f22d

Browse files
committed
Implement multi-language mapping
1 parent 5a37def commit 1d4f22d

3 files changed

Lines changed: 406 additions & 48 deletions

File tree

lib/interscript.rb

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
# frozen_string_literal: true
2+
3+
require "interscript/mapping"
4+
5+
# Transliteration
6+
module Interscript
7+
# Error class for an invalid transliteration system. It is not raised by the
# code visible in this file (map_resolve raises ArgumentError instead);
# presumably raised elsewhere in the library — TODO confirm.
class InvalidSystemError < StandardError; end
8+
# Error class for an external process that is not recognized. It is not raised
# by the code visible in this file; presumably raised by the external
# processing step, which is defined elsewhere — TODO confirm.
class ExternalProcessNotRecognizedError < StandardError; end
9+
# Error class for an external process that is unavailable. It is not raised by
# the code visible in this file; presumably raised by the external processing
# step, which is defined elsewhere — TODO confirm.
class ExternalProcessUnavailableError < StandardError; end
10+
11+
if RUBY_ENGINE == 'opal'
12+
require "interscript/opal"
13+
extend Opal
14+
else
15+
require "interscript/fs"
16+
extend Fs
17+
end
18+
19+
class << self
20+
21+
# Transliterates +string+ with the mapping identified by +system_code+.
#
# Processing order:
# 1. resolve +system_code+ through map_resolve and fetch the mapping
#    (from +maps+ when already present, otherwise via Interscript::Mapping.for);
# 2. run every system listed in the mapping's +chain+ over the text;
# 3. hand the text to external_processing;
# 4. replace the longest dictionary entries found through the mapping's trie;
# 5. apply +rules+ (filtered by language), then the character map, then
#    +postrules+;
# 6. apply title casing / separator clean-up and Unicode-normalize the result.
#
# @param system_code [String] map name or alias
# @param string [String, nil] text to transliterate
# @param maps [Hash] cache of mappings keyed by resolved system code; a
#   mapping loaded here is stored into this hash so callers can reuse it
# @param options [Hash] +:language+ overrides the mapping's own language when
#   filtering rules; not forwarded to chained systems
# @return [String, nil] the transliterated text, or nil when the working text
#   is nil after processing
# @raise [ArgumentError] (from map_resolve) when the map does not exist
def transliterate(system_code, string, maps={}, options={})
  system_code = map_resolve(system_code)
  maps[system_code] = Interscript::Mapping.for(system_code) unless maps.key?(system_code)
  mapping = maps[system_code]

  # First, apply chained transliteration as specified in the list `chain`.
  mapping.chain.each do |chained_system|
    string = transliterate(chained_system, string, maps)
  end

  # Then, apply the rest of the map.
  separator = mapping.character_separator || ""
  word_separator = mapping.word_separator || ""
  title_case = mapping.title_case
  downcase = mapping.downcase

  charmap = mapping.characters_hash
  dictmap = mapping.dictionary_hash
  trie = mapping.dictionary_trie
  language = options[:language] || mapping.language

  string = external_processing(mapping, string)

  # Dictionary pass: at each position take the longest substring the trie
  # knows as a complete word and replace it with its dictionary entry.
  pos = 0
  while pos < string.to_s.size
    wordmatch = ""
    m = 0
    while pos + m < string.to_s.size
      candidate = string[pos..pos + m]
      break unless trie.partial_word?(candidate)

      wordmatch = candidate if trie.word?(candidate)
      m += 1
    end

    if wordmatch.empty?
      pos += 1
    else
      repl = dictmap[wordmatch]
      string = sub_replace(string, pos, wordmatch.length, repl)
      # Continue after the inserted text so it is not matched again.
      pos += repl.length
    end
  end

  output = string.clone

  mapping.rules.each do |r|
    next unless r["language"].nil? || r["language"].include?(language)
    next unless output

    output = output.gsub(mkregexp(r["pattern"]), r["result"])
  end

  charmap.each do |k, v|
    re = mkregexp(k)

    # If more than one alternative is listed, choose the first one. This must
    # happen before any case conversion: an Array does not respond to #upcase.
    replacement = v.is_a?(Array) ? v[0] : v

    # NOTE: every iteration searches the whole of `output` again, so the loop
    # only ends once the text no longer matches `re`.
    while (match = output&.match(re))
      pos = match.offset(0).first
      result = !downcase && up_case_around?(output, pos) ? replacement.upcase : replacement

      output = sub_replace(
        output,
        pos,
        match[0].size,
        add_separator(separator, pos, result)
      )
    end
  end

  mapping.postrules.each do |r|
    next unless output

    re = mkregexp(r["pattern"])
    output = if r["result"] == "upcase"
               output.gsub(re, &:upcase)
             else
               output.gsub(re, r["result"])
             end
  end

  return unless output

  output = output.gsub(mkregexp('^(.)'), &:upcase) if title_case

  if word_separator != ''
    # Drop the character separator that directly follows a word separator.
    output = output.gsub(mkregexp("#{word_separator}#{separator}"), word_separator)
    output = output.gsub(mkregexp("#{word_separator}(.)"), &:upcase) if title_case
  end

  output.unicode_normalize
end
124+
125+
# Resolves a map name or alias to the name of an existing map.
#
# @param map [String] map name, or an alias listed in +aliases+
# @return [String] the alias target when +map+ is an alias, otherwise +map+
# @raise [ArgumentError] when the resolved name fails +map_exist?+
def map_resolve(map)
  resolved = aliases.key?(map) ? aliases[map] : map
  return resolved if map_exist?(resolved)

  raise ArgumentError, "Map #{resolved} doesn't exist"
end
130+
131+
private
132+
133+
# Prefixes +result+ with +separator+, except at the very start of the text.
#
# @param separator [String] character separator to insert
# @param pos [Integer] position of the replacement in the output
# @param result [String] replacement text
# @return [String] +result+ unchanged when +pos+ is 0, otherwise
#   +separator+ followed by +result+
def add_separator(separator, pos, result)
  return result if pos == 0

  separator + result
end
136+
137+
# Reports whether the character at +pos+ is upper case and has an upper-case
# neighbour. transliterate uses this to decide whether a replacement should be
# upcased as a whole.
#
# The neighbour on each side is found by skipping characters that do not match
# [[:alpha:]]; the scan stops at the first or last index of the string even if
# the character there is not a letter. A neighbour counts as upper case when,
# after stripping whitespace, it is non-empty and equal to its own upcase, so
# a digit or punctuation mark at the edge of the string also counts.
#
# @param string [String] text being transliterated
# @param pos [Integer] index of the character under inspection
# @return [Boolean] false when the character is missing or unchanged by
#   downcase; otherwise true when either neighbour counts as upper case
def up_case_around?(string, pos)
  char = string[pos]
  return false if char.nil? || char == char.downcase

  # Built once; it does not change between loop iterations.
  alpha = mkregexp('[[:alpha:]]')

  i = pos - 1
  i -= 1 while i.positive? && string[i] !~ alpha
  before = i >= 0 ? string[i].to_s.strip : ''

  i = pos + 1
  i += 1 while i < string.size - 1 && string[i] !~ alpha
  after = string[i].to_s.strip

  before_uc = !before.empty? && before == before.upcase
  after_uc = !after.empty? && after == after.upcase
  # before_uc && (after.empty? || after_uc) || after_uc && (before.empty? || before_uc)
  before_uc || after_uc
end
153+
154+
end
155+
end

0 commit comments

Comments
 (0)