android_xml_translator/android_xml_translator.py at main · Heitezy/android_xml_translator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
#!/usr/bin/env python3
"""
Android strings.xml Translator

This script translates Android string resources from a strings.xml file
to multiple languages using free online translation services.
No API keys or authentication required.

Features:
- Respects translatable="false" attribute
- Handles string-array elements
- Handles plurals elements
- Preserves formatting placeholders like %s, %d, %1$s
- Preserves escape sequences like \n, \', \"
- Preserves regex patterns
- Multiple fallback translation services for reliability
- Optional transliteration instead of translation
- Parallel processing of multiple target languages
"""

import os
import re
import argparse
import html
import time
import random
import requests
import json
import xml.etree.ElementTree as ET
from urllib.parse import quote
import threading
import concurrent.futures

def extract_strings(xml_file):
    """Collect the translatable text entries of an Android strings.xml file.

    Args:
        xml_file: Path (or file object) accepted by ``ET.parse``.

    Returns:
        A dict mapping a synthetic key to the element text. Keys are
        ``string:<name>``, ``array:<name>:<index>`` and
        ``plurals:<name>:<quantity>``; entries are inserted in that order
        (all strings, then all array items, then all plural items).
        Elements (or their parent array/plurals) marked
        ``translatable="false"`` and items without text are left out.
    """
    root = ET.parse(xml_file).getroot()

    def is_translatable(element):
        # A missing attribute counts as translatable; the check ignores case.
        return element.get("translatable", "true").lower() != "false"

    collected = {}

    # Plain <string> entries.
    for elem in root.findall("string"):
        name = elem.get("name")
        if not (name and elem.text):
            continue
        if is_translatable(elem):
            collected[f"string:{name}"] = elem.text

    # <string-array> entries: the index counts every <item>, including
    # empty ones, so it matches the item's position in the file.
    for array in root.findall("string-array"):
        array_name = array.get("name")
        if not array_name or not is_translatable(array):
            continue
        collected.update(
            (f"array:{array_name}:{index}", item.text)
            for index, item in enumerate(array.findall("item"))
            if item.text
        )

    # <plurals> entries, keyed by their quantity attribute.
    for plurals in root.findall("plurals"):
        plurals_name = plurals.get("name")
        if not plurals_name or not is_translatable(plurals):
            continue
        for item in plurals.findall("item"):
            quantity = item.get("quantity")
            if quantity and item.text:
                collected[f"plurals:{plurals_name}:{quantity}"] = item.text

    return collected


# Sequences that must survive translation untouched: printf-style format
# specifiers (%s, %1$d), backslash escapes (quote, newline, tab, ...),
# \uXXXX escapes, bracketed groups ([...]) and brace placeholders
# ({0}, {name}).
_PLACEHOLDER_RE = re.compile(
    r'%([0-9]+\$)?[sdif]|%[sdif]|\\\'|\\"|\\\n|\\n|\\t|\\r|\\b|\\u[0-9a-fA-F]{4}|\[[^\]]*\]|\{\d+\}|\{[a-zA-Z_]+\}'
)

# Text made up solely of format specifiers / escape sequences.
_ONLY_SPECIFIERS_RE = re.compile(r'^([%\\][\w\'"\n$]+)+$')

# A format specifier glued to word characters on both sides.
_ATTACHED_SPECIFIER_RE = re.compile(r'(\w+)(%[0-9]*\$?[sdif])(\w+)')


def translate_text(text, source_lang, target_lang, transliterate=False):
    """Translate text while keeping placeholders and escape sequences intact.

    The text is cut into translatable segments and protected placeholders
    (each placeholder keeps at most one adjacent space on either side).
    Only the text segments are sent to ``_perform_translation``; the
    placeholders are re-inserted verbatim at their original positions.

    Args:
        text: Source string.
        source_lang: Source language code passed to the translation backend.
        target_lang: Target language code passed to the translation backend.
        transliterate: Forwarded to the backend to request transliteration.

    Returns:
        The translated string. Blank text and text consisting only of
        format specifiers / escape sequences is returned unchanged.
    """
    if not text.strip():
        return text

    # Nothing translatable if the text is only specifiers / escapes.
    if _ONLY_SPECIFIERS_RE.match(text.strip()):
        return text

    # 1. Locate every protected sequence together with the single space
    #    (if any) on each side of it.
    spans = []
    prev_end = 0
    for match in _PLACEHOLDER_RE.finditer(text):
        start, end = match.span()

        # Claim a leading space only if the previous placeholder has not
        # already claimed it as its trailing space; otherwise the two spans
        # overlap and the shared space is emitted twice.
        if start > prev_end and text[start - 1] == " ":
            start -= 1

        if end < len(text) and text[end] == " ":
            end += 1

        spans.append((start, end))
        prev_end = end

    # No protected sequences: translate the whole text in one go.
    if not spans:
        return _perform_translation(text, source_lang, target_lang, transliterate)

    # 2. Split into ('text', ...) and ('placeholder', ...) segments.
    segments = []
    last_end = 0
    for start, end in spans:
        if start > last_end:
            segments.append(('text', text[last_end:start]))
        segments.append(('placeholder', text[start:end]))
        last_end = end
    if last_end < len(text):
        segments.append(('text', text[last_end:]))

    # 3. Translate the text segments, batched into one request when possible.
    text_segments = [value for kind, value in segments if kind == 'text']
    translated_texts = []
    if text_segments:
        # Delimiter that is unlikely to appear in real text.
        delimiter = "⟐⟐⟐SPLIT⟐⟐⟐"
        translated_combined = _perform_translation(
            delimiter.join(text_segments), source_lang, target_lang, transliterate
        )
        translated_texts = translated_combined.split(delimiter)

        # The backend may have altered the delimiter; if the pieces no longer
        # line up, translate each segment on its own.
        if len(translated_texts) != len(text_segments):
            translated_texts = [
                _perform_translation(segment, source_lang, target_lang, transliterate)
                for segment in text_segments
            ]

    # 4. Reassemble: translated text in place of each text segment (falling
    #    back to the source segment if a translation is missing), original
    #    placeholders everywhere else.
    translated_iter = iter(translated_texts)
    parts = [
        next(translated_iter, value) if kind == 'text' else value
        for kind, value in segments
    ]
    result = "".join(parts)

    # Separate any format specifier that ended up glued between two words.
    return _ATTACHED_SPECIFIER_RE.sub(r'\1 \2 \3', result)


def _perform_translation(text, source_lang, target_lang, transliterate=False):
    """Actually perform the translation using Google Translate API"""
    if not text.strip():
        return text

    try:
        # Add delay to avoid rate limiting
        time.sleep(random.uniform(0.8, 2.0))

        # Use Google Translate without API key
        url = f"https://translate.googleapis.com/translate_a/single"

        params = {
            "client": "gtx",
            "sl": source_lang,
            "tl": target_lang,
            "q": text
        }

        # For transliteration, we need several data types
        if transliterate:
            # dt=t: translation
            # dt=rm: transliteration
            params["dt"] = ["t", "rm"]
        else:
            params["dt"] = "t"

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

        response = requests.get(url, params=params, headers=headers)
        response.raise_for_status()

        # Parse the JSON response
        result = response.json()

        # Extract transliteration or translation from response
        if transliterate:
            # First, get the standard translation as fallback
            translation = ""
            for sentence in result[0]:
                if sentence and len(sentence) > 0 and sentence[0]:
                    translation += sentence[0]

            # For Crimean Tatar (crh) and other languages with Latin transliteration in specific position
            latin_transliteration = ""

            # Based on the debug output, the Latin transliteration is in result[0][i][2]
            # where i is the index of each sentence segment
            for i, sentence_data in enumerate(result[0]):
                if sentence_data and len(sentence_data) > 2 and sentence_data[2]:
                    latin_transliteration += sentence_data[2]

            # If we found a Latin transliteration, use it
            if latin_transliteration:
                return latin_transliteration

            # If no transliteration found in the expected position, fall back to other methods
            if not latin_transliteration:
                # Try other positions in the structure
                if len(result) >= 2 and result[1]:
                    for entry in result[1]:
                        if entry and len(entry) > 2 and entry[2]:
                            latin_transliteration += entry[2]

            # If we found a transliteration with any method, use it; otherwise return the translation
            if latin_transliteration:
                return latin_transliteration
            else:
                return translation
        else:
            # Normal translation
            translation = ""
            for sentence in result[0]:
                if sentence and len(sentence) > 0 and sentence[0]:
                    translation += sentence[0]
            return translation

    except requests.exceptions.RequestException as e:
        print(f"Translation error: {e}")
        # Fallback to another service if the first one fails
        return _fallback_translate(text, source_lang, target_lang, transliterate)


def _fallback_translate(text, source_lang, target_lang, transliterate=False):
    """Translate text with the fallback services: DeepL first, then MyMemory.

    Transliteration is not available here; when it is requested a warning
    is printed and a regular translation is attempted.

    Args:
        text: Text to translate.
        source_lang: Source language code.
        target_lang: Target language code.
        transliterate: Only used to decide whether to print the warning.

    Returns:
        The translated text, or the original ``text`` when both services
        fail. Never raises for service errors (best effort).
    """
    if transliterate:
        print("Warning: Transliteration not supported by fallback services. Attempting regular translation.")

    # Seconds to wait on each HTTP call; without it a stalled connection
    # would block the worker thread indefinitely.
    timeout = 15

    try:
        # Add delay to avoid rate limiting
        time.sleep(random.uniform(1.5, 3.0))

        # DeepL uses slightly different language codes
        deepl_lang_codes = {
            'en': 'EN',
            'es': 'ES',
            'fr': 'FR',
            'de': 'DE',
            'it': 'IT',
            'pt': 'PT',
            'ru': 'RU',
            'ja': 'JA',
            'zh': 'ZH',
            'nl': 'NL',
            'pl': 'PL',
            # Add more as needed
        }

        src = deepl_lang_codes.get(source_lang, source_lang.upper())
        tgt = deepl_lang_codes.get(target_lang, target_lang.upper())

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept-Language': 'en-US,en;q=0.9',
            'Referer': 'https://www.deepl.com/',
            'Origin': 'https://www.deepl.com'
        }

        # The session is closed on exit so its pooled connections are
        # released even when a request fails.
        with requests.Session() as session:
            # Load the translator page first so the session holds its cookies.
            session.get("https://www.deepl.com/translator", headers=headers, timeout=timeout)

            payload = {
                "jsonrpc": "2.0",
                "method": "LMT_handle_texts",
                "params": {
                    "texts": [{"text": text}],
                    "lang": {
                        "source_lang_user_selected": src,
                        "target_lang": tgt
                    },
                    "timestamp": int(time.time() * 1000)
                },
                # Random request ID
                "id": random.randint(1000000, 9999999)
            }

            response = session.post(
                "https://www2.deepl.com/jsonrpc",
                json=payload,
                headers=headers,
                timeout=timeout,
            )
            response.raise_for_status()
            response_json = response.json()

        if "result" in response_json and "texts" in response_json["result"]:
            return response_json["result"]["texts"][0]["text"]

        print("DeepL fallback translation failed. Trying MyMemory...")
        raise RuntimeError("DeepL failed")

    except Exception as e:
        # Deliberately broad: any DeepL failure just moves on to MyMemory.
        print(f"Fallback translation error: {e}")

    try:
        # MyMemory translation API (free tier)
        time.sleep(random.uniform(1.0, 2.0))
        url = f"https://api.mymemory.translated.net/get?q={quote(text)}&langpair={source_lang}|{target_lang}"
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        result = response.json()

        # MyMemory reports errors in the JSON body (responseStatus other than
        # 200) and puts the error message into translatedText; that message
        # must not be returned as a translation.
        status = result.get("responseStatus")
        if status is not None and str(status) != "200":
            raise RuntimeError(f"MyMemory returned status {status}")

        # responseData / translatedText may be null; never hand None back to
        # callers that expect a string.
        translation = (result.get("responseData") or {}).get("translatedText")
        return translation or text
    except Exception as e2:
        # Deliberately broad: this is the last resort, keep the source text.
        print(f"MyMemory fallback translation error: {e2}")
        return text  # Return original text if all translation attempts fail


# Namespaces commonly declared in Android resource files. Registering them
# makes ElementTree write the conventional prefixes (tools:, xliff:) instead
# of auto-generated ones (ns0:, ns1:) when serializing the translated tree.
_ANDROID_XML_NAMESPACES = {
    "tools": "http://schemas.android.com/tools",
    "xliff": "urn:oasis:names:tc:xliff:document:1.2",
}
for _prefix, _uri in _ANDROID_XML_NAMESPACES.items():
    ET.register_namespace(_prefix, _uri)


def create_translated_xml(original_file, strings_dict, target_lang):
    """Write a copy of the original XML with translated texts substituted.

    Args:
        original_file: Path of the source strings.xml; it is re-parsed and
            used as the template for the output.
        strings_dict: Mapping of ``string:<name>``, ``array:<name>:<index>``
            and ``plurals:<name>:<quantity>`` keys to translated text.
            Elements without a matching key keep their original text.
        target_lang: Suffix used in the output file name.

    Returns:
        Path of the written file, ``strings-<target_lang>.xml`` in the same
        directory as ``original_file``.
    """
    tree = ET.parse(original_file)
    root = tree.getroot()

    # Regular strings
    for string_elem in root.findall("string"):
        key = f"string:{string_elem.get('name')}"
        if key in strings_dict:
            string_elem.text = strings_dict[key]

    # String arrays: items are addressed by their position in the array.
    for array_elem in root.findall("string-array"):
        array_name = array_elem.get("name")
        for index, item_elem in enumerate(array_elem.findall("item")):
            key = f"array:{array_name}:{index}"
            if key in strings_dict:
                item_elem.text = strings_dict[key]

    # Plurals: items are addressed by their quantity attribute.
    for plurals_elem in root.findall("plurals"):
        plurals_name = plurals_elem.get("name")
        for item_elem in plurals_elem.findall("item"):
            key = f"plurals:{plurals_name}:{item_elem.get('quantity')}"
            if key in strings_dict:
                item_elem.text = strings_dict[key]

    # The translated file is written next to the original.
    translated_file = os.path.join(
        os.path.dirname(original_file), f"strings-{target_lang}.xml"
    )
    tree.write(translated_file, encoding='utf-8', xml_declaration=True)
    return translated_file


def translate_strings_for_language(strings, source_lang, target_lang, transliterate=False):
    """Run every extracted string through ``translate_text`` for one language.

    Args:
        strings: Mapping of ``string:``/``array:``/``plurals:`` keys to text.
        source_lang: Source language code.
        target_lang: Target language code.
        transliterate: Request transliteration instead of translation.

    Returns:
        A new dict with the same keys (in the same order) mapped to the
        translated text. A progress line is printed for every tenth entry
        and for the last one.
    """
    verb = "Transliterating" if transliterate else "Translating"
    activity = "transliteration" if transliterate else "translation"
    print(f"Starting {activity} from {source_lang} to {target_lang}...")

    total = len(strings)
    results = {}

    for position, (key, text) in enumerate(strings.items(), 1):
        # Human-readable description of the entry, derived from its key.
        description = None
        if key.startswith("string:"):
            description = f"string ({position}/{total}): " + key.split(":", 1)[1]
        elif key.startswith("array:"):
            pieces = key.split(":", 2)
            description = f"array item ({position}/{total}): {pieces[1]}[{pieces[2]}]"
        elif key.startswith("plurals:"):
            pieces = key.split(":", 2)
            description = f"plural item ({position}/{total}): {pieces[1]}[{pieces[2]}]"

        # Report only every 10th entry and the final one.
        progress_due = position % 10 == 0 or position == total
        if progress_due and description is not None:
            print(f"[{target_lang}] {verb} {description}")

        results[key] = translate_text(text, source_lang, target_lang, transliterate)

    return results

def process_language(input_file, source_lang, target_lang, strings, transliterate=False):
    """Translate all strings into one target language and write its XML file.

    Args:
        input_file: Path of the source strings.xml, used as output template.
        source_lang: Source language code.
        target_lang: Target language code.
        strings: Extracted strings keyed by ``string:``/``array:``/``plurals:``.
        transliterate: Transliterate instead of translate; the output file
            suffix is then prefixed with ``translit-``.

    Returns:
        A statistics dict with the target language, per-kind counts, the
        total number of elements and the output file path.
    """
    translated = translate_strings_for_language(strings, source_lang, target_lang, transliterate)

    # Transliterated output gets its own file name so it does not clash with
    # a regular translation for the same language.
    suffix = "translit-" + target_lang if transliterate else target_lang
    output_file = create_translated_xml(input_file, translated, suffix)

    action = "Transliteration" if transliterate else "Translation"
    print(f"✓ {action} to {target_lang} completed! File saved as: {output_file}")

    # Group the keys by kind once, then derive the counts from the groups.
    string_keys = [key for key in strings if key.startswith("string:")]
    array_keys = [key for key in strings if key.startswith("array:")]
    plural_keys = [key for key in strings if key.startswith("plurals:")]

    # Distinct array / plurals names are the second component of the key.
    array_names = {key.split(":", 2)[1] for key in array_keys}
    plural_names = {key.split(":", 2)[1] for key in plural_keys}

    return {
        "target_lang": target_lang,
        "string_count": len(string_keys),
        "array_count": len(array_names),
        "array_items_count": len(array_keys),
        "plurals_count": len(plural_names),
        "plurals_items_count": len(plural_keys),
        "total_elements": len(strings),
        "output_file": output_file
    }

def main():
    """Command-line entry point: translate one strings.xml into several languages.

    Parses the arguments, extracts the translatable strings, processes each
    distinct target language in a thread pool and prints a per-language
    summary. Returns ``None``; problems are reported on stdout.
    """
    parser = argparse.ArgumentParser(description='Translate Android strings.xml to multiple languages')
    parser.add_argument('input_file', help='Path to the original strings.xml file')
    parser.add_argument('source_lang', help='Source language code (e.g., en)')
    parser.add_argument('target_langs', nargs='+', help='One or more target language codes (e.g., fr es de)')
    # NOTE(review): --preserve is accepted but nothing in this function reads
    # args.preserve; kept so existing command lines continue to parse.
    parser.add_argument('--preserve', action='store_true', help='Preserve untranslated strings')
    parser.add_argument('--transliterate', action='store_true', help='Use transliteration instead of translation')
    parser.add_argument('--max-workers', type=int, default=3, help='Maximum number of parallel translation workers (default: 3)')
    args = parser.parse_args()

    if not os.path.isfile(args.input_file):
        print(f"Error: Input file '{args.input_file}' not found.")
        return

    print(f"Extracting strings from {args.input_file}...")
    try:
        strings = extract_strings(args.input_file)
    except ET.ParseError as e:
        # Report malformed XML as a readable message instead of a traceback.
        print(f"Error: Could not parse '{args.input_file}': {e}")
        return
    print(f"Found {len(strings)} translatable strings to process.")

    # Drop repeated language codes (keeping first-seen order): two workers
    # for the same language would write the same output file concurrently.
    target_langs = list(dict.fromkeys(args.target_langs))

    # Show summary of work to be done
    print(f"\nPreparing to process {len(target_langs)} target languages:")
    action = "Transliterating" if args.transliterate else "Translating"
    for lang in target_langs:
        print(f"- {action} from {args.source_lang} to {lang}")

    print("\nStarting parallel processing...")

    # ThreadPoolExecutor rejects max_workers <= 0, so clamp to at least one.
    max_workers = max(1, min(args.max_workers, len(target_langs)))
    results = []
    failed = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit one task per target language
        future_to_lang = {
            executor.submit(
                process_language,
                args.input_file,
                args.source_lang,
                target_lang,
                strings,
                args.transliterate
            ): target_lang for target_lang in target_langs
        }

        # Collect results as they complete; a failure in one language does
        # not stop the others.
        for future in concurrent.futures.as_completed(future_to_lang):
            target_lang = future_to_lang[future]
            try:
                results.append(future.result())
            except Exception as e:
                failed.append(target_lang)
                print(f"Error processing {target_lang}: {e}")

    # Print final summary
    print("\n=== Translation Summary ===")
    for result in sorted(results, key=lambda x: x["target_lang"]):
        lang = result["target_lang"]
        print(f"\n{lang.upper()} ({result['output_file']}):")
        print(f"- Regular strings: {result['string_count']}")
        print(f"- String arrays: {result['array_count']} (with {result['array_items_count']} items)")
        print(f"- Plurals: {result['plurals_count']} (with {result['plurals_items_count']} items)")
        print(f"- Total processed elements: {result['total_elements']}")

    # Only claim success when every language actually finished.
    if failed:
        print(
            f"\nFinished with errors: {len(failed)} of {len(target_langs)} "
            f"languages failed ({', '.join(sorted(failed))})."
        )
    else:
        print("\nAll translations completed successfully!")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()