export_source_strings.py (24975B)
1 #!/usr/bin/env python3 2 """ 3 Export US English (en-US) strings defined in tr! and tr_plural! macros in Rust code 4 by generating a main.ftl file that can be used for translating into other languages. 5 6 This script also creates a Psuedolocalized English (en-XA) main.ftl file with a given number of characters accented, 7 so that developers can easily detect which strings have been internationalized or not without needing to have 8 actual translations for a non-English language instead. 9 """ 10 11 import os 12 import re 13 import argparse 14 from pathlib import Path 15 from typing import Set, Dict, List, Tuple 16 import json 17 import collections 18 import hashlib 19 20 def find_rust_files(project_root: Path) -> List[Path]: 21 """Find all Rust files in the project.""" 22 rust_files = [] 23 for root, dirs, files in os.walk(project_root): 24 # Skip irrelevant directories 25 dirs[:] = [d for d in dirs if d not in ['target', '.git', '.cargo']] 26 27 for file in files: 28 # Find only Rust source files 29 if file.endswith('.rs'): 30 rust_files.append(Path(root) / file) 31 32 return rust_files 33 34 def strip_rust_comments(code: str) -> str: 35 """Remove // line comments, /* ... */ block comments, and doc comments (///, //!, //! ...) from Rust code.""" 36 # Remove block comments first 37 code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL) 38 # Remove line comments 39 code = re.sub(r'//.*', '', code) 40 # Remove doc comments (/// and //! at start of line) 41 code = re.sub(r'^\s*///.*$', '', code, flags=re.MULTILINE) 42 code = re.sub(r'^\s*//!.*$', '', code, flags=re.MULTILINE) 43 return code 44 45 def extract_tr_macros_with_lines(content: str, file_path: str) -> dict: 46 """Extract tr! macro calls from Rust code with comments and line numbers. Handles multi-line macros.""" 47 matches = [] 48 # Strip comments before processing 49 content = strip_rust_comments(content) 50 # Search the entire content for tr! macro calls (multi-line aware) 51 for macro_content in extract_macro_calls(content, 'tr!'): 52 args = parse_macro_arguments(macro_content) 53 if len(args) >= 3: # Must have at least message and comment 54 message = args[1].strip() 55 comment = args[2].strip() # Second argument is always the comment 56 # Validate placeholders 57 if not validate_placeholders(message, file_path): 58 continue 59 if not any(skip in message.lower() for skip in [ 60 '/', '\\', '.ftl', '.rs', 'http', 'https', 'www', '@', 61 'crates/', 'src/', 'target/', 'build.rs']): 62 # Find the line number where this macro starts 63 macro_start = f'tr!({macro_content}' 64 idx = content.find(macro_start) 65 line_num = content[:idx].count('\n') + 1 if idx != -1 else 1 66 matches.append((message, comment, line_num, file_path)) 67 return matches 68 69 def extract_tr_plural_macros_with_lines(content: str, file_path: str) -> dict: 70 """Extract tr_plural! macro calls from Rust code with new signature and correct keying, skipping macro definitions and doc comments.""" 71 matches = [] 72 # Skip macro definitions 73 if 'macro_rules! tr_plural' in content or file_path.endswith('i18n/mod.rs'): 74 return matches 75 for idx, macro_content in enumerate(extract_macro_calls(content, 'tr_plural!')): 76 args = parse_macro_arguments(macro_content) 77 if len(args) >= 5: 78 one = args[1].strip() 79 other = args[2].strip() 80 comment = args[3].strip() 81 key = other 82 if key and not key.startswith('//') and not key.startswith('$'): 83 matches.append((key, comment, idx + 1, file_path)) 84 return matches 85 86 def parse_macro_arguments(content: str) -> List[str]: 87 """Parse macro arguments, handling quoted strings, param = value pairs, commas, and inline comments.""" 88 # Remove all // comments 89 content = re.sub(r'//.*', '', content) 90 # Collapse all whitespace/newlines to a single space 91 content = re.sub(r'\s+', ' ', content.strip()) 92 args = [] 93 i = 0 94 n = len(content) 95 while i < n: 96 # Skip whitespace 97 while i < n and content[i].isspace(): 98 i += 1 99 if i >= n: 100 break 101 # Handle quoted strings 102 if content[i] in ['"', "'"]: 103 quote_char = content[i] 104 i += 1 105 arg_start = i 106 while i < n: 107 if content[i] == '\\' and i + 1 < n: 108 i += 2 109 elif content[i] == quote_char: 110 break 111 else: 112 i += 1 113 arg = content[arg_start:i] 114 args.append(arg) 115 i += 1 # Skip closing quote 116 else: 117 arg_start = i 118 paren_count = 0 119 brace_count = 0 120 while i < n: 121 char = content[i] 122 if char == '(': 123 paren_count += 1 124 elif char == ')': 125 paren_count -= 1 126 elif char == '{': 127 brace_count += 1 128 elif char == '}': 129 brace_count -= 1 130 elif char == ',' and paren_count == 0 and brace_count == 0: 131 break 132 i += 1 133 arg = content[arg_start:i].strip() 134 if arg: 135 args.append(arg) 136 # Skip the comma if we found one 137 if i < n and content[i] == ',': 138 i += 1 139 return args 140 141 def extract_macro_calls(content: str, macro_name: str): 142 """Extract all macro calls of the given macro_name from the entire content, handling parentheses inside quoted strings and multi-line macros.""" 143 calls = [] 144 idx = 0 145 macro_start = f'{macro_name}(' 146 content_len = len(content) 147 while idx < content_len: 148 start = content.find(macro_start, idx) 149 if start == -1: 150 break 151 i = start + len(macro_start) 152 paren_count = 1 # Start after the initial '(' 153 in_quote = False 154 quote_char = '' 155 macro_content = '' 156 while i < content_len: 157 c = content[i] 158 if in_quote: 159 macro_content += c 160 if c == quote_char and (i == 0 or content[i-1] != '\\'): 161 in_quote = False 162 else: 163 if c in ('"', "'"): 164 in_quote = True 165 quote_char = c 166 macro_content += c 167 elif c == '(': 168 paren_count += 1 169 macro_content += c 170 elif c == ')': 171 paren_count -= 1 172 if paren_count == 0: 173 break 174 else: 175 macro_content += c 176 else: 177 macro_content += c 178 i += 1 179 # Only add if we found a closing parenthesis 180 if i < content_len and content[i] == ')': 181 calls.append(macro_content) 182 idx = i + 1 183 else: 184 # Malformed macro, skip past this occurrence 185 idx = start + len(macro_start) 186 return calls 187 188 def validate_placeholders(message: str, file_path: str = "") -> bool: 189 """Validate that all placeholders in a message are named and start with a letter.""" 190 import re 191 192 # Find all placeholders in the message 193 placeholder_pattern = r'\{([^}]*)\}' 194 placeholders = re.findall(placeholder_pattern, message) 195 196 valid = True 197 for placeholder in placeholders: 198 if not placeholder.strip(): 199 print(f"[VALIDATE] Warning: Empty placeholder {{}} found in message: '{message[:100]}...' {file_path}") 200 valid = False 201 elif not placeholder[0].isalpha(): 202 print(f"[VALIDATE] Warning: Placeholder '{{{placeholder}}}' does not start with a letter in message: '{message[:100]}...' {file_path}") 203 valid = False 204 if not valid: 205 print(f"[VALIDATE] Message rejected: '{message}'") 206 return valid 207 208 def extract_tr_macros(content: str) -> List[Tuple[str, str]]: 209 """Extract tr! macro calls from Rust code with comments.""" 210 filtered_matches = [] 211 # Strip comments before processing 212 content = strip_rust_comments(content) 213 # Process the entire content instead of line by line to handle multi-line macros 214 for macro_content in extract_macro_calls(content, 'tr!'): 215 args = parse_macro_arguments(macro_content) 216 if len(args) >= 3: # Must have at least message and comment 217 message = args[1].strip() 218 comment = args[2].strip() # Second argument is always the comment 219 # Debug output for identification strings 220 if "identification" in comment.lower(): 221 print(f"[DEBUG] Found identification tr! macro: message='{message}', comment='{comment}', args={args}") 222 norm_key = normalize_key(message, comment) 223 print(f"[DEBUG] Normalized key: '{norm_key}'") 224 # Validate placeholders 225 if not validate_placeholders(message): 226 continue 227 # More specific filtering logic 228 should_skip = False 229 for skip in ['/', '.ftl', '.rs', 'http', 'https', 'www', 'crates/', 'src/', 'target/', 'build.rs']: 230 if skip in message.lower(): 231 should_skip = True 232 break 233 # Special handling for @ - only skip if it looks like an actual email address 234 if '@' in message and ( 235 # Skip if it's a short string that looks like an email 236 len(message) < 50 or 237 # Skip if it contains common email patterns 238 any(pattern in message.lower() for pattern in ['@gmail.com', '@yahoo.com', '@hotmail.com', '@outlook.com']) 239 ): 240 should_skip = True 241 if not should_skip: 242 # Store as (message, comment) tuple to preserve all combinations 243 filtered_matches.append((message, comment)) 244 return filtered_matches 245 246 def extract_tr_plural_macros(content: str, file_path: str = "") -> Dict[str, dict]: 247 """Extract tr_plural! macro calls from Rust code with new signature, skipping macro definitions and doc comments.""" 248 filtered_matches = {} 249 # Skip macro definitions 250 if 'macro_rules! tr_plural' in content or file_path.endswith('i18n/mod.rs'): 251 print(f"[DEBUG] Skipping macro definitions in {file_path}") 252 return filtered_matches 253 for macro_content in extract_macro_calls(content, 'tr_plural!'): 254 print(f"[DEBUG] Found tr_plural! macro in {file_path}: {macro_content}") 255 args = parse_macro_arguments(macro_content) 256 print(f"[DEBUG] Parsed args: {args}") 257 if len(args) >= 5: 258 one = args[1].strip() 259 other = args[2].strip() 260 comment = args[3].strip() 261 key = other 262 if key and not key.startswith('//') and not key.startswith('$'): 263 print(f"[DEBUG] Adding plural key '{key}' from {file_path}") 264 filtered_matches[key] = { 265 'one': one, 266 'other': other, 267 'comment': comment 268 } 269 return filtered_matches 270 271 def escape_rust_placeholders(text: str) -> str: 272 """Convert Rust-style placeholders to Fluent-style placeholders""" 273 # Unescape double quotes first 274 text = text.replace('\\"', '"') 275 # Convert Rust placeholders to Fluent placeholders 276 return re.sub(r'\{([a-zA-Z][a-zA-Z0-9_]*)\}', r'{$\1}', text) 277 278 def simple_hash(s: str) -> str: 279 """Simple hash function using MD5 - matches Rust implementation, 4 hex chars""" 280 return hashlib.md5(s.encode('utf-8')).hexdigest()[:4] 281 282 def normalize_key(message, comment=None): 283 """Normalize a message to create a consistent key - matches Rust normalize_ftl_key function""" 284 # Remove quotes and normalize 285 key = message.strip('"\'') 286 # Unescape double quotes 287 key = key.replace('\\"', '"') 288 # Replace each invalid character with exactly one underscore (allow hyphens and underscores) 289 key = re.sub(r'[^a-zA-Z0-9_-]', '_', key) 290 # Remove leading/trailing underscores 291 key = key.strip('_') 292 # Add 'k_' prefix if the result doesn't start with a letter (Fluent requirement) 293 if not (key and key[0].isalpha()): 294 key = "k_" + key 295 296 # If we have a comment, append a hash of it to reduce collisions 297 if comment: 298 # Create a hash of the comment and append it to the key 299 hash_str = f"_{simple_hash(comment)}" 300 key += hash_str 301 302 return key 303 304 def pseudolocalize(text: str) -> str: 305 """Convert English text to pseudolocalized text for testing.""" 306 # Common pseudolocalization patterns 307 replacements = { 308 'a': 'à', 'e': 'é', 'i': 'í', 'o': 'ó', 'u': 'ú', 309 'A': 'À', 'E': 'É', 'I': 'Í', 'O': 'Ó', 'U': 'Ú', 310 'n': 'ñ', 'N': 'Ñ', 'c': 'ç', 'C': 'Ç' 311 } 312 313 # First, protect Fluent placeables from pseudolocalization 314 placeable_pattern = r'\{ *\$[a-zA-Z][a-zA-Z0-9_]* *\}' 315 placeables = re.findall(placeable_pattern, text) 316 317 # Replace placeables with unique placeholders that won't be modified 318 protected_text = text 319 for i, placeable in enumerate(placeables): 320 placeholder = f"<<PLACEABLE_{i}>>" 321 protected_text = protected_text.replace(placeable, placeholder, 1) 322 323 # Apply character replacements, skipping <<PLACEABLE_n>> 324 result = '' 325 i = 0 326 while i < len(protected_text): 327 if protected_text.startswith('<<PLACEABLE_', i): 328 end = protected_text.find('>>', i) 329 if end != -1: 330 result += protected_text[i:end+2] 331 i = end + 2 332 continue 333 char = protected_text[i] 334 result += replacements.get(char, char) 335 i += 1 336 337 # Restore placeables 338 for i, placeable in enumerate(placeables): 339 placeholder = f"<<PLACEABLE_{i}>>" 340 result = result.replace(placeholder, placeable) 341 342 # Wrap pseudolocalized string with square brackets so that it can be distinguished from other strings 343 return f'{{"["}}{result}{{"]"}}' 344 345 def generate_ftl_content(tr_strings: Dict[str, str], 346 plural_strings: Dict[str, dict], 347 tr_occurrences: Dict[Tuple[str, str], list], 348 plural_occurrences: Dict[Tuple[str, str], list], 349 pseudolocalize_content: bool = False) -> str: 350 """Generate FTL file content from extracted strings with comments.""" 351 352 lines = [ 353 "# Main translation file for Notedeck", 354 "# This file contains common UI strings used throughout the application", 355 "# Auto-generated by extract_i18n.py - DO NOT EDIT MANUALLY", 356 "", 357 ] 358 359 # Sort strings for consistent output 360 sorted_tr = sorted(tr_strings.items(), key=lambda item: item[0].lower()) 361 sorted_plural = sorted(plural_strings.items(), key=lambda item: item[0].lower()) 362 363 # Add regular tr! strings 364 if sorted_tr: 365 lines.append("# Regular strings") 366 for norm_key, (original_message, comment) in sorted_tr: 367 lines.append("") 368 # Write the comment 369 if comment: 370 lines.append(f"# {comment}") 371 # Apply pseudolocalization if requested 372 value = escape_rust_placeholders(original_message) 373 value = pseudolocalize(value) if pseudolocalize_content else value 374 lines.append(f"{norm_key} = {value}") 375 lines.append("") 376 377 # Add pluralized strings 378 if sorted_plural: 379 lines.append("# Pluralized strings") 380 for key, data in sorted_plural: 381 lines.append("") 382 383 one = data['one'] 384 other = data['other'] 385 comment = data['comment'] 386 # Write comment 387 if comment: 388 lines.append(f"# {comment}") 389 norm_key = normalize_key(key, comment) 390 one_val = escape_rust_placeholders(one) 391 other_val = escape_rust_placeholders(other) 392 if pseudolocalize_content: 393 one_val = pseudolocalize(one_val) 394 other_val = pseudolocalize(other_val) 395 lines.append(f'{norm_key} =') 396 lines.append(f' {{ $count ->') 397 lines.append(f' [one] {one_val}') 398 lines.append(f' *[other] {other_val}') 399 lines.append(f' }}') 400 lines.append("") 401 402 return "\n".join(lines) 403 404 def read_existing_ftl(ftl_path: Path) -> Dict[str, str]: 405 """Read existing FTL file to preserve comments and custom translations.""" 406 if not ftl_path.exists(): 407 return {} 408 409 existing_translations = {} 410 with open(ftl_path, 'r', encoding='utf-8') as f: 411 content = f.read() 412 413 # Extract key-value pairs 414 pattern = r'^([^#\s][^=]*?)\s*=\s*(.+)$' 415 for line in content.split('\n'): 416 match = re.match(pattern, line.strip()) 417 if match: 418 key = match.group(1).strip() 419 value = match.group(2).strip() 420 # For existing FTL files, we need to handle keys that may have hash suffixes 421 # Strip the hash suffix if present (8 hex characters after underscore) 422 original_key = re.sub(r'_[0-9a-f]{8}$', '', key) 423 norm_key = normalize_key(original_key) 424 existing_translations[norm_key] = value 425 426 return existing_translations 427 428 def main(): 429 parser = argparse.ArgumentParser(description='Extract i18n macros and generate FTL file') 430 parser.add_argument('--project-root', type=str, default='.', 431 help='Project root directory (default: current directory)') 432 parser.add_argument('--dry-run', action='store_true', 433 help='Show what would be generated without writing to file') 434 parser.add_argument('--fail-on-collisions', action='store_true', 435 help='Exit with error if key collisions are detected') 436 437 args = parser.parse_args() 438 439 project_root = Path(args.project_root) 440 441 print(f"Scanning Rust files in {project_root}...") 442 443 # Find all Rust files 444 rust_files = find_rust_files(project_root) 445 print(f"Found {len(rust_files)} Rust files") 446 447 # Extract strings from all files 448 all_tr_strings = {} 449 all_plural_strings = {} 450 451 # Track normalized keys to detect actual key collisions 452 all_tr_normalized_keys = {} 453 all_plural_normalized_keys = {} 454 455 # Track collisions 456 tr_collisions = {} 457 plural_collisions = {} 458 459 # Track all occurrences for intra-file collision detection 460 tr_occurrences = collections.defaultdict(list) 461 plural_occurrences = collections.defaultdict(list) 462 463 for rust_file in rust_files: 464 try: 465 with open(rust_file, 'r', encoding='utf-8') as f: 466 content = f.read() 467 468 # For intra-file collision detection 469 tr_lines = extract_tr_macros_with_lines(content, str(rust_file)) 470 for key, comment, line, file_path in tr_lines: 471 tr_occurrences[(file_path, key)].append((comment, line)) 472 plural_lines = extract_tr_plural_macros_with_lines(content, str(rust_file)) 473 for key, comment, line, file_path in plural_lines: 474 plural_occurrences[(file_path, key)].append((comment, line)) 475 476 tr_strings = extract_tr_macros(content) 477 plural_strings = extract_tr_plural_macros(content, str(rust_file)) 478 479 if tr_strings or plural_strings: 480 print(f" {rust_file}: {len(tr_strings)} tr!, {len(plural_strings)} tr_plural!") 481 482 # Check for collisions in tr! strings using normalized keys 483 for message, comment in tr_strings: 484 norm_key = normalize_key(message, comment) 485 if norm_key in all_tr_normalized_keys: 486 # This is a real key collision (same normalized key) 487 if norm_key not in tr_collisions: 488 tr_collisions[norm_key] = [] 489 tr_collisions[norm_key].append((rust_file, all_tr_normalized_keys[norm_key])) 490 tr_collisions[norm_key].append((rust_file, comment)) 491 # Store by normalized key to preserve all unique combinations 492 all_tr_strings[norm_key] = (message, comment) 493 all_tr_normalized_keys[norm_key] = comment 494 495 # Check for collisions in plural strings using normalized keys 496 for key, data in plural_strings.items(): 497 comment = data['comment'] 498 norm_key = normalize_key(key, comment) 499 if norm_key in all_plural_normalized_keys: 500 # This is a real key collision (same normalized key) 501 if norm_key not in plural_collisions: 502 plural_collisions[norm_key] = [] 503 plural_collisions[norm_key].append((rust_file, all_plural_normalized_keys[norm_key])) 504 plural_collisions[norm_key].append((rust_file, data)) 505 all_plural_strings[key] = data 506 all_plural_normalized_keys[norm_key] = data 507 508 except Exception as e: 509 print(f"Error reading {rust_file}: {e}") 510 511 # Intra-file collision detection 512 has_intra_file_collisions = False 513 for (file_path, key), occurrences in tr_occurrences.items(): 514 comments = set(c for c, _ in occurrences) 515 if len(occurrences) > 1 and len(comments) > 1: 516 has_intra_file_collisions = True 517 print(f"\n⚠️ Intra-file key collision in {file_path} for '{key}':") 518 for comment, line in occurrences: 519 comment_text = f" (comment: '{comment}')" if comment else " (no comment)" 520 print(f" Line {line}{comment_text}") 521 for (file_path, key), occurrences in plural_occurrences.items(): 522 comments = set(c for c, _ in occurrences) 523 if len(occurrences) > 1 and len(comments) > 1: 524 has_intra_file_collisions = True 525 print(f"\n⚠️ Intra-file key collision in {file_path} for '{key}':") 526 for comment, line in occurrences: 527 comment_text = f" (comment: '{comment}')" if comment else " (no comment)" 528 print(f" Line {line}{comment_text}") 529 if has_intra_file_collisions and args.fail_on_collisions: 530 print(f"❌ Exiting due to intra-file key collisions (--fail-on-collisions flag)") 531 exit(1) 532 533 # Report collisions 534 has_collisions = False 535 536 if tr_collisions: 537 has_collisions = True 538 print(f"\n⚠️ Key collisions detected in tr! strings:") 539 for key, collisions in tr_collisions.items(): 540 print(f" '{key}':") 541 for file_path, comment in collisions: 542 comment_text = f" (comment: '{comment}')" if comment else " (no comment)" 543 print(f" {file_path}{comment_text}") 544 545 if plural_collisions: 546 has_collisions = True 547 print(f"\n⚠️ Key collisions detected in tr_plural! strings:") 548 for key, collisions in plural_collisions.items(): 549 print(f" '{key}':") 550 for file_path, comment in collisions: 551 comment_text = f" (comment: '{comment}')" if comment else " (no comment)" 552 print(f" {file_path}{comment_text}") 553 554 if has_collisions: 555 print(f"\n💡 Collision resolution: The last occurrence of each key will be used.") 556 if args.fail_on_collisions: 557 print(f"❌ Exiting due to key collisions (--fail-on-collisions flag)") 558 exit(1) 559 560 print(f"\nExtracted strings:") 561 print(f" Regular strings: {len(all_tr_strings)}") 562 print(f" Plural strings: {len(all_plural_strings)}") 563 564 # Debug: print all keys in all_tr_strings 565 print("[DEBUG] All tr! keys:") 566 for k in all_tr_strings.keys(): 567 print(f" {k}") 568 569 # Generate FTL content for both locales 570 locales = ['en-US', 'en-XA'] 571 572 for locale in locales: 573 pseudolocalize_content = (locale == 'en-XA') 574 ftl_content = generate_ftl_content(all_tr_strings, all_plural_strings, tr_occurrences, plural_occurrences, pseudolocalize_content) 575 output_path = Path(f'assets/translations/{locale}/main.ftl') 576 577 if args.dry_run: 578 print(f"\n--- Generated FTL content for {locale} ---") 579 print(ftl_content) 580 print(f"--- End of content for {locale} ---") 581 else: 582 # Ensure output directory exists 583 output_path.parent.mkdir(parents=True, exist_ok=True) 584 585 # Write to file 586 with open(output_path, 'w', encoding='utf-8') as f: 587 f.write(ftl_content) 588 589 print(f"\nGenerated FTL file: {output_path}") 590 591 if not args.dry_run: 592 print(f"\nTotal strings: {len(all_tr_strings) + len(all_plural_strings)}") 593 594 if __name__ == '__main__': 595 main()