import re
from typing import List, Optional

import wordninja

from utility.data import POKEMON_PROPER_NOUNS


class PokemonWordNinja:
    """Split run-together text into words while keeping custom words intact.

    Custom words (``POKEMON_PROPER_NOUNS`` plus any caller-supplied words)
    are matched case-insensitively as whole words and emitted with the
    capitalization they were registered with. Everything between those
    matches is split on whitespace and each token is handed to
    ``wordninja.split``.

    When several custom words could match, the leftmost match wins; among
    candidates starting at the same position, the longest registered word is
    tried first.
    """

    # Hyphens in the input are turned into spaces before matching, so any run
    # of hyphens/whitespace inside a custom word must match whitespace.
    _SEPARATORS = re.compile(r"[\s-]+")

    def __init__(self, custom_word_list: Optional[List[str]] = None):
        """Register the built-in proper nouns and any extra custom words.

        Args:
            custom_word_list: Optional additional words to protect from
                splitting. ``None`` or an empty list adds nothing.
        """
        # Copy into a fresh set: accepts any iterable and never aliases the
        # shared module-level collection.
        custom_words = set(POKEMON_PROPER_NOUNS)
        if custom_word_list:
            custom_words.update(custom_word_list)

        self.custom_words = []
        # NOTE: the two placeholder maps are retained as public attributes for
        # backward compatibility; split() does not use them.
        self.placeholder_map = {}
        self.word_to_placeholder_map = {}
        self._matcher = None
        self._matcher_words = []
        self._matcher_stale = True

        for word in custom_words:
            self._register(word)
        self.custom_words = sorted(self.custom_words, key=self._sort_key)

    @staticmethod
    def _sort_key(word: str):
        """Order words longest first, alphabetically among equal lengths."""
        return (-len(word), word)

    def _register(self, word: str) -> bool:
        """Record *word* as a custom word; return False if it was skipped."""
        # An empty word would compile to a pattern that matches at every word
        # boundary, and a duplicate would only bloat the pattern.
        if not word or word in self.word_to_placeholder_map:
            return False
        # Index-based placeholders are unique and, unlike hash(), never
        # collide or contain a minus sign.
        placeholder = f"__PLACEHOLDER_{len(self.placeholder_map)}__"
        self.placeholder_map[placeholder] = word
        self.word_to_placeholder_map[word] = placeholder
        self.custom_words.append(word)
        self._matcher_stale = True
        return True

    def add_custom_word(self, word: str):
        """Add one more custom word to protect from splitting.

        Empty strings and words that are already registered are ignored.

        Args:
            word: The word, written with the capitalization that ``split``
                should emit.
        """
        if self._register(word):
            self.custom_words = sorted(self.custom_words, key=self._sort_key)

    def _get_matcher(self):
        """Return the combined custom-word pattern, rebuilding it if stale.

        Returns ``None`` when there is no matchable custom word.
        """
        if not self._matcher_stale:
            return self._matcher

        alternatives = []
        words = []
        for word in self.custom_words:
            tokens = [tok for tok in self._SEPARATORS.split(word) if tok]
            if not tokens:
                # Word consists solely of hyphens/whitespace: unmatchable.
                continue
            # One capturing group per word so Match.lastindex identifies
            # exactly which registered word matched.
            alternatives.append(
                "(" + r"\s+".join(re.escape(tok) for tok in tokens) + ")"
            )
            words.append(word)

        self._matcher_words = words
        if alternatives:
            self._matcher = re.compile(
                r"\b(?:" + "|".join(alternatives) + r")\b", re.IGNORECASE
            )
        else:
            self._matcher = None
        self._matcher_stale = False
        return self._matcher

    @staticmethod
    def _split_plain(fragment: str) -> List[str]:
        """Split text that contains no custom word, token by token."""
        pieces = []
        for token in fragment.split():
            pieces.extend(wordninja.split(token))
        return pieces

    def split(self, text: str) -> str:
        """Split *text* into space-separated words.

        Hyphens are treated as spaces. Custom words are matched as whole
        words, ignoring case, and emitted exactly as registered; the
        remaining text is split with ``wordninja``.

        Args:
            text: The text to split.

        Returns:
            The resulting words joined by single spaces.
        """
        working_text = text.replace("-", " ")
        matcher = self._get_matcher()

        pieces = []
        cursor = 0
        if matcher is not None:
            for match in matcher.finditer(working_text):
                # Text between custom words goes through wordninja; the
                # custom word itself is emitted untouched, even when it is
                # directly followed by punctuation.
                pieces.extend(
                    self._split_plain(working_text[cursor:match.start()])
                )
                pieces.append(self._matcher_words[match.lastindex - 1])
                cursor = match.end()
        pieces.extend(self._split_plain(working_text[cursor:]))
        return " ".join(pieces)