You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

54 lines
2.1 KiB

import wordninja
import re
from typing import List
from utility.data import POKEMON_PROPER_NOUNS
class PokemonWordNinja:
    """Split run-together text into words while keeping custom words intact.

    Custom words (``POKEMON_PROPER_NOUNS`` plus any caller-supplied words)
    are recognised case-insensitively and always emitted with their stored
    capitalisation. Everything else is delegated to ``wordninja.split``.

    Attributes:
        custom_words: Known custom words, longest first (ties broken
            alphabetically so the order is deterministic).
        placeholder_map: Maps a unique placeholder token to its custom word.
        word_to_placeholder_map: Inverse of ``placeholder_map``.

    The two placeholder maps are retained for backward compatibility with
    code that inspects them; ``split`` itself no longer rewrites the text
    with placeholders.
    """

    def __init__(self, custom_word_list: List[str] = None):
        """Build the splitter.

        Args:
            custom_word_list: Optional extra words to protect, in addition
                to ``POKEMON_PROPER_NOUNS``.
        """
        self.custom_words = []
        self.placeholder_map = {}
        self.word_to_placeholder_map = {}

        # Copy into a fresh set so the shared constant is never mutated and
        # any iterable of words is accepted.
        vocabulary = set(POKEMON_PROPER_NOUNS)
        if custom_word_list:
            vocabulary.update(custom_word_list)

        for word in sorted(vocabulary, key=self._sort_key):
            self._register(word)

    @staticmethod
    def _sort_key(word: str):
        """Order words longest first, then alphabetically for determinism."""
        return (-len(word), word)

    def _register(self, word: str) -> bool:
        """Record *word* and its placeholder; return True if it was new."""
        # An empty word would match at every position of the input, and a
        # duplicate would only add a dead alternative to the pattern.
        if not word or word in self.word_to_placeholder_map:
            return False
        # A running sequence number is unique and stable, unlike hash(),
        # which may be negative and varies between interpreter runs.
        placeholder = f"__PLACEHOLDER_{len(self.placeholder_map)}__"
        self.custom_words.append(word)
        self.placeholder_map[placeholder] = word
        self.word_to_placeholder_map[word] = placeholder
        return True

    def add_custom_word(self, word: str):
        """Add one more custom word to protect.

        Empty strings and words that are already registered are ignored.
        """
        if self._register(word):
            self.custom_words.sort(key=self._sort_key)

    def _compile_pattern(self):
        """Return a regex matching any custom word, or None if there are none.

        Each word sits in its own capturing group, so ``Match.lastindex``
        identifies which entry of ``custom_words`` matched.
        """
        if not self.custom_words:
            return None
        alternation = "|".join(
            f"({re.escape(word)})" for word in self.custom_words
        )
        return re.compile(alternation, re.IGNORECASE)

    @staticmethod
    def _split_plain(chunk: str) -> List[str]:
        """Split text that contains no custom words using wordninja."""
        pieces = []
        for token in chunk.split():
            pieces.extend(wordninja.split(token))
        return pieces

    def split(self, text: str) -> str:
        """Return *text* as space-separated words.

        Custom words are found case-insensitively in a single left-to-right
        pass; at any position the longest custom word wins. Matching on the
        original text (instead of substituting marker tokens into it) means
        a custom word is recognised even when it is glued to neighbouring
        characters, and no marker text can be mangled by a later match.

        Args:
            text: The text to segment.

        Returns:
            The words joined by single spaces, with custom words in their
            stored capitalisation.
        """
        pattern = self._compile_pattern()
        if pattern is None:
            return " ".join(self._split_plain(text))

        parts = []
        cursor = 0
        for match in pattern.finditer(text):
            # Text between two custom words goes through wordninja.
            parts.extend(self._split_plain(text[cursor:match.start()]))
            # Alternatives are in custom_words order, so the group number
            # maps straight back to the canonical spelling.
            parts.append(self.custom_words[match.lastindex - 1])
            cursor = match.end()
        parts.extend(self._split_plain(text[cursor:]))
        return " ".join(parts)