You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
54 lines
2.1 KiB
54 lines
2.1 KiB
import wordninja
|
|
import re
|
|
from typing import List
|
|
from utility.data import POKEMON_PROPER_NOUNS
|
|
|
|
class PokemonWordNinja:
    """Split run-together text into words while keeping custom words intact.

    Custom words (``POKEMON_PROPER_NOUNS`` plus any caller-supplied extras)
    are matched case-insensitively and emitted with the capitalization they
    were registered with. All remaining text is segmented with
    ``wordninja.split``.

    Attributes:
        custom_words: Registered words, longest first, so a longer word is
            replaced before any shorter word it contains.
        placeholder_map: Maps each placeholder token to its custom word.
        word_to_placeholder_map: Maps each custom word to its placeholder.
    """

    def __init__(self, custom_word_list: List[str] = None):
        """Register the built-in proper nouns plus ``custom_word_list``.

        Args:
            custom_word_list: Optional extra words to protect from splitting.
        """
        custom_words = POKEMON_PROPER_NOUNS
        if custom_word_list:
            custom_words = custom_words | set(custom_word_list)

        self.custom_words = []
        self.placeholder_map = {}
        self.word_to_placeholder_map = {}
        # Compiled case-insensitive pattern per custom word, built once at
        # registration instead of on every split() call.
        self._patterns = {}

        for word in custom_words or ():
            self._register(word)
        # Longest first, so e.g. a longer name wins over a shorter name that
        # is a prefix of it.
        self.custom_words.sort(key=len, reverse=True)

    def _register(self, word: str) -> bool:
        """Record ``word`` with its placeholder and compiled pattern.

        Returns:
            True if the word was added; False if it was skipped because it is
            empty (its pattern would match at every position) or is already
            registered.
        """
        if not word or word in self.word_to_placeholder_map:
            return False
        placeholder = f"__PLACEHOLDER_{hash(word)}__"
        self.placeholder_map[placeholder] = word
        self.word_to_placeholder_map[word] = placeholder
        self._patterns[word] = re.compile(re.escape(word), re.IGNORECASE)
        self.custom_words.append(word)
        return True

    def add_custom_word(self, word: str):
        """Register one more custom word; empty or duplicate words are ignored.

        Args:
            word: The word to protect, in the capitalization to emit.
        """
        if self._register(word):
            self.custom_words.sort(key=len, reverse=True)

    def split(self, text: str) -> str:
        """Return ``text`` as space-separated words.

        Args:
            text: Input text, possibly with words run together.

        Returns:
            The custom words (in their registered capitalization) and the
            ``wordninja`` segments of everything else, joined by single
            spaces.
        """
        working_text = text

        # Swap every custom word for its placeholder. The placeholder is
        # padded with spaces so that it becomes a standalone token even when
        # the custom word was glued to neighbouring text; without the padding
        # the token would not be found in placeholder_map below and would be
        # handed to wordninja instead.
        for word in self.custom_words:
            placeholder = self.word_to_placeholder_map[word]
            working_text = self._patterns[word].sub(
                f" {placeholder} ", working_text
            )

        parts = []
        # str.split() with no arguments also collapses the extra whitespace
        # introduced by the padding above.
        for part in working_text.split():
            if part in self.placeholder_map:
                # Restore the custom word with its registered capitalization.
                parts.append(self.placeholder_map[part])
            else:
                parts.extend(wordninja.split(part))

        return ' '.join(parts)
|