"""Wrapper around wordninja that keeps Pokémon proper nouns intact when splitting text."""
import re
from typing import List, Optional

import wordninja

from utility.data import POKEMON_PROPER_NOUNS

class PokemonWordNinja:
    """Split concatenated text into words while keeping Pokémon proper nouns
    (and any caller-supplied custom words) intact with their registered
    capitalization.

    Custom words are temporarily swapped for unique placeholder tokens so
    that ``wordninja`` never slices through them; placeholders are mapped
    back to the original words after segmentation.
    """

    # Capturing group so re.split() keeps the placeholder pieces in the output.
    _PLACEHOLDER_RE = re.compile(r"(__PLACEHOLDER_\d+__)")

    def __init__(self, custom_word_list: Optional[List[str]] = None):
        """Build the protected-word registry.

        Args:
            custom_word_list: extra words to protect in addition to
                ``POKEMON_PROPER_NOUNS``.
        """
        custom_words = set(POKEMON_PROPER_NOUNS)
        if custom_word_list:
            custom_words |= set(custom_word_list)

        self.custom_words: List[str] = []
        # placeholder -> original word
        self.placeholder_map = {}
        # original word -> placeholder
        self.word_to_placeholder_map = {}
        # Longest-first so a longer word wins over any word it contains.
        for word in sorted(custom_words, key=len, reverse=True):
            self._register(word)

    def _register(self, word: str) -> None:
        """Assign *word* a fresh placeholder and index it in both maps."""
        # A running counter is collision-free and stable across runs, unlike
        # hash(), which is randomized per interpreter session.
        placeholder = f"__PLACEHOLDER_{len(self.placeholder_map)}__"
        self.custom_words.append(word)
        self.placeholder_map[placeholder] = word
        self.word_to_placeholder_map[word] = placeholder

    def add_custom_word(self, word: str) -> None:
        """Register one more protected word (no-op if already present)."""
        if word in self.word_to_placeholder_map:
            return  # avoid duplicate entries and redundant placeholders
        self._register(word)
        # Restore the longest-first ordering split() relies on.
        self.custom_words.sort(key=len, reverse=True)

    def split(self, text: str) -> str:
        """Return *text* split into space-separated words.

        Custom words are matched case-insensitively but emitted with the
        capitalization they were registered with; everything else is
        segmented by ``wordninja``.
        """
        working_text = text
        if self.custom_words:
            # One combined, longest-first alternation replaces every custom
            # word in a single pass. A per-word sequential sub would let a
            # shorter word (e.g. "ace") match inside the letters of an
            # already-inserted "__PLACEHOLDER_N__" token and corrupt it.
            alternation = re.compile(
                "|".join(re.escape(w) for w in self.custom_words),
                re.IGNORECASE,
            )
            lower_to_placeholder = {
                w.lower(): p for w, p in self.word_to_placeholder_map.items()
            }
            working_text = alternation.sub(
                lambda m: lower_to_placeholder[m.group(0).lower()],
                working_text,
            )

        parts: List[str] = []
        for token in working_text.split():
            # A placeholder may sit inside a larger token (e.g. the input
            # "Pikachufan" becomes "__PLACEHOLDER_N__fan"), so split each
            # token on placeholder boundaries instead of only testing the
            # whole token against the map.
            for piece in self._PLACEHOLDER_RE.split(token):
                if not piece:
                    continue
                if piece in self.placeholder_map:
                    # Restore the word with its registered capitalization.
                    parts.append(self.placeholder_map[piece])
                else:
                    parts.extend(wordninja.split(piece))
        return ' '.join(parts)
|||