6 changed files with 75 additions and 6 deletions
@ -0,0 +1,54 @@ |
|||||
|
import wordninja |
||||
|
import re |
||||
|
from typing import List |
||||
|
from utility.data import POKEMON_PROPER_NOUNS |
||||
|
|
||||
|
class PokemonWordNinja:
    """Split run-together text into words while keeping custom words intact.

    Custom words (``POKEMON_PROPER_NOUNS`` plus any caller-supplied words) are
    recognised case-insensitively anywhere in the input and are emitted with
    the capitalization they were registered with. All remaining text is
    handed to ``wordninja.split``.

    Matching is by substring, not by word boundary, so a custom word is also
    recognised when it is glued to neighbouring text.
    """

    def __init__(self, custom_word_list: List[str] = None):
        """Build the splitter.

        Args:
            custom_word_list: Extra words to protect in addition to
                ``POKEMON_PROPER_NOUNS``. ``None`` or an empty list means
                "no extra words".
        """
        custom_words = POKEMON_PROPER_NOUNS
        if custom_word_list:
            # Union into a new set so the shared constant is never mutated.
            custom_words = custom_words | set(custom_word_list)

        # Custom words in their registered capitalization, longest first.
        self.custom_words = []
        # placeholder -> word and word -> placeholder. split() no longer
        # relies on these; they are kept populated for backward compatibility
        # with code that reads them.
        self.placeholder_map = {}
        self.word_to_placeholder_map = {}

        if custom_words:
            # Empty strings are dropped: an empty pattern would match at
            # every position of the input.
            self.custom_words = sorted(
                (word for word in custom_words if word), key=len, reverse=True
            )
            for word in self.custom_words:
                self._register_placeholder(word)

    def _register_placeholder(self, word):
        """Record the placeholder <-> word mapping for *word*."""
        placeholder = f"__PLACEHOLDER_{hash(word)}__"
        self.placeholder_map[placeholder] = word
        self.word_to_placeholder_map[word] = placeholder

    def add_custom_word(self, word: str):
        """Register one more custom word.

        Empty strings and words that are already registered (exact match)
        are ignored.

        Args:
            word: The word to protect, in the capitalization it should be
                emitted with.
        """
        if not word or word in self.custom_words:
            return
        # sorted() is stable, so the new word lands after existing words of
        # the same length.
        self.custom_words = sorted(
            self.custom_words + [word], key=len, reverse=True
        )
        self._register_placeholder(word)

    def _get_matcher(self):
        """Return ``(pattern, canonical)`` for the current custom words.

        ``pattern`` is a compiled case-insensitive alternation of all custom
        words wrapped in a single capturing group, or ``None`` when there are
        no custom words. ``canonical`` maps each lowercased word to its
        registered spelling. The result is cached and rebuilt whenever
        ``self.custom_words`` differs from the cached snapshot, so direct
        edits to that attribute are picked up as well.
        """
        snapshot = tuple(self.custom_words)
        cached = getattr(self, "_matcher_cache", None)
        if cached is not None and cached[0] == snapshot:
            return cached[1], cached[2]

        # Longest first, so that at any position the longest custom word wins
        # over a shorter word that is a prefix of it.
        words = sorted((w for w in snapshot if w), key=len, reverse=True)
        canonical = {}
        for candidate in words:
            # First registration wins when two words differ only by case.
            canonical.setdefault(candidate.lower(), candidate)

        pattern = None
        if words:
            alternation = "|".join(re.escape(w) for w in words)
            pattern = re.compile("(" + alternation + ")", re.IGNORECASE)

        self._matcher_cache = (snapshot, pattern, canonical)
        return pattern, canonical

    def _canonical_form(self, matched, canonical):
        """Return the registered spelling for the matched text *matched*."""
        word = canonical.get(matched.lower())
        if word is not None:
            return word
        # Rare: the regex engine's case-insensitive match and str.lower()
        # disagree for some characters; fall back to asking the regex engine.
        for candidate in self.custom_words:
            if candidate and re.fullmatch(
                re.escape(candidate), matched, re.IGNORECASE
            ):
                return candidate
        return matched

    def split(self, text: str) -> str:
        """Split *text* into space-separated words.

        Custom words are located in a single pass and emitted in their
        registered capitalization; every other whitespace-separated chunk is
        split with ``wordninja.split``.

        Args:
            text: The text to split.

        Returns:
            The resulting words joined by single spaces.
        """
        pattern, canonical = self._get_matcher()
        # With exactly one capturing group, re.split returns alternating
        # pieces: even indexes are the text between matches, odd indexes are
        # the matched custom words. Cutting the input this way keeps a custom
        # word separate from whatever text it was glued to.
        segments = [text] if pattern is None else pattern.split(text)

        parts = []
        for index, segment in enumerate(segments):
            if index % 2:
                parts.append(self._canonical_form(segment, canonical))
            else:
                for chunk in segment.split():
                    parts.extend(wordninja.split(chunk))

        return ' '.join(parts)
||||
Loading…
Reference in new issue