From e5f109949a9416fa85017dc58ebc1995da711f01 Mon Sep 17 00:00:00 2001 From: Dan Date: Fri, 8 Nov 2024 16:06:35 +0000 Subject: [PATCH] - WIP reworking form parsing to be more consistent --- ui/main_window_controller.py | 1 + ui/workers/gather_evolutions_worker.py | 4 +- .../gather_home_storage_status_worker.py | 2 +- ui/workers/gather_pokemon_forms_worker.py | 4 ++ utility/data.py | 16 ++++-- utility/pokemon_word_ninja.py | 54 +++++++++++++++++++ 6 files changed, 75 insertions(+), 6 deletions(-) create mode 100644 utility/pokemon_word_ninja.py diff --git a/ui/main_window_controller.py b/ui/main_window_controller.py index 53240c6..01056e8 100644 --- a/ui/main_window_controller.py +++ b/ui/main_window_controller.py @@ -106,6 +106,7 @@ class MainWindowController: db.add_pokemon_form(pokemon["pfic"], pokemon["name"], pokemon["form_name"], pokemon["national_dex"], pokemon["generation"], pokemon["sprite_url"], pokemon["gender_relevant"]) self.pokemon_data_cache = data self.view.update_pokemon_forms(data) + self.apply_filters() db.save_changes() diff --git a/ui/workers/gather_evolutions_worker.py b/ui/workers/gather_evolutions_worker.py index 22d9c05..9dfbd11 100644 --- a/ui/workers/gather_evolutions_worker.py +++ b/ui/workers/gather_evolutions_worker.py @@ -28,7 +28,7 @@ class GatherEvolutions(QRunnable): except Exception as e: print(f"Error gathering Pokémon home storage status: {e}") - def gather_evolution_data(self, force_refresh = False): + def gather_evolution_data(self, force_refresh = True): all_pokemon_forms = db.get_list_of_pokemon_forms() evolutions = {} @@ -121,7 +121,7 @@ class GatherEvolutions(QRunnable): evolutions[composite_key] = (evolution_info) self.traverse_and_store(next_stage, evolutions, gender) - def parse_evolution_chain(self, table, pokemon_form, force_refresh = False): + def parse_evolution_chain(self, table, pokemon_form, force_refresh = True): cache_record_name = f"evo_{pokemon_form['pfic']}" if force_refresh: cache.purge(cache_record_name) 
diff --git a/ui/workers/gather_home_storage_status_worker.py b/ui/workers/gather_home_storage_status_worker.py index e63846a..7384a47 100644 --- a/ui/workers/gather_home_storage_status_worker.py +++ b/ui/workers/gather_home_storage_status_worker.py @@ -104,7 +104,7 @@ class GatherHomeStorageStatus(QRunnable): if cached_entry is not None: return cached_entry - url = f"{self.base_url}{region}pokemon.shtml" + url = f"{self.base_url}{region.lower()}pokemon.shtml" response = cache.fetch_url(url) if not response: return [] diff --git a/ui/workers/gather_pokemon_forms_worker.py b/ui/workers/gather_pokemon_forms_worker.py index e41fec6..fe2d690 100644 --- a/ui/workers/gather_pokemon_forms_worker.py +++ b/ui/workers/gather_pokemon_forms_worker.py @@ -4,6 +4,7 @@ import re from cache import cache from utility.functions import get_generation_from_national_dex, sanitise_pokemon_name_for_url, remove_accents, compare_pokemon_forms, find_game_generation, format_pokemon_id +from utility.pokemon_word_ninja import PokemonWordNinja class GatherPokemonFormsWorkerSignals(QObject): finished = pyqtSignal(list) @@ -12,6 +13,7 @@ class GatherPokemonFormsWorker(QRunnable): def __init__(self): super().__init__() self.signals = GatherPokemonFormsWorkerSignals() + self.splitter = PokemonWordNinja() def run(self): try: @@ -56,6 +58,7 @@ class GatherPokemonFormsWorker(QRunnable): for small in smalls: form_name += small.get_text(strip=True) + " " form_name = form_name.strip() + form_name = self.splitter.split(form_name) return form_name return "None" @@ -63,6 +66,7 @@ class GatherPokemonFormsWorker(QRunnable): found_forms = [] generation = get_generation_from_national_dex(national_dex_number) pokemon_name = pokemon_soup.get_text(strip=True) + self.splitter.add_custom_word(pokemon_name) print(f"Processing {pokemon_name}") url_name = sanitise_pokemon_name_for_url(pokemon_name) diff --git a/utility/data.py b/utility/data.py index f15e14c..c289958 100644 --- a/utility/data.py +++ b/utility/data.py @@ 
-10,8 +10,8 @@ pokemon_generations = { 9: {"min": 906, "max": 1025}, } -regions = ["kanto", "johto", "hoenn", "sinnoh", "unova", "kalos", "alola", "galar", "paldea", "hisui", "unknown"] -regional_descriptors = ["kantonian", "johtonian", "hoennian", "sinnohan", "unovan", "kalosian", "alolan", "galarian", "hisuian", "paldean"] +regions = ["Kanto", "Johto", "Hoenn", "Sinnoh", "Unova", "Kalos", "Alola", "Galar", "Paldea", "Hisui", "Unknown"] +regional_descriptors = ["Kantonian", "Johtonian", "Hoennian", "Sinnohan", "Unovan", "Kalosian", "Alolan", "Galarian", "Hisuian", "Paldean"] yellow = { "Name": "Yellow", @@ -308,4 +308,14 @@ non_evolution_forms = [ "Mega", "Dynamax", "Gigantamax" -] \ No newline at end of file +] + +POKEMON_PROPER_NOUNS = { + "Augurite", + "Electirizer", + "Magmarizer", + "Gigantamax" +} + +POKEMON_PROPER_NOUNS = POKEMON_PROPER_NOUNS | set(regions) +POKEMON_PROPER_NOUNS = POKEMON_PROPER_NOUNS | set(regional_descriptors) \ No newline at end of file diff --git a/utility/pokemon_word_ninja.py b/utility/pokemon_word_ninja.py new file mode 100644 index 0000000..2ff87f4 --- /dev/null +++ b/utility/pokemon_word_ninja.py @@ -0,0 +1,54 @@ +import wordninja +import re +from typing import List +from utility.data import POKEMON_PROPER_NOUNS + +class PokemonWordNinja: + def __init__(self, custom_word_list: List[str] = None): + custom_words = POKEMON_PROPER_NOUNS + if custom_word_list: + custom_words = custom_words | set(custom_word_list) + + self.custom_words = [] + self.placeholder_map = {} + self.word_to_placeholder_map = {} + if custom_words: + # Store custom words with original capitalization, sorted by length + self.custom_words = sorted(custom_words, key=len, reverse=True) + for word in self.custom_words: + # Generate a unique placeholder + placeholder = f"__PLACEHOLDER_{hash(word)}__" + self.placeholder_map[placeholder] = word + self.word_to_placeholder_map[word] = placeholder + + def add_custom_word(self, word: str): + words = self.custom_words + 
words.append(word) + self.custom_words = sorted(words, key=len, reverse=True) + placeholder = f"__PLACEHOLDER_{hash(word)}__" + self.placeholder_map[placeholder] = word + self.word_to_placeholder_map[word] = placeholder + + def split(self, text: str) -> str: + working_text = text + + # First handle exact custom words to preserve capitalization + for word in self.custom_words: + placeholder = self.word_to_placeholder_map[word] + pattern = re.compile(re.escape(word), re.IGNORECASE) + working_text = pattern.sub(placeholder, working_text) + + # Clean up spaces + working_text = ' '.join(working_text.split()) + + # For remaining text, use wordninja + parts = [] + for part in working_text.split(): + if part in self.placeholder_map: + # Replace placeholder with the original word + parts.append(self.placeholder_map[part]) + else: + split_parts = wordninja.split(part) + parts.extend(split_parts) + + return ' '.join(parts) \ No newline at end of file