OriginDex-DataManager/ui/workers/gather_encounter_locations.py


								from PyQt6.QtCore import QObject, pyqtSignal, QRunnable

								from bs4 import BeautifulSoup, NavigableString

								from pattern.en import singularize

								from fuzzywuzzy import fuzz

								import re


								from cache import cache

								from db import db


								from utility.data import default_forms, regional_descriptors, days, times, rods

								from utility.functions import is_mainline_game, compare_pokemon_forms, find_match_in_string_array, extract_bracketed_text

								from utility.pokemon_word_ninja import PokemonWordNinja


								class GatherEncountersWorkerSignals(QObject):
								    """Qt signals exposed by GatherEncountersWorker."""

								    # Carries a list payload; GatherEncountersWorker.run() emits the
								    # gathered encounter list through it on success.
								    finished = pyqtSignal(list)


								class GatherEncountersWorker(QRunnable):

								    def __init__(self):
								        """Create the worker's signal holder, word splitter and filters."""
								        super().__init__()
								        self.signals = GatherEncountersWorkerSignals()
								        self.default_forms_set = set(default_forms)
								        self.splitter = PokemonWordNinja()

								        # Lower-case fragments: gather_encounter_data() skips any
								        # location whose lower-cased text contains one of these.
								        self.encounters_to_ignore = [
								            "trade", "time capsule", "unobtainable", "tradeversion",
								            "poké transfer", "friend safari", "unavailable",
								            "pokémon home", "union circle", "pokémon bank", "pal park",
								            "transfer from dream radar", "global link event",
								            "pokémon channel", "pokémon colosseum bonus disc"
								        ]

								        # Accumulator filled by the save_* methods.
								        self.encounters = []


								    def run(self):
								        """Gather all encounters and emit them through ``signals.finished``.

								        Any exception raised while gathering (or emitting) is caught here
								        and reported on stdout; in that case ``finished`` is not emitted.
								        """
								        try:
								            gathered_data = self.gather_encounter_data()
								            self.signals.finished.emit(gathered_data)
								        except Exception as e:
								            # Top-level boundary of the worker: report instead of letting
								            # the exception escape. The message now names what this
								            # worker gathers (it previously said "Pokémon forms").
								            print(f"Error gathering encounter locations: {e}")


								    def gather_encounter_data(self):
								        """Collect encounters for every Pokémon form known to the database.

								        For each form the Bulbapedia locations are fetched via
								        ``get_locations_from_bulbapedia`` and every location entry is
								        classified as an evolution, an event or a regular location and
								        stored through the matching ``save_*`` method.

								        Returns:
								            ``self.encounters``, the list filled by the ``save_*`` methods.
								        """
								        for form_entry in db.get_list_of_pokemon_forms():
								            full_form_name = form_entry["form_name"]
								            name = form_entry["name"]
								            pfic = form_entry["pfic"]

								            print(f'Processing {name}')
								            self.splitter.add_custom_word(name)

								            name, search_form = self._normalise_search_form(name, full_form_name)

								            encounter_data = self.get_locations_from_bulbapedia(name, search_form)
								            if encounter_data is None:
								                continue

								            for game, locations in encounter_data.items():
								                # A game without locations must only skip that game; the
								                # previous ``break`` dropped all remaining games as well.
								                if not locations:
								                    continue

								                for location in locations:
								                    if not location:
								                        continue
								                    self._record_location(pfic, game, location, full_form_name)

								        return self.encounters

								    def _normalise_search_form(self, name, form):
								        """Return ``(name, form)`` adjusted for the Bulbapedia lookup."""
								        if form and name in form:
								            form = form.replace(name, "").strip()

								        # Gender prefixes are dropped; "Female" is handled before "Male".
								        for gender in ("Female", "Male"):
								            if form and form.startswith(gender):
								                form = form.replace(gender, "").strip()

								        if form and form in self.default_forms_set:
								            form = None

								        # Species-specific overrides.
								        if name == "Unown" and form not in ("!", "?"):
								            form = None
								        if name == "Tauros" and form == "Combat Breed":
								            form = "Paldean Form"
								        if name in ("Alcremie", "Minior"):
								            form = None
								        if name.lower() == "ho-oh":
								            name = "Ho-Oh"

								        if form == "":
								            form = None

								        return name, form

								    def _record_location(self, pfic, game, location, full_form_name):
								        """Classify one location entry and store the resulting encounters."""
								        location_html = location["location"].strip().lower()
								        location_text = BeautifulSoup(location_html, 'html.parser').get_text().lower()

								        if "evolve" in location_text:
								            remaining, details = self.extract_additional_information(location["tag"])
								            evolve_info = self.extract_evolve_information(remaining, full_form_name)
								            if evolve_info:
								                self.save_evolve_encounter(pfic, game, details["days"], details["times"], evolve_info["evolve_from"])
								            return

								        if "event" in location_text:
								            self.save_event_encounter(pfic, game)
								            return

								        remaining, details = self.extract_additional_information(location["tag"])
								        routes, remaining = self.extract_routes(remaining)

								        def save(location_name):
								            # Same detail set for every place named in this entry.
								            self.save_encounter(
								                pfic, game, location_name,
								                details["days"], details["times"], details["dual_slot"],
								                details["static_encounter"], details["static_encounter_count"],
								                details["extra_text"], details["stars"],
								                details["Rods"], details["Fishing"], details["starter"],
								            )

								        for route in routes:
								            save(f"Route {route}")

								        # Whatever is left after the routes is a list of named places
								        # separated by commas and/or " and ".
								        for candidate in remaining.replace(" and ", ",").split(","):
								            candidate_name = candidate.strip()
								            if candidate_name == "":
								                continue

								            lowered = candidate.lower()
								            if any(ignored in lowered for ignored in self.encounters_to_ignore):
								                continue

								            save(candidate_name)


								    def get_locations_from_bulbapedia(self, pokemon_name, form, force_refresh = False):
								        """Return the per-game location entries for one Pokémon form.

								        Args:
								            pokemon_name: Name used to build the Bulbapedia page URL.
								            form: Form to filter locations for, or ``None``.
								            force_refresh: When true, the cached result for this page and
								                form is purged before the lookup.

								        Returns:
								            A dict mapping game name to a list of location entries, the
								            cached value when one exists, or ``None`` when the page could
								            not be fetched or no locations table was found.
								        """
								        url = f"https://bulbapedia.bulbagarden.net/wiki/{pokemon_name}_(Pokémon)"
								        page_data = cache.fetch_url(url)
								        if not page_data:
								            return None

								        cache_key = f'locations_{url}_data_{form}'
								        if force_refresh:
								            cache.purge(cache_key)

								        cached_entry = cache.get(cache_key)
								        if cached_entry is not None:
								            return cached_entry

								        soup = BeautifulSoup(page_data, 'html.parser')
								        if not soup:
								            return None

								        # Try the known section headings in order until one has a table.
								        locations_table = None
								        for header in ('Game locations', 'In side games', 'In spin-off games'):
								            span = soup.find('span', id=header.replace(' ', '_'))
								            if span:
								                locations_table = span.find_next('table', class_='roundy')
								                if locations_table:
								                    break

								        if not locations_table:
								            print(f"Warning: Couldn't find locations table for {pokemon_name}")
								            return None

								        # Layout: generation rows -> nested table -> games table -> rows
								        # holding <th> game names plus a nested table of location cells.
								        raw_game_locations = {}
								        for generation_row in self._direct_rows(locations_table):
								            generation_table = self._nested_table(generation_row)
								            if generation_table is None:
								                continue

								            for nested_row in self._direct_rows(generation_table):
								                if 'Generation' in nested_row.get_text(strip=True):
								                    continue

								                games_table = self._nested_table(nested_row)
								                if games_table is None:
								                    continue

								                for games_row in self._direct_rows(games_table):
								                    self._collect_row_locations(games_row, raw_game_locations)

								        # Process events
								        events_section = soup.find('span', id='In_events')
								        event_tables = self.process_event_tables(events_section) if events_section else {}

								        game_locations = {}
								        for raw_game, raw_locations in raw_game_locations.items():
								            encounters = self.process_game_locations(raw_game, raw_locations, form)
								            if encounters:
								                game_locations[raw_game] = encounters

								        # Only event tables whose heading matches the requested form count.
								        for variant in event_tables:
								            if (variant == pokemon_name and form is None) or (form and form in variant):
								                self.process_event_table(event_tables[variant], game_locations)

								        cache.set(cache_key, game_locations)
								        return game_locations

								    def _direct_rows(self, table):
								        """Return the ``<tr>`` elements belonging directly to *table*."""
								        body = table.find('tbody', recursive=False)
								        # find() returns None when the markup has no explicit <tbody>;
								        # the rows are then direct children of the table itself.
								        # Previously this case raised AttributeError.
								        if body is None:
								            body = table
								        return body.find_all('tr', recursive=False)

								    def _nested_table(self, row):
								        """Return the table nested in *row*'s first direct cell, or None."""
								        cell = row.find('td', recursive=False)
								        if cell is None:
								            return None
								        return cell.find('table', recursive=False)

								    def _collect_row_locations(self, games_row, raw_game_locations):
								        """Add the location groups of one games row to *raw_game_locations*."""
								        mainline_games = []
								        for game in games_row.find_all('th'):
								            raw_game = game.get_text(strip=True)
								            if is_mainline_game(raw_game) is not None:
								                mainline_games.append(raw_game)
								        if not mainline_games:
								            return

								        locations_table = self._nested_table(games_row)
								        if locations_table is None:
								            return

								        body = locations_table.find('tbody', recursive=False)
								        if body is None:
								            body = locations_table

								        # The groups are the same for every game named in the row, so
								        # they are computed once rather than once per <th>.
								        groups = []
								        for cell in body.find_all('td'):
								            groups.extend(self.split_td_contents(cell))

								        if not groups:
								            return
								        for raw_game in mainline_games:
								            raw_game_locations.setdefault(raw_game, []).extend(groups)


								    def split_td_contents(self, td):
								        """Split a cell's children into HTML strings, one per ``<br>`` run.

								        Whitespace-only text nodes are dropped; every other child is kept
								        and serialised with ``str()``. Empty runs produce no entry.
								        """
								        chunks = []
								        pending = []

								        def flush():
								            # Close the current run, if it holds anything.
								            if pending:
								                chunks.append(''.join(str(node) for node in pending))
								                pending.clear()

								        for node in td.contents:
								            if isinstance(node, NavigableString):
								                if node.strip():
								                    pending.append(node)
								                continue

								            if node.name == 'br':
								                flush()
								            else:
								                pending.append(node)

								        flush()
								        return chunks


								    def process_game_locations(self, raw_game, raw_locations, form):
								        """Keep the raw locations of one game that apply to *form*.

								        Returns a list of ``{"location": ..., "tag": ...}`` dicts (both
								        values are the raw location HTML), or ``None`` when nothing
								        matched.
								        """
								        generic_labels = ("All Forms", "Kantonian Form", "All Sizes")
								        kept = []

								        for raw_location in raw_locations:
								            parsed_forms = self.parse_form_information(raw_location)

								            if form is None:
								                if not parsed_forms:
								                    # No form annotation at all: applies to the base form.
								                    kept.append({"location": raw_location, "tag": raw_location})
								                    continue

								                for form_info in parsed_forms:
								                    label = form_info["main_form"]
								                    if default_forms and label and label in self.default_forms_set:
								                        label = None

								                    # Without a requested form, only unlabelled or
								                    # generic annotations are accepted.
								                    if not label or label in generic_labels:
								                        kept.append({"location": raw_location, "tag": raw_location})
								                continue

								            if len(parsed_forms) > 0:
								                candidates = parsed_forms
								            else:
								                candidates = [{"main_form": None, "sub_form": None, "region": None}]

								            for form_info in candidates:
								                if self.form_matches(form_info, form, default_forms):
								                    kept.append({"location": raw_location, "tag": raw_location})

								        if kept:
								            return kept
								        return None


								    # NOTE(review): this class defines process_event_tables a second time
								    # further down with an identical body. Python keeps the later
								    # definition, so this first one is shadowed and never called.
								    def process_event_tables(self, events_section):

								        """Map each ``h5`` variant heading after *events_section* to its table.

								        Walks the siblings following the parent of *events_section* until
								        an ``h3`` (or the end) is reached. Returns a dict of stripped
								        heading text to the next sibling ``table`` with class ``roundy``.
								        """

								        event_tables = {}

								        if events_section:

								            next_element = events_section.parent.find_next_sibling()

								            # Stop at the next h3 or when the siblings run out.
								            while next_element and next_element.name != 'h3':

								                if next_element.name == 'h5':

								                    variant = next_element.text.strip()

								                    table = next_element.find_next_sibling('table', class_='roundy')

								                    # Headings without a following table are ignored.
								                    if table:

								                        event_tables[variant] = table

								                next_element = next_element.find_next_sibling()

								        return event_tables


								    def parse_form_information(self, html_content):
								        """Extract the form annotations from one location's HTML.

								        Returns a list of dicts with ``main_form`` and ``sub_form`` keys;
								        a ``region`` key is added when the main form contains one of the
								        ``regional_descriptors``.
								        """
								        soup = BeautifulSoup(html_content, 'html.parser')

								        def tag_region(info):
								            # Attach the first regional descriptor found in the main form.
								            lowered = info["main_form"].lower()
								            for region in regional_descriptors:
								                if region in lowered:
								                    info["region"] = region
								                    break
								            return info

								        #TODO: This wont work for lines that have several small blocks in one line.
								        #TODO: Adjust this to handle more than one small block, see Basculin for example
								        small_tag = soup.find('small')
								        found = []

								        if not small_tag:
								            # No <small> block (e.g. Gimmighoul): fall back to bold
								            # headings mentioning "form", ignoring those inside <sup>.
								            for heading in soup.find_all('b'):
								                if heading.parent.name == 'sup':
								                    continue
								                label = heading.get_text(strip=True)
								                if "form" not in label.lower():
								                    continue
								                found.append(tag_region({"main_form": label, "sub_form": None}))
								            return found

								        # Form info is in bold inside a small tag.
								        for bold_tag in small_tag.find_all('b'):
								            # Remove surrounding parentheses.
								            label = bold_tag.get_text(strip=True).strip('()')

								            if "/" in label:
								                # "A/B Suffix" is expanded to "A Suffix" and "B Suffix".
								                suffix = label.split()[-1]
								                prefixes = label.replace(suffix, "").strip().split('/')
								                found.extend(
								                    {"main_form": prefix.strip() + " " + singularize(suffix), "sub_form": None}
								                    for prefix in prefixes
								                )
								                continue

								            # Split the text into main form and breed (if present).
								            pieces = label.split('(')
								            main_form = pieces[0].strip()

								            # "Factor"s are properties of the encounter, not forms.
								            if main_form and "factor" in main_form.lower():
								                continue

								            breed = None
								            if len(pieces) > 1:
								                breed = pieces[1].strip(')')

								            found.append(tag_region({"main_form": main_form, "sub_form": breed}))

								        return found


								    def form_matches(self, form_info, form, default_forms):
								        """Tell whether a parsed form annotation applies to *form*.

								        Args:
								            form_info: Dict with ``main_form`` and ``sub_form`` and an
								                optional ``region`` key.
								            form: The form being searched for.
								            default_forms: Collection of form names treated as "no form".

								        Returns:
								            A truthy value when the annotation matches, falsy otherwise.
								        """
								        main_form = form_info["main_form"]
								        sub_form = form_info["sub_form"]
								        region = form_info.get('region')

								        if default_forms and main_form and main_form in default_forms:
								            main_form = None

								        # Seasonal forms match entries that carry no form annotation.
								        seasonal_forms = ["spring form", "summer form", "autumn form", "winter form"]
								        if main_form is None and form.lower() in seasonal_forms:
								            return True

								        if form and main_form is None:
								            return False

								        if main_form in ["All Forms", "All Sizes"]:
								            return True

								        if region is None and main_form in ["Kantonian Form"]:
								            return True

								        wanted = form.lower()

								        main_form_match = (
								            compare_pokemon_forms(form, main_form)
								            or fuzz.partial_ratio(wanted, main_form.lower()) >= 95
								        )
								        sub_form_match = compare_pokemon_forms(form, sub_form) or (
								            sub_form and fuzz.partial_ratio(wanted, sub_form.lower()) >= 95
								        )

								        # The region is consulted only when neither form name matched.
								        if region and not (main_form_match or sub_form_match):
								            return compare_pokemon_forms(form, region) or fuzz.partial_ratio(wanted, region.lower()) >= 95

								        return main_form_match or sub_form_match


								    def extract_routes(self, s):
								        """Pull the first "Route(s) N, M and K" run out of *s*.

								        Returns:
								            ``(numbers, remaining)``: the route numbers as strings and
								            the input with the matched run removed. Leading commas and
								            spaces are stripped from the text after the run. Without a
								            match the result is ``([], s)``.
								        """
								        # Numbers may be separated by commas, whitespace and "and".
								        found = re.search(r'Routes?\s?((?:\d+(?:,?\s+(?:and\s+)?)?)+)', s, re.IGNORECASE)
								        if found is None:
								            return [], s

								        before = s[:found.start()]
								        after = s[found.end():].lstrip(', ')
								        return re.findall(r'\d+', found.group(1)), before + after


								    def extract_additional_information(self, s):
								        """Split a location's HTML into plain text and encounter details.

								        ``<sup>`` tags and bracketed text are scanned for days, times,
								        dual-slot games, static-encounter counts, rods and star ratings.

								        Args:
								            s: HTML of one location entry, or ``None``.

								        Returns:
								            ``(remaining_text, details)``. ``details`` always has the keys
								            ``days``, ``times``, ``dual_slot``, ``static_encounter_count``,
								            ``static_encounter``, ``starter``, ``extra_text``, ``stars``,
								            ``Fishing`` and ``Rods``.
								        """
								        details = {
								            "days": [],
								            "times": [],
								            "dual_slot": None,
								            "static_encounter_count": 0,
								            "static_encounter": False,
								            "starter": False,
								            "extra_text": [],
								            "stars": [],
								            "Fishing": False,
								            "Rods": [],
								        }

								        if s is None:
								            return "", details

								        soup = BeautifulSoup(s, 'html.parser')
								        full_text = soup.get_text()
								        sup_text = []

								        if "first partner" in full_text.lower():
								            details["starter"] = True

								        for sup_tag in soup.find_all('sup'):
								            text = sup_tag.get_text(strip=True)

								            if find_match_in_string_array(text, days):
								                details["days"].append(text)
								                sup_text.append(text)

								            if find_match_in_string_array(text, times):
								                details["times"].append(text)
								                sup_text.append(text)

								        def strip_token(current, token):
								            # Removes *token* from what is LEFT of the bracket text.
								            # The old code restarted from the full lower-cased text on
								            # every removal, so earlier removals were undone and
								            # recognised tokens leaked into extra_text. The token is
								            # escaped so it is always treated as literal text. The
								            # result stays lower-cased, as before.
								            return re.sub(re.escape(token), '', current.lower(), flags=re.IGNORECASE)

								        bracket_text = extract_bracketed_text(full_text)

								        for text in bracket_text:
								            text = text.strip()
								            # Detection always looks at the complete bracket text.
								            text_lower = text.lower()

								            game = is_mainline_game(text_lower)
								            if game is not None:
								                details["dual_slot"] = game["Name"]
								                text = strip_token(text, game["Name"])

								            match = find_match_in_string_array(text_lower, days)
								            if match:
								                details["days"].append(match)
								                text = strip_token(text, match)

								            match = find_match_in_string_array(text_lower, times)
								            if match:
								                details["times"].append(match)
								                text = strip_token(text, match)

								            if "only one" in text_lower:
								                details["static_encounter_count"] = 1
								                details["static_encounter"] = True
								                text = strip_token(text, "only one")
								            elif "only two" in text_lower:
								                details["static_encounter_count"] = 2
								                details["static_encounter"] = True
								                text = strip_token(text, "only two")

								            if "rod" in text_lower:
								                match = find_match_in_string_array(text_lower, rods)
								                if match:
								                    details["Fishing"] = True
								                    details["Rods"].append(match)
								                    text = strip_token(text, match)

								            if "★" in text:
								                for part in re.findall(r'\d★,*', text):
								                    details["stars"].append(part.replace(',', '').strip())
								                text = re.sub(r'\d★,*', '', text)

								            # Anything not recognised above is kept as free text.
								            if text.strip() != "":
								                details["extra_text"].append(text.strip())
								                sup_text.append(text.strip())

								        for text in sup_text:
								            full_text = full_text.replace(text, "")

								        if bracket_text:
								            for text in bracket_text:
								                full_text = full_text.replace(text, "")
								            full_text = full_text.replace('(', "").replace(')', "")
								            return full_text.strip(), details

								        return full_text, details


								    def extract_evolve_information(self, s: str, search_form):
								        """Find the Pokémon an "Evolve <name>" location text evolves from.

								        Args:
								            s: Plain location text, e.g. ``"Evolve Pichu"``.
								            search_form: Full form name of the Pokémon being processed.

								        Returns:
								            ``{"evolve_from": pfic}`` when a source form was found in the
								            database, otherwise an empty dict.
								        """
								        details = {}
								        if s is None or s == "":
								            return details

								        # split() without an argument drops empty tokens. Splitting
								        # " Pichu" on " " gave '' as the first token, so the name
								        # looked up used to be empty.
								        parts = s.replace("Evolve", "").split()
								        if not parts:
								            return details

								        target_pokemon = parts[0]

								        # A gender symbol on the name selects the gendered form.
								        form = None
								        if "♀" in target_pokemon:
								            target_pokemon = target_pokemon.replace("♀", "").strip()
								            form = "Female"

								        if "♂" in target_pokemon:
								            target_pokemon = target_pokemon.replace("♂", "").strip()
								            form = "Male"

								        results = db.get_pokemon_details_by_name(target_pokemon)
								        if not results:
								            return details

								        def first_with_form(wanted_form):
								            # First database row whose form matches, or None.
								            for result in results:
								                if compare_pokemon_forms(result["form_name"], wanted_form):
								                    return result
								            return None

								        # 1) the form implied by the name itself
								        hit = first_with_form(form)

								        # 2) the form of the Pokémon being processed
								        if hit is None:
								            hit = first_with_form(search_form if search_form != form else None)

								        # 3) a gender inferred from the processed form's name
								        if hit is None and search_form:
								            if "female" in search_form.lower():
								                form = "Female"
								            elif "male" in search_form.lower():
								                form = "Male"

								            if form:
								                hit = first_with_form(form)

								        if hit is not None:
								            details["evolve_from"] = hit["pfic"]

								        return details


								    def save_evolve_encounter(self, pfic, game, days, times, from_pfic):
								        """Append "evolve" encounters for *pfic* to ``self.encounters``.

								        One entry is added per day when *days* is non-empty, otherwise one
								        per time when *times* is non-empty, otherwise a single entry with
								        neither set.
								        """
								        game_id = db.get_game_id_by_name(game)

								        if days:
								            schedule = [(day, None) for day in days]
								        elif times:
								            schedule = [(None, time) for time in times]
								        else:
								            schedule = [(None, None)]

								        # A fresh dict per entry: the old code appended one shared dict
								        # repeatedly while mutating it, so every entry ended up with the
								        # last day/time.
								        for day, time in schedule:
								            self.encounters.append({
								                "pfic": pfic,
								                "game_id": game_id,
								                "type": "evolve",
								                "data": {
								                    "day": day,
								                    "time": time,
								                    "from_pfic": from_pfic,
								                }
								            })


								    def save_event_encounter(self, pfic, game):
								        """Append one "event" encounter for *pfic* in *game*."""
								        self.encounters.append({
								            "pfic": pfic,
								            "game_id": db.get_game_id_by_name(game),
								            "type": "event"
								        })


								    def save_encounter(self, pfic, game, location, days, times, dual_slot, static_encounter, static_encounter_count, extra_text, stars, rods, fishing, starter):
								        """Append location encounters for *pfic* to ``self.encounters``.

								        The type is "static" when *static_encounter* is set, else
								        "starter" when *starter* is set, else "random". One entry is added
								        per day when *days* is non-empty, otherwise one per time when
								        *times* is non-empty, otherwise a single entry with neither set.
								        """
								        game_id = db.get_game_id_by_name(game)
								        extra_text_str = ' '.join(extra_text) if extra_text else None
								        stars_str = ','.join(sorted(stars)) if stars else None
								        rods_str = ','.join(sorted(rods)) if rods else None

								        # "static" takes precedence over "starter".
								        if static_encounter:
								            encounter_type = "static"
								        elif starter:
								            encounter_type = "starter"
								        else:
								            encounter_type = "random"

								        if days:
								            schedule = [(day, None) for day in days]
								        elif times:
								            schedule = [(None, time) for time in times]
								        else:
								            schedule = [(None, None)]

								        # A fresh dict per entry: the old code appended one shared dict
								        # repeatedly while mutating it, so every entry ended up with the
								        # last day/time.
								        for day, time in schedule:
								            data = {
								                "location": location,
								                "day": day,
								                "time": time,
								                "dual_slot": dual_slot,
								                "extra_text": extra_text_str,
								                "stars": stars_str,
								                "rods": rods_str,
								                "fishing": fishing
								            }
								            if static_encounter:
								                data["static_encounter_count"] = static_encounter_count

								            self.encounters.append({
								                "pfic": pfic,
								                "game_id": game_id,
								                "type": encounter_type,
								                "data": data
								            })


								    def process_event_tables(self, events_section):
								        """Map each ``h5`` variant heading after *events_section* to its table.

								        The siblings following the parent of *events_section* are walked
								        until an ``h3`` or the end is reached. Each ``h5`` that has a
								        following sibling ``table`` with class ``roundy`` contributes one
								        entry keyed by its stripped text.
								        """
								        tables_by_variant = {}
								        if not events_section:
								            return tables_by_variant

								        node = events_section.parent.find_next_sibling()
								        while node and node.name != 'h3':
								            if node.name == 'h5':
								                roundy = node.find_next_sibling('table', class_='roundy')
								                if roundy:
								                    tables_by_variant[node.text.strip()] = roundy
								            node = node.find_next_sibling()

								        return tables_by_variant


								    def process_event_table(self, table, game_locations):
								        """Add the event distributions listed in *table* to *game_locations*.

								        Args:
								            table: Event table element; its first row is treated as the
								                header and rows with fewer than six cells are skipped.
								            game_locations: Dict of game name to location entries,
								                updated in place.
								        """
								        for row in table.find_all('tr')[1:]:  # Skip header row
								            cells = row.find_all('td')
								            if len(cells) < 6:  # Ensure all required columns are present
								                continue

								            # Extract game names as a list
								            individual_games = []
								            for link in cells[0].find_all('a'):
								                # Anchors without a title carry no game name; indexing
								                # them directly used to raise KeyError.
								                title = link.get('title')
								                if not title:
								                    continue

								                # Replace specific known prefixes
								                game_name = title.replace("Pokémon ", "").replace("Versions", "").replace(" Version", "").replace(" (Japanese)", "")

								                # Split on " and ", which is used for combined games.
								                # Removing "Versions" leaves a trailing space
								                # ("Red and Blue " -> "Blue "), so each name is stripped
								                # before it is matched and used as a dict key.
								                for parsed_name in game_name.split(" and "):
								                    parsed_name = parsed_name.strip()
								                    if parsed_name:
								                        individual_games.append(parsed_name)

								            # Print extracted game names for debugging
								            print(f"Extracted game names from row: {individual_games}")

								            # Keep only the names is_mainline_game() recognises
								            matching_games = [game for game in individual_games if is_mainline_game(game)]

								            # Print matching games for debugging
								            print(f"Matching games after filtering: {matching_games}")

								            if not matching_games:
								                continue

								            location = cells[2].text.strip()
								            for game in matching_games:
								                game_locations.setdefault(game, []).append({
								                    "location": f"Event: {location}",
								                    "tag": str(cells[2])
								                })