diff --git a/DataGatherers/DetermineOriginGame.py b/DataGatherers/DetermineOriginGame.py index 03f5abd..29daadf 100644 --- a/DataGatherers/DetermineOriginGame.py +++ b/DataGatherers/DetermineOriginGame.py @@ -386,19 +386,27 @@ def split_td_contents(td): def parse_form_information(html_content): soup = BeautifulSoup(html_content, 'html.parser') - form_info = soup.find('small') - - if form_info: - form_text = form_info.get_text(strip=True) - # Remove parentheses - form_text = form_text.strip('()') - - # Split the text into main form and breed (if present) - parts = form_text.split('(') - main_form = parts[0].strip() - breed = parts[1].strip(')') if len(parts) > 1 else None - - return main_form, breed + small_tag = soup.find('small') + + # Form info is in bold inside a small tag. + if small_tag: + bold_tag = small_tag.find('b') + if bold_tag: + form_text = bold_tag.get_text(strip=True) + # Remove parentheses + form_text = form_text.strip('()') + + # Split the text into main form and breed (if present) + parts = form_text.split('(') + main_form = parts[0].strip() + + # "Factor"s are not actual forms, they are properties of the pokemon you can encoutner. + if main_form and "factor" in main_form.lower(): + return None, None + + breed = parts[1].strip(')') if len(parts) > 1 else None + + return main_form, breed return None, None @@ -608,15 +616,15 @@ def get_locations_from_bulbapedia(pokemon_name, form, cache: CacheManager): if form is None: for raw_location in raw_locations: raw_text = raw_location.get_text() - raw_text = raw_text.replace(" and ", ",") - locations = raw_text.split(',') - for location in locations: - location = location.strip() - if location == "": - continue - if raw_game not in game_locations: - game_locations[raw_game] = [] - game_locations[raw_game].append(location.strip()) + main_form, sub_form = parse_form_information(str(raw_location)) + if main_form and (main_form != "All Forms" and main_form != "Kantonian Form"): + continue + if raw_game not in game_locations: + game_locations[raw_game] = [] + info = {} + info["location"] = raw_text + info["tag"] = str(raw_location) + game_locations[raw_game].append(info) else: for raw_location in raw_locations: main_form, sub_form = parse_form_information(str(raw_location)) @@ -630,14 +638,13 @@ def get_locations_from_bulbapedia(pokemon_name, form, cache: CacheManager): sub_form_match = False if not sub_form else fuzz.partial_ratio(form.lower(), sub_form.lower()) >= 80 if main_form_match or sub_form_match: - locations = raw_location.get_text().replace(' and ', ',').replace('#', '').split(',') - for location in locations: - location = location.strip() - if location == "": - continue - if raw_game not in game_locations: - game_locations[raw_game] = [] - game_locations[raw_game].append(location.strip()) + raw_text = raw_location.get_text() + if raw_game not in game_locations: + game_locations[raw_game] = [] + info = {} + info["location"] = raw_text + info["tag"] = str(raw_location) + game_locations[raw_game].append(info) # For Later for variant in event_tables: @@ -655,6 +662,9 @@ def get_locations_from_bulbapedia(pokemon_name, form, cache: CacheManager): return game_locations +def split_outside_brackets(str): + return re.split(r',(?![^()]*\))', str) + def handle_unown(pokemon, encounter_data): if not pokemon.name == "Unown": return diff --git a/DataGatherers/update_location_information.py b/DataGatherers/update_location_information.py index 42779da..0b8fbc2 100644 --- a/DataGatherers/update_location_information.py +++ b/DataGatherers/update_location_information.py @@ -1,6 +1,8 @@ import sqlite3 from cache_manager import CacheManager from DetermineOriginGame import get_locations_from_bulbapedia +from bs4 import BeautifulSoup, Tag +import re def create_encounters_table(): conn = sqlite3.connect('pokemon_forms.db') @@ -14,6 +16,106 @@ def create_encounters_table(): #conn.commit() return conn +def extract_routes(s): + # Find all route numbers, including those after "and" or separated by commas + route_pattern = r'Routes?\s+((?:\d+(?:,?\s+(?:and\s+)?)?)+)' + route_match = re.search(route_pattern, s, re.IGNORECASE) + + if route_match: + # Extract all numbers from the matched group + numbers = re.findall(r'\d+', route_match.group(1)) + + # Remove the extracted part from the original string + remaining = s[:route_match.start()] + s[route_match.end():].lstrip(', ') + + return numbers, remaining + else: + return [], s + +days = ["Mo", "Tu", "We", "Th", "Fr", "Sa", "Su"] +times = ["Morning", "Day", "Night"] + +all_games = [ + "Yellow", "Red", "Blue", + "Crystal", "Gold", "Silver", + "Emerald", "FireRed", "LeafGreen", "Ruby", "Sapphire", + "Platinum", "HeartGold", "SoulSilver", "Diamond", "Pearl", + "Black 2", "White 2", "Black", "White", + "X", "Y", "Omega Ruby", "Alpha Sapphire", + "Ultra Sun", "Ultra Moon", "Sun", "Moon", + "Sword", "Shield", "Expansion Pass", + "Brilliant Diamond", "Shining Pearl", + "Legends: Arceus", + "Scarlet", "Violet", "The Teal Mask", "The Hidden Treasure of Area Zero", "The Hidden Treasure of Area Zero (Scarlet)", "The Hidden Treasure of Area Zero (Violet)", "The Teal Mask (Scarlet)", "The Teal Mask (Violet)", + "Unknown", + "Pokémon Home", + "Pokémon Go", +] + +def find_match(search_string, string_array): + return next((item for item in string_array if search_string in item), None) + +def find_all_matches_from_array(string, array): + return [item for item in array if item in string] + +def extract_bracketed_text(string): + # This pattern matches text within parentheses, including nested parentheses + pattern = r'\((?:[^()]*|\([^()]*\))*\)' + + # Find all matches + matches = re.findall(pattern, string) + + # Remove the outer parentheses from each match + return [match[1:-1] for match in matches] + +def extract_additional_information(s): + soup = BeautifulSoup(s, 'html.parser') + full_text = soup.get_text(strip=True) + sup_tags = soup.find_all('sup') + sup_text = None + + details = {} + details["days"] = [] + details["times"] = [] + details["dual_slot"] = None + details["only_one"] = False + details["static_encounter"] = False + details["only_two"] = False + details["extra_text"] = None + details["stars"] = None + + for sup_tag in sup_tags: + sup_text = sup_tag.get_text(strip=True) + + if find_match(sup_text, days): + details["days"].append(sup_text) + + if find_match(sup_text, times): + details["times"].append(sup_text) + + bracket_text = extract_bracketed_text(full_text) + if len(bracket_text) > 0: + if bracket_text[0] in all_games: + details["dual_slot"] = bracket_text[0] + + if "Only One" in bracket_text: + details["only_one"] = True + details["static_encounter"] = True + + if "Only Two" in bracket_text: + details["only_two"] = True + details["static_encounter"] = True + + if "★" in bracket_text: + details["stars"] = bracket_text + + details["extra_text"] = " ".join(bracket_text) + + if sup_text: + return full_text.replace(sup_text, ""), details + else: + return full_text, details + if __name__ == "__main__": cache = CacheManager() @@ -33,16 +135,35 @@ if __name__ == "__main__": gender = form form = None - encounters_we_aren_t_interested_in = ["Trade", "Time Capsule", "Unobtainable"] + encounters_to_ignore = ["trade", "time capsule", "unobtainable", "evolve", "tradeversion", "poké transfer", "friend safari"] encounter_data = get_locations_from_bulbapedia(name, form, cache) for encounter in encounter_data: - print(f"Found in {encounter}:") + if len(encounter_data[encounter]) == 0: + continue + + print_encounter = True + for location in encounter_data[encounter]: - if location in encounters_we_aren_t_interested_in: + if location == "": continue - if "Evolve" in location: + test_location = location["location"].strip().lower() + + ignore_location = False + for ignore in encounters_to_ignore: + if ignore in test_location: + ignore_location = True + break + + if ignore_location: continue - if "TradeVersion" in location: - continue - print(f" {location}") \ No newline at end of file + + if print_encounter: + print(f"Found in {encounter}:") + print_encounter = False + + routes, remaining = extract_routes(location["location"].strip()) + print(f"Routes: {routes}") + remaining_locations, details = extract_additional_information(location["tag"]) + print(f"Remaining: {remaining_locations}") + print(f"Details: {details}")