From 850303826e9bc2cc6e06a29bbe0965ee8c546bb5 Mon Sep 17 00:00:00 2001 From: Dan Date: Mon, 30 Sep 2024 16:07:46 +0100 Subject: [PATCH] - Large changes to the way i get pokemon data --- Utilities/DetermineOriginGame.py | 302 ++++++++++++++++++++----------- 1 file changed, 200 insertions(+), 102 deletions(-) diff --git a/Utilities/DetermineOriginGame.py b/Utilities/DetermineOriginGame.py index 8fde465..7a5fd88 100644 --- a/Utilities/DetermineOriginGame.py +++ b/Utilities/DetermineOriginGame.py @@ -5,7 +5,8 @@ import json import os import re import sqlite3 -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag, NavigableString +import copy # Initialize the database connection conn = sqlite3.connect('pokemon_cache.db') @@ -26,13 +27,13 @@ all_games = [ "Crystal", "Gold", "Silver", "Emerald", "FireRed", "LeafGreen", "Ruby", "Sapphire", "Platinum", "HeartGold", "SoulSilver", "Diamond", "Pearl", - "Black-2", "White-2", "Black", "White", - "X", "Y", "Omega-Ruby", "Alpha-Sapphire", - "Ultra-Sun", "Ultra-Moon", "Sun", "Moon", - "Sword", "Shield", - "Brilliant-Diamond", "Shining-Pearl", - "Legends-Arceus", - "Scarlet", "Violet", + "Black 2", "White 2", "Black", "White", + "X", "Y", "Omega Ruby", "Alpha Sapphire", + "Ultra Sun", "Ultra Moon", "Sun", "Moon", + "Sword", "Shield", "Expansion Pass", + "Brilliant Diamond", "Shining Pearl", + "Legends: Arceus", + "Scarlet", "Violet", "The Teal Mask", "The Hidden Treasure of Area Zero", "Unknown" ] @@ -59,7 +60,7 @@ def update_cache(key, value): if key not in cache: cache[key] = value new_entries_count += 1 - if new_entries_count >= 10: + if new_entries_count >= 1: save_cached_data() time.sleep(1) @@ -233,21 +234,193 @@ def get_pokemon_encounter_data(pokemon_name, form, cache): return data else: return None + +def split_td_contents(td): + groups = [] + current_group = [] + + for content in td.contents: + if isinstance(content, Tag) and content.name == 'br': + if current_group: + groups.append(BeautifulSoup('', 'html.parser').new_tag('div')) + for item in current_group: + groups[-1].append(copy.copy(item)) + current_group = [] + else: + current_group.append(content) + + if current_group: + groups.append(BeautifulSoup('', 'html.parser').new_tag('div')) + for item in current_group: + groups[-1].append(copy.copy(item)) + + return groups + +def parse_form_information(html_content): + soup = BeautifulSoup(html_content, 'html.parser') + form_info = soup.find('small') + + if form_info: + form_text = form_info.get_text(strip=True) + # Remove parentheses + form_text = form_text.strip('()') + + # Split the text into main form and breed (if present) + parts = form_text.split('(') + main_form = parts[0].strip() + breed = parts[1].strip(')') if len(parts) > 1 else None + + return main_form, breed + + return None, None + +def get_locations_from_bulbapedia(pokemon_name, form, cache): + page_data = get_pokemon_data_bulbapedia(pokemon_name, cache) + if not page_data: + return None + + soup = BeautifulSoup(page_data, 'html.parser') + + locations_section = soup.find('span', id='Game_locations') + if not locations_section: + return None + + locations_table = locations_section.find_next('table', class_='roundy') + if not locations_table: + return None + + raw_game_locations = {} + + # Ok so the table is a bit of a mess. It has some nested tables and stuff. + # In each row is a nested table with all the games in a generation. + # Next is another nexted table, but i can't tell what for. + # within that nested table, is another nested table with the games, either the release pair or a single game spanning two columns. + # Next to that is another nested table with the locations. + + generation_tbody = locations_table.find('tbody', recursive=False) + generation_rows = generation_tbody.find_all('tr', recursive=False) + for generation_row in generation_rows: + random_nested_td = generation_row.find('td', recursive=False) + if not random_nested_td: + continue + random_nested_table = random_nested_td.find('table', recursive=False) + if not random_nested_table: + continue + random_nested_tbody = random_nested_table.find('tbody', recursive=False) + random_nested_rows = random_nested_tbody.find_all('tr', recursive=False) + for nested_row in random_nested_rows: + if 'Generation' in nested_row.get_text(strip=True): + continue + + games_container_td = nested_row.find('td', recursive=False) + if not games_container_td: + continue + games_container_table = games_container_td.find('table', recursive=False) + if not games_container_table: + continue + games_container_tbody = games_container_table.find('tbody', recursive=False) + games_container_rows = games_container_tbody.find_all('tr', recursive=False) + for games_container_row in games_container_rows: + games = games_container_row.find_all('th') + for game in games: + raw_game = game.get_text(strip=True) + if raw_game not in all_games: + continue + locations_container_td = games_container_row.find('td', recursive=False) + if not locations_container_td: + continue + locations_container_table = locations_container_td.find('table', recursive=False) + if not locations_container_table: + continue + locations_container_tbody = locations_container_table.find('tbody', recursive=False) + locations = locations_container_tbody.find_all('td') + for location in locations: + groups = split_td_contents(location) + for group in groups: + if raw_game not in raw_game_locations: + raw_game_locations[raw_game] = [] + raw_game_locations[raw_game].append(group) + + events_section = soup.find('span', id='In_events') + event_tables = {} + if events_section: + event_header = events_section.parent + + variant = "" + for sibling in event_header.find_next_siblings(): + if sibling.name == 'h4': + break + if sibling.name == 'h5': + variant = sibling.get_text(strip=True) + if sibling.name == 'table': + event_tables[variant] = sibling + + game_locations = {} + for raw_game, raw_locations in raw_game_locations.items(): + if form is None: + for raw_location in raw_locations: + locations = raw_location.get_text().split(',') + for location in locations: + if raw_game not in game_locations: + game_locations[raw_game] = [] + game_locations[raw_game].append(location.strip()) + else: + for raw_location in raw_locations: + main_form, sub_form = parse_form_information(str(raw_location)) + if main_form == form: + locations = raw_location.get_text().split(',') + for location in locations: + if raw_game not in game_locations: + game_locations[raw_game] = [] + game_locations[raw_game].append(location.strip()) + + # For Later + for variant in event_tables: + if variant == pokemon_name or (form and form in variant): + games_container_rows = event_tables[variant].find_all('tr') + for game_row in games_container_rows: + entries = game_row.find_all('td') + if len(entries) > 1: + games_string = entries[0].find('a').get('title') + for game in all_games: + if game in games_string: + game_locations[game] = "Event" + + return game_locations -def get_earliest_game(encounter_data): +def get_earliest_game(encounter_data, pokemon_name, form): if not encounter_data: return "Unknown", "Unknown" + non_catchable_methods = ["trade", "event", "global link", "poké transfer", "time capsule", "unobtainable", "pokémon home"] + game_methods = {} - for location_area in encounter_data: - for version_detail in location_area['version_details']: - game = version_detail['version']['name'] - is_gift = any(method['method']['name'] == 'gift' for method in version_detail['encounter_details']) + for game, locations in encounter_data.items(): + for location in locations: + method = "Catchable" + + for non_catchable in non_catchable_methods: + if non_catchable in location.lower(): + method = None + break - if game not in game_methods: - game_methods[game] = "Gift" if is_gift else "Catchable" - elif game_methods[game] == "Gift" and not is_gift: - game_methods[game] = "Catchable" + if method is None: + continue + + if "first partner" in location.lower(): + method = "Starter" + elif "received" in location.lower(): + method = "Gift" + elif "evolve" in location.lower(): + method = "Evolve" + else: + method = "Catchable" + if method: + if game not in game_methods: + game_methods[game.lower()] = method + else: + if method == "Catchable": + game_methods[game.lower()] = method for game in all_games: if game.lower() in game_methods: @@ -257,10 +430,14 @@ def get_earliest_game(encounter_data): def determine_earliest_games(pokemon_list, cache): for pokemon in pokemon_list: - pokemon_data = get_pokemon_data(pokemon['base_name'], pokemon['form'], cache) - encounter_data = get_pokemon_encounter_data(pokemon['base_name'], pokemon['form'], cache) - pokemon['earliest_game'], pokemon['obtain_method'] = get_earliest_game(encounter_data) + print(f"Processing {pokemon['name']} (#{pokemon['number']})") + encounter_data = get_locations_from_bulbapedia(pokemon['base_name'], pokemon['form'], cache) + pokemon['earliest_game'], pokemon['obtain_method'] = get_earliest_game(encounter_data, pokemon['base_name'], pokemon['form']) print(f"Processed {pokemon['name']} (#{pokemon['number']}): {pokemon['earliest_game']} ({pokemon['obtain_method']})") + #pokemon_data = get_pokemon_data(pokemon['base_name'], pokemon['form'], cache) + #encounter_data = get_pokemon_encounter_data(pokemon['base_name'], pokemon['form'], cache) + #pokemon['earliest_game'], pokemon['obtain_method'] = get_earliest_game(encounter_data) + #print(f"Processed {pokemon['name']} (#{pokemon['number']}): {pokemon['earliest_game']} ({pokemon['obtain_method']})") return pokemon_list def get_species_data(pokemon_name, cache): @@ -424,86 +601,7 @@ def is_event_pokemon(pokemon_name, cache): if len(location_tables) == 0 and special_section and event_only in special_section.get_text(strip=True).lower(): return True - return False - -def get_locations_from_bulbapedia(pokemon_name, form, cache): - page_data = get_pokemon_data_bulbapedia(pokemon_name, cache) - if not page_data: - return None - - soup = BeautifulSoup(page_data, 'html.parser') - - locations_section = soup.find('span', id='Game_locations') - if not locations_section: - return None - - locations_table = locations_section.find_next('table', class_='roundy') - if not locations_table: - return None - - game_locations = {} - - # Ok so the table is a bit of a mess. It has some nested tables and stuff. - # In each row is a nested table with all the games in a generation. - # Next is another nexted table, but i can't tell what for. - # within that nested table, is another nested table with the games, either the release pair or a single game spanning two columns. - # Next to that is another nested table with the locations. - - generation_tbody = locations_table.find('tbody', recursive=False) - generation_rows = generation_tbody.find_all('tr', recursive=False) - for generation_row in generation_rows: - random_nested_td = generation_row.find('td', recursive=False) - if not random_nested_td: - continue - random_nested_table = random_nested_td.find('table', recursive=False) - if not random_nested_table: - continue - random_nested_tbody = random_nested_table.find('tbody', recursive=False) - random_nested_rows = random_nested_tbody.find_all('tr', recursive=False) - for nested_row in random_nested_rows: - if 'Generation' in nested_row.get_text(strip=True): - continue - - games_container_td = nested_row.find('td', recursive=False) - if not games_container_td: - continue - games_container_table = games_container_td.find('table', recursive=False) - if not games_container_table: - continue - games_container_tbody = games_container_table.find('tbody', recursive=False) - games_container_rows = games_container_tbody.find_all('tr', recursive=False) - for games_container_row in games_container_rows: - games = games_container_row.find_all('th') - for game in games: - locations_container_td = games_container_row.find('td', recursive=False) - if not locations_container_td: - continue - locations_container_table = locations_container_td.find('table', recursive=False) - if not locations_container_table: - continue - locations_container_tbody = locations_container_table.find('tbody', recursive=False) - locations = locations_container_tbody.find_all('td') - for location in locations: - game_locations[game.get_text(strip=True)] = location.get_text() - print(f'{game.get_text(strip=True)}: {location.get_text()}') - - events_section = soup.find('span', id='In_events') - if events_section: - event_header = events_section.parent - tables = {} - variant = "" - for sibling in event_header.find_next_siblings(): - if sibling.name == 'h4': - break - if sibling.name == 'h5': - variant = sibling.get_text(strip=True) - if sibling.name == 'table': - tables[variant] = sibling - for variant in tables: - print(variant) - - return game_locations - + return False def check_alternative_sources(pokemon, cache): # This function will check alternative sources for Pokémon with "Unknown" encounter types @@ -551,7 +649,7 @@ def handle_unknown_encounters(pokemon_list, cache): if __name__ == "__main__": get_cached_data() - pokemon_list = read_pokemon_list('pokemon_home_list.csv', limit=3000) + pokemon_list = read_pokemon_list('pokemon_home_list.csv', limit=151) pokemon_list_with_games = determine_earliest_games(pokemon_list, cache) pokemon_list_adjusted = adjust_for_evolution(pokemon_list_with_games, cache) pokemon_list_with_locations = add_encounter_locations(pokemon_list_adjusted, cache)