Browse Source

- Large changes to the way i get pokemon data

master
Dan 1 year ago
parent
commit
850303826e
  1. 298
      Utilities/DetermineOriginGame.py

298
Utilities/DetermineOriginGame.py

@ -5,7 +5,8 @@ import json
import os
import re
import sqlite3
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag, NavigableString
import copy
# Initialize the database connection
conn = sqlite3.connect('pokemon_cache.db')
@ -26,13 +27,13 @@ all_games = [
"Crystal", "Gold", "Silver",
"Emerald", "FireRed", "LeafGreen", "Ruby", "Sapphire",
"Platinum", "HeartGold", "SoulSilver", "Diamond", "Pearl",
"Black-2", "White-2", "Black", "White",
"X", "Y", "Omega-Ruby", "Alpha-Sapphire",
"Ultra-Sun", "Ultra-Moon", "Sun", "Moon",
"Sword", "Shield",
"Brilliant-Diamond", "Shining-Pearl",
"Legends-Arceus",
"Scarlet", "Violet",
"Black 2", "White 2", "Black", "White",
"X", "Y", "Omega Ruby", "Alpha Sapphire",
"Ultra Sun", "Ultra Moon", "Sun", "Moon",
"Sword", "Shield", "Expansion Pass",
"Brilliant Diamond", "Shining Pearl",
"Legends: Arceus",
"Scarlet", "Violet", "The Teal Mask", "The Hidden Treasure of Area Zero",
"Unknown"
]
@ -59,7 +60,7 @@ def update_cache(key, value):
if key not in cache:
cache[key] = value
new_entries_count += 1
if new_entries_count >= 10:
if new_entries_count >= 1:
save_cached_data()
time.sleep(1)
@ -234,20 +235,192 @@ def get_pokemon_encounter_data(pokemon_name, form, cache):
else:
return None
def get_earliest_game(encounter_data):
def split_td_contents(td):
    """Split the children of a table cell into groups separated by ``<br>`` tags.

    Args:
        td: A BeautifulSoup element whose ``.contents`` are walked in order.

    Returns:
        A list of freshly created ``<div>`` tags. Each div holds copies of the
        consecutive child nodes found between two ``<br>`` tags (or between a
        ``<br>`` and either end of the cell). Runs with no nodes produce no
        div, so leading, trailing and doubled ``<br>`` tags are ignored.
    """
    def wrap(nodes):
        # Copies are appended rather than the nodes themselves; presumably
        # this keeps the source cell intact, since appending an attached
        # node to another tag would move it.
        wrapper = BeautifulSoup('', 'html.parser').new_tag('div')
        for node in nodes:
            wrapper.append(copy.copy(node))
        return wrapper

    wrapped = []
    pending = []
    for child in td.contents:
        is_break = isinstance(child, Tag) and child.name == 'br'
        if not is_break:
            pending.append(child)
            continue
        # A <br> closes the run collected so far (if there is one).
        if pending:
            wrapped.append(wrap(pending))
            pending = []

    # Flush whatever followed the last <br>.
    if pending:
        wrapped.append(wrap(pending))
    return wrapped
def parse_form_information(html_content):
    """Extract the form label held in the first ``<small>`` tag of an HTML snippet.

    Args:
        html_content: HTML markup (string) to parse.

    Returns:
        A ``(main_form, breed)`` tuple. ``main_form`` is the label text with
        surrounding parentheses removed, up to the first inner ``(``.
        ``breed`` is the text between the first and second inner ``(`` with
        trailing ``)`` removed, or ``None`` when there is no inner ``(``.
        Returns ``(None, None)`` when the snippet has no ``<small>`` tag.
    """
    small = BeautifulSoup(html_content, 'html.parser').find('small')
    if not small:
        return None, None

    # Drop the parentheses that wrap the whole label.
    label = small.get_text(strip=True).strip('()')

    # Anything after an inner "(" is treated as the breed / sub-form.
    pieces = label.split('(')
    breed = None
    if len(pieces) > 1:
        breed = pieces[1].strip(')')
    return pieces[0].strip(), breed
def _direct_child_rows(table):
    """Return the ``<tr>`` rows directly under *table*'s own ``<tbody>``.

    Returns an empty list when *table* is ``None`` or has no direct
    ``<tbody>``, so callers can iterate without further checks.
    """
    if table is None:
        return []
    tbody = table.find('tbody', recursive=False)
    if tbody is None:
        return []
    return tbody.find_all('tr', recursive=False)


def _nested_table(row):
    """Return the table directly inside the first direct ``<td>`` of *row*, or ``None``."""
    cell = row.find('td', recursive=False)
    if cell is None:
        return None
    return cell.find('table', recursive=False)


def _collect_raw_locations(locations_table):
    """Map each known game name to the ``<br>``-separated location groups listed for it."""
    # The table is a bit of a mess; it is built from several levels of nested tables:
    #   * each top-level row holds a nested table for one generation;
    #   * each row of that table holds another nested table (purpose unclear);
    #   * each row of *that* table has the game headers (a release pair, or a
    #     single game spanning two columns) next to a nested table of locations.
    raw_game_locations = {}
    for generation_row in _direct_child_rows(locations_table):
        for nested_row in _direct_child_rows(_nested_table(generation_row)):
            # Rows whose text mentions "Generation" are skipped (header rows).
            if 'Generation' in nested_row.get_text(strip=True):
                continue
            for games_row in _direct_child_rows(_nested_table(nested_row)):
                # The location cells belong to the row, not to a single game
                # header, so they are gathered once and shared by every header.
                locations_container = _nested_table(games_row)
                if locations_container is None:
                    continue
                locations_tbody = locations_container.find('tbody', recursive=False)
                if locations_tbody is None:
                    continue
                groups = [
                    group
                    for cell in locations_tbody.find_all('td')
                    for group in split_td_contents(cell)
                ]
                if not groups:
                    continue
                for header in games_row.find_all('th'):
                    raw_game = header.get_text(strip=True)
                    if raw_game in all_games:
                        raw_game_locations.setdefault(raw_game, []).extend(groups)
    return raw_game_locations


def _collect_event_tables(soup):
    """Return the tables of the ``In_events`` section keyed by their ``<h5>`` heading text."""
    event_tables = {}
    events_section = soup.find('span', id='In_events')
    if events_section is None:
        return event_tables
    variant = ""
    for sibling in events_section.parent.find_next_siblings():
        if sibling.name == 'h4':
            # The next h4 heading ends the events section.
            break
        if sibling.name == 'h5':
            variant = sibling.get_text(strip=True)
        elif sibling.name == 'table':
            event_tables[variant] = sibling
    return event_tables


def _games_named_in(title):
    """Return the entries of ``all_games`` that appear in *title* as whole names.

    Longer names are matched first and removed from the text, so "Black 2"
    does not also count as "Black". Word boundaries stop "Y" matching
    "Yellow" or "Gold" matching "HeartGold". The result keeps the order
    of ``all_games``.
    """
    remaining = title
    found = set()
    for game in sorted(all_games, key=len, reverse=True):
        pattern = r'(?<!\w)' + re.escape(game) + r'(?!\w)'
        remaining, hits = re.subn(pattern, ' ', remaining)
        if hits:
            found.add(game)
    return [game for game in all_games if game in found]


def get_locations_from_bulbapedia(pokemon_name, form, cache):
    """Collect per-game location strings for a Pokémon from its Bulbapedia page.

    Args:
        pokemon_name: Name passed to ``get_pokemon_data_bulbapedia`` and
            compared against the event-table headings.
        form: Form label to filter on, or ``None`` to keep every location.
            When given, only location groups whose ``<small>`` label (as
            parsed by ``parse_form_information``) equals ``form`` are kept.
        cache: Passed through to ``get_pokemon_data_bulbapedia``.

    Returns:
        A dict mapping a game name from ``all_games`` to a list of non-empty
        location strings (comma-separated entries of each location group).
        Games named in a matching event table get the entry ``"Event"`` added
        to their list. Returns ``None`` when the page, the ``Game_locations``
        section, or its table cannot be found.
    """
    page_data = get_pokemon_data_bulbapedia(pokemon_name, cache)
    if not page_data:
        return None

    soup = BeautifulSoup(page_data, 'html.parser')
    locations_section = soup.find('span', id='Game_locations')
    if not locations_section:
        return None
    locations_table = locations_section.find_next('table', class_='roundy')
    if not locations_table:
        return None

    raw_game_locations = _collect_raw_locations(locations_table)
    event_tables = _collect_event_tables(soup)

    game_locations = {}
    for raw_game, raw_locations in raw_game_locations.items():
        for raw_location in raw_locations:
            if form is not None:
                main_form, _sub_form = parse_form_information(str(raw_location))
                if main_form != form:
                    continue
            for location in raw_location.get_text().split(','):
                location = location.strip()
                # Blank entries (stray commas, whitespace-only groups) carry no
                # information and would later be read as ordinary locations.
                if location:
                    game_locations.setdefault(raw_game, []).append(location)

    for variant, table in event_tables.items():
        if not (variant == pokemon_name or (form and form in variant)):
            continue
        for game_row in table.find_all('tr'):
            entries = game_row.find_all('td')
            if len(entries) <= 1:
                continue
            link = entries[0].find('a')
            title = link.get('title') if link is not None else None
            if not title:
                continue
            for game in _games_named_in(title):
                # Keep the value a list so consumers that iterate locations see
                # the single entry "Event" rather than its individual letters,
                # and so locations already found for the game are not lost.
                locations = game_locations.setdefault(game, [])
                if "Event" not in locations:
                    locations.append("Event")

    return game_locations
def get_earliest_game(encounter_data, pokemon_name, form):
if not encounter_data:
return "Unknown", "Unknown"
non_catchable_methods = ["trade", "event", "global link", "poké transfer", "time capsule", "unobtainable", "pokémon home"]
game_methods = {}
for location_area in encounter_data:
for version_detail in location_area['version_details']:
game = version_detail['version']['name']
is_gift = any(method['method']['name'] == 'gift' for method in version_detail['encounter_details'])
for game, locations in encounter_data.items():
for location in locations:
method = "Catchable"
for non_catchable in non_catchable_methods:
if non_catchable in location.lower():
method = None
break
if method is None:
continue
if "first partner" in location.lower():
method = "Starter"
elif "received" in location.lower():
method = "Gift"
elif "evolve" in location.lower():
method = "Evolve"
else:
method = "Catchable"
if method:
if game not in game_methods:
game_methods[game] = "Gift" if is_gift else "Catchable"
elif game_methods[game] == "Gift" and not is_gift:
game_methods[game] = "Catchable"
game_methods[game.lower()] = method
else:
if method == "Catchable":
game_methods[game.lower()] = method
for game in all_games:
if game.lower() in game_methods:
@ -257,10 +430,14 @@ def get_earliest_game(encounter_data):
def determine_earliest_games(pokemon_list, cache):
    """Fill in the earliest game and obtain method for every Pokémon in the list.

    Args:
        pokemon_list: Iterable of dicts with at least the keys ``name``,
            ``number``, ``base_name`` and ``form``. Each dict is updated in
            place with ``earliest_game`` and ``obtain_method``.
        cache: Passed through to ``get_locations_from_bulbapedia``.

    Returns:
        The same ``pokemon_list`` object, after the in-place updates.
    """
    # NOTE: location data now comes from get_locations_from_bulbapedia; the
    # earlier get_pokemon_data / get_pokemon_encounter_data lookups are no
    # longer made here, and get_earliest_game is called with its three
    # arguments (encounter data, base name, form).
    for pokemon in pokemon_list:
        print(f"Processing {pokemon['name']} (#{pokemon['number']})")
        encounter_data = get_locations_from_bulbapedia(pokemon['base_name'], pokemon['form'], cache)
        pokemon['earliest_game'], pokemon['obtain_method'] = get_earliest_game(
            encounter_data, pokemon['base_name'], pokemon['form']
        )
        print(f"Processed {pokemon['name']} (#{pokemon['number']}): {pokemon['earliest_game']} ({pokemon['obtain_method']})")
    return pokemon_list
def get_species_data(pokemon_name, cache):
@ -426,85 +603,6 @@ def is_event_pokemon(pokemon_name, cache):
return False
def get_locations_from_bulbapedia(pokemon_name, form, cache):
# Scrape the "Game locations" table of a Pokémon's Bulbapedia page and return a
# dict mapping each game header's text to a location string.
# Returns None when the page data, the Game_locations section, or its table is missing.
# `form` is accepted but not used anywhere in this version of the function.
# NOTE(review): a function with the same name appears earlier in this listing
# (it filters by form and records event games); this one looks like the
# pre-change version shown as removed lines in the diff - confirm before reuse.
page_data = get_pokemon_data_bulbapedia(pokemon_name, cache)
if not page_data:
return None
soup = BeautifulSoup(page_data, 'html.parser')
locations_section = soup.find('span', id='Game_locations')
if not locations_section:
return None
locations_table = locations_section.find_next('table', class_='roundy')
if not locations_table:
return None
game_locations = {}
# The table is a bit of a mess: it is built from several levels of nested tables.
# Each top-level row holds a nested table with all the games in a generation.
# Inside that is another nested table whose purpose is unclear.
# Inside that is yet another nested table with the games: either a release pair, or a single game spanning two columns.
# Next to the games is another nested table with the locations.
generation_tbody = locations_table.find('tbody', recursive=False)
generation_rows = generation_tbody.find_all('tr', recursive=False)
for generation_row in generation_rows:
random_nested_td = generation_row.find('td', recursive=False)
if not random_nested_td:
continue
random_nested_table = random_nested_td.find('table', recursive=False)
if not random_nested_table:
continue
random_nested_tbody = random_nested_table.find('tbody', recursive=False)
random_nested_rows = random_nested_tbody.find_all('tr', recursive=False)
for nested_row in random_nested_rows:
# Rows whose text contains "Generation" are skipped.
if 'Generation' in nested_row.get_text(strip=True):
continue
games_container_td = nested_row.find('td', recursive=False)
if not games_container_td:
continue
games_container_table = games_container_td.find('table', recursive=False)
if not games_container_table:
continue
games_container_tbody = games_container_table.find('tbody', recursive=False)
games_container_rows = games_container_tbody.find_all('tr', recursive=False)
for games_container_row in games_container_rows:
games = games_container_row.find_all('th')
for game in games:
locations_container_td = games_container_row.find('td', recursive=False)
if not locations_container_td:
continue
locations_container_table = locations_container_td.find('table', recursive=False)
if not locations_container_table:
continue
locations_container_tbody = locations_container_table.find('tbody', recursive=False)
locations = locations_container_tbody.find_all('td')
for location in locations:
# Each cell overwrites the previous value stored for this game, so only
# the text of the last cell is kept per game.
game_locations[game.get_text(strip=True)] = location.get_text()
print(f'{game.get_text(strip=True)}: {location.get_text()}')
# Event tables are collected by their h5 heading, but only the headings are
# printed; nothing from the events section is added to the returned dict.
events_section = soup.find('span', id='In_events')
if events_section:
event_header = events_section.parent
tables = {}
variant = ""
for sibling in event_header.find_next_siblings():
# The next h4 heading ends the events section.
if sibling.name == 'h4':
break
if sibling.name == 'h5':
variant = sibling.get_text(strip=True)
if sibling.name == 'table':
tables[variant] = sibling
for variant in tables:
print(variant)
return game_locations
def check_alternative_sources(pokemon, cache):
# This function will check alternative sources for Pokémon with "Unknown" encounter types
species_data = get_species_data(pokemon['base_name'], cache)
@ -551,7 +649,7 @@ def handle_unknown_encounters(pokemon_list, cache):
if __name__ == "__main__":
get_cached_data()
pokemon_list = read_pokemon_list('pokemon_home_list.csv', limit=3000)
pokemon_list = read_pokemon_list('pokemon_home_list.csv', limit=151)
pokemon_list_with_games = determine_earliest_games(pokemon_list, cache)
pokemon_list_adjusted = adjust_for_evolution(pokemon_list_with_games, cache)
pokemon_list_with_locations = add_encounter_locations(pokemon_list_adjusted, cache)

Loading…
Cancel
Save