|
|
|
@ -5,7 +5,8 @@ import json |
|
|
|
import os |
|
|
|
import re |
|
|
|
import sqlite3 |
|
|
|
from bs4 import BeautifulSoup |
|
|
|
from bs4 import BeautifulSoup, Tag, NavigableString |
|
|
|
import copy |
|
|
|
|
|
|
|
# Initialize the database connection |
|
|
|
conn = sqlite3.connect('pokemon_cache.db') |
|
|
|
@ -26,13 +27,13 @@ all_games = [ |
|
|
|
"Crystal", "Gold", "Silver", |
|
|
|
"Emerald", "FireRed", "LeafGreen", "Ruby", "Sapphire", |
|
|
|
"Platinum", "HeartGold", "SoulSilver", "Diamond", "Pearl", |
|
|
|
"Black-2", "White-2", "Black", "White", |
|
|
|
"X", "Y", "Omega-Ruby", "Alpha-Sapphire", |
|
|
|
"Ultra-Sun", "Ultra-Moon", "Sun", "Moon", |
|
|
|
"Sword", "Shield", |
|
|
|
"Brilliant-Diamond", "Shining-Pearl", |
|
|
|
"Legends-Arceus", |
|
|
|
"Scarlet", "Violet", |
|
|
|
"Black 2", "White 2", "Black", "White", |
|
|
|
"X", "Y", "Omega Ruby", "Alpha Sapphire", |
|
|
|
"Ultra Sun", "Ultra Moon", "Sun", "Moon", |
|
|
|
"Sword", "Shield", "Expansion Pass", |
|
|
|
"Brilliant Diamond", "Shining Pearl", |
|
|
|
"Legends: Arceus", |
|
|
|
"Scarlet", "Violet", "The Teal Mask", "The Hidden Treasure of Area Zero", |
|
|
|
"Unknown" |
|
|
|
] |
|
|
|
|
|
|
|
@ -59,7 +60,7 @@ def update_cache(key, value): |
|
|
|
if key not in cache: |
|
|
|
cache[key] = value |
|
|
|
new_entries_count += 1 |
|
|
|
if new_entries_count >= 10: |
|
|
|
if new_entries_count >= 1: |
|
|
|
save_cached_data() |
|
|
|
time.sleep(1) |
|
|
|
|
|
|
|
@ -234,20 +235,192 @@ def get_pokemon_encounter_data(pokemon_name, form, cache): |
|
|
|
else: |
|
|
|
return None |
|
|
|
|
|
|
|
def get_earliest_game(encounter_data): |
|
|
|
def split_td_contents(td):
    """Split the children of a table cell into ``<br>``-separated groups.

    Each run of child nodes between two ``<br>`` tags (or between a cell
    edge and a ``<br>``) is copied into a fresh, detached ``<div>`` so the
    caller can treat every line of the cell as its own small document.

    Args:
        td: A BeautifulSoup ``Tag`` whose direct children are inspected.

    Returns:
        A list of detached ``<div>`` tags, one per non-blank group, in
        document order. The nodes of ``td`` itself are not moved; copies
        are appended to the wrappers.
    """
    # One factory is enough: new_tag() returns a detached tag every time.
    factory = BeautifulSoup('', 'html.parser')
    groups = []
    pending = []

    def flush():
        # A run made only of whitespace text (e.g. the newline between two
        # consecutive <br> tags) carries no location and would otherwise
        # surface downstream as an empty location string.
        has_content = any(
            isinstance(node, Tag) or str(node).strip() for node in pending
        )
        if has_content:
            wrapper = factory.new_tag('div')
            for node in pending:
                # Copy so the original tree keeps its children.
                wrapper.append(copy.copy(node))
            groups.append(wrapper)
        pending.clear()

    for node in td.contents:
        if isinstance(node, Tag) and node.name == 'br':
            flush()
        else:
            pending.append(node)

    # The last group has no trailing <br> to close it.
    flush()
    return groups
|
|
|
|
|
|
|
def parse_form_information(html_content):
    """Read the form label (and optional breed) from a location snippet.

    The label is taken from the first ``<small>`` element found in
    ``html_content``. Parenthesis characters wrapping the label are
    removed, and an inner parenthesised part, if any, is reported
    separately as the breed.

    Args:
        html_content: HTML markup (string) for a single location group.

    Returns:
        A ``(main_form, breed)`` tuple. ``breed`` is ``None`` when the label
        has no inner parenthesised part; both values are ``None`` when the
        markup contains no ``<small>`` element.
    """
    small_tag = BeautifulSoup(html_content, 'html.parser').find('small')
    if not small_tag:
        return None, None

    # Drop the parentheses surrounding the whole label.
    label = small_tag.get_text(strip=True).strip('()')

    # Anything after the first remaining "(" is the breed.
    pieces = label.split('(')
    main_form = pieces[0].strip()
    if len(pieces) > 1:
        breed = pieces[1].strip(')')
    else:
        breed = None

    return main_form, breed
|
|
|
|
|
|
|
def _bulbapedia_direct_rows(table):
    """Return the top-level ``<tr>`` rows of ``table``, with or without a ``<tbody>``."""
    body = table.find('tbody', recursive=False)
    if body is None:
        # The markup may not carry an explicit <tbody>; fall back to the table.
        body = table
    return body.find_all('tr', recursive=False)


def _bulbapedia_nested_table(row):
    """Return the table directly inside the first direct ``<td>`` of ``row``, or None."""
    cell = row.find('td', recursive=False)
    if cell is None:
        return None
    return cell.find('table', recursive=False)


def _bulbapedia_raw_game_locations(locations_table):
    """Map each known game title to the ``<br>``-separated location groups listed for it."""
    # The table is a bit of a mess; it is built from nested tables:
    #   * each outer row holds a nested table with all games of a generation,
    #   * each row of that table holds another nested table (purpose unclear),
    #   * inside it, each row has the game headers (a release pair, or a
    #     single game spanning two columns) next to a nested locations table.
    raw_game_locations = {}
    for generation_row in _bulbapedia_direct_rows(locations_table):
        generation_table = _bulbapedia_nested_table(generation_row)
        if generation_table is None:
            continue
        for nested_row in _bulbapedia_direct_rows(generation_table):
            # Rows mentioning "Generation" are headings, not game rows.
            if 'Generation' in nested_row.get_text(strip=True):
                continue
            games_table = _bulbapedia_nested_table(nested_row)
            if games_table is None:
                continue
            for games_row in _bulbapedia_direct_rows(games_table):
                locations_container = _bulbapedia_nested_table(games_row)
                if locations_container is None:
                    continue
                locations_body = locations_container.find('tbody', recursive=False)
                if locations_body is None:
                    locations_body = locations_container
                # The groups are the same for every game header in this row,
                # so build them once instead of once per header.
                groups = [
                    group
                    for cell in locations_body.find_all('td')
                    for group in split_td_contents(cell)
                ]
                if not groups:
                    continue
                for header in games_row.find_all('th'):
                    raw_game = header.get_text(strip=True)
                    if raw_game not in all_games:
                        continue
                    raw_game_locations.setdefault(raw_game, []).extend(groups)
    return raw_game_locations


def _bulbapedia_event_tables(soup):
    """Map each ``<h5>`` variant heading under "In events" to the table that follows it."""
    event_tables = {}
    events_section = soup.find('span', id='In_events')
    if events_section is None:
        return event_tables

    variant = ""
    for sibling in events_section.parent.find_next_siblings():
        if sibling.name == 'h4':
            # The next <h4> starts a different section.
            break
        if sibling.name == 'h5':
            variant = sibling.get_text(strip=True)
        if sibling.name == 'table':
            event_tables[variant] = sibling
    return event_tables


def get_locations_from_bulbapedia(pokemon_name, form, cache):
    """Collect per-game locations for a Pokémon from its Bulbapedia page.

    Args:
        pokemon_name: Name passed to ``get_pokemon_data_bulbapedia`` and
            compared against event-table variant headings.
        form: Form label to filter on, or ``None`` to accept every listed
            location. When given, only location groups whose ``<small>``
            label (see ``parse_form_information``) equals ``form`` are kept.
        cache: Passed through to ``get_pokemon_data_bulbapedia``.

    Returns:
        ``None`` when the page, the "Game locations" section or its table
        cannot be found. Otherwise a dict mapping game title to a list of
        location strings. Games found in a matching event table get the
        string ``"Event"`` appended to their list. It is kept as a list
        entry because ``get_earliest_game`` iterates each value item by item
        and a bare string would be walked character by character.
    """
    page_data = get_pokemon_data_bulbapedia(pokemon_name, cache)
    if not page_data:
        return None

    soup = BeautifulSoup(page_data, 'html.parser')

    locations_section = soup.find('span', id='Game_locations')
    if locations_section is None:
        return None

    locations_table = locations_section.find_next('table', class_='roundy')
    if locations_table is None:
        return None

    raw_game_locations = _bulbapedia_raw_game_locations(locations_table)
    event_tables = _bulbapedia_event_tables(soup)

    game_locations = {}
    for raw_game, raw_locations in raw_game_locations.items():
        for raw_location in raw_locations:
            if form is not None:
                main_form, _breed = parse_form_information(str(raw_location))
                if main_form != form:
                    continue
            for location in raw_location.get_text().split(','):
                location = location.strip()
                # Skip blanks left by trailing commas or empty groups; an
                # empty string is not a location.
                if location:
                    game_locations.setdefault(raw_game, []).append(location)

    for variant, event_table in event_tables.items():
        if not (variant == pokemon_name or (form and form in variant)):
            continue
        for game_row in event_table.find_all('tr'):
            entries = game_row.find_all('td')
            if len(entries) <= 1:
                continue
            link = entries[0].find('a')
            games_string = link.get('title') if link is not None else None
            if not games_string:
                # No titled link in the first cell: nothing to match against.
                continue
            for game in all_games:
                if game in games_string:
                    listed = game_locations.setdefault(game, [])
                    if "Event" not in listed:
                        listed.append("Event")

    return game_locations
|
|
|
|
|
|
|
def get_earliest_game(encounter_data, pokemon_name, form): |
|
|
|
if not encounter_data: |
|
|
|
return "Unknown", "Unknown" |
|
|
|
|
|
|
|
non_catchable_methods = ["trade", "event", "global link", "poké transfer", "time capsule", "unobtainable", "pokémon home"] |
|
|
|
|
|
|
|
game_methods = {} |
|
|
|
for location_area in encounter_data: |
|
|
|
for version_detail in location_area['version_details']: |
|
|
|
game = version_detail['version']['name'] |
|
|
|
is_gift = any(method['method']['name'] == 'gift' for method in version_detail['encounter_details']) |
|
|
|
for game, locations in encounter_data.items(): |
|
|
|
for location in locations: |
|
|
|
method = "Catchable" |
|
|
|
|
|
|
|
for non_catchable in non_catchable_methods: |
|
|
|
if non_catchable in location.lower(): |
|
|
|
method = None |
|
|
|
break |
|
|
|
|
|
|
|
if method is None: |
|
|
|
continue |
|
|
|
|
|
|
|
if "first partner" in location.lower(): |
|
|
|
method = "Starter" |
|
|
|
elif "received" in location.lower(): |
|
|
|
method = "Gift" |
|
|
|
elif "evolve" in location.lower(): |
|
|
|
method = "Evolve" |
|
|
|
else: |
|
|
|
method = "Catchable" |
|
|
|
if method: |
|
|
|
if game not in game_methods: |
|
|
|
game_methods[game] = "Gift" if is_gift else "Catchable" |
|
|
|
elif game_methods[game] == "Gift" and not is_gift: |
|
|
|
game_methods[game] = "Catchable" |
|
|
|
game_methods[game.lower()] = method |
|
|
|
else: |
|
|
|
if method == "Catchable": |
|
|
|
game_methods[game.lower()] = method |
|
|
|
|
|
|
|
for game in all_games: |
|
|
|
if game.lower() in game_methods: |
|
|
|
@ -257,10 +430,14 @@ def get_earliest_game(encounter_data): |
|
|
|
|
|
|
|
def determine_earliest_games(pokemon_list, cache):
    """Fill in the earliest game and obtain method for every list entry.

    For each entry the Bulbapedia locations are fetched with
    ``get_locations_from_bulbapedia`` and handed to ``get_earliest_game``;
    the resulting pair is stored under ``'earliest_game'`` and
    ``'obtain_method'``. Progress is printed before and after each entry.

    Args:
        pokemon_list: List of dicts; the keys ``'name'``, ``'number'``,
            ``'base_name'`` and ``'form'`` are read from each one.
        cache: Passed through to ``get_locations_from_bulbapedia``.

    Returns:
        The same ``pokemon_list`` object, with its entries updated in place.
    """
    for pokemon in pokemon_list:
        print(f"Processing {pokemon['name']} (#{pokemon['number']})")
        encounter_data = get_locations_from_bulbapedia(
            pokemon['base_name'], pokemon['form'], cache
        )
        pokemon['earliest_game'], pokemon['obtain_method'] = get_earliest_game(
            encounter_data, pokemon['base_name'], pokemon['form']
        )
        print(f"Processed {pokemon['name']} (#{pokemon['number']}): {pokemon['earliest_game']} ({pokemon['obtain_method']})")
    return pokemon_list
|
|
|
|
|
|
|
def get_species_data(pokemon_name, cache): |
|
|
|
@ -426,85 +603,6 @@ def is_event_pokemon(pokemon_name, cache): |
|
|
|
|
|
|
|
return False |
|
|
|
|
|
|
|
def get_locations_from_bulbapedia(pokemon_name, form, cache):
    """Scrape the "Game locations" table of a Pokémon's Bulbapedia page.

    Every game header found in the nested tables is mapped to the text of
    the last location cell listed next to it; each pair is also printed as
    it is read. The variant headings of the "In events" section, when that
    section exists, are printed but not added to the result.

    Args:
        pokemon_name: Name passed to ``get_pokemon_data_bulbapedia``.
        form: Not used by this implementation.
        cache: Passed through to ``get_pokemon_data_bulbapedia``.

    Returns:
        ``None`` when the page, the section or its table is missing;
        otherwise a dict mapping game title to location text.
    """
    html = get_pokemon_data_bulbapedia(pokemon_name, cache)
    if not html:
        return None

    soup = BeautifulSoup(html, 'html.parser')

    section_anchor = soup.find('span', id='Game_locations')
    if not section_anchor:
        return None

    outer_table = section_anchor.find_next('table', class_='roundy')
    if not outer_table:
        return None

    def rows_of(table):
        # Direct rows of the table's own <tbody>.
        return table.find('tbody', recursive=False).find_all('tr', recursive=False)

    def inner_table(row):
        # Table sitting directly inside the row's first direct <td>, if any.
        cell = row.find('td', recursive=False)
        return cell.find('table', recursive=False) if cell else None

    # The table is a bit of a mess, built from nested tables:
    #   outer row -> table with all the games of one generation
    #   its rows  -> another nested table (purpose unclear)
    #   its rows  -> game headers (a release pair or a single game spanning
    #                two columns) next to a nested table with the locations
    game_locations = {}
    for generation_row in rows_of(outer_table):
        generation_table = inner_table(generation_row)
        if not generation_table:
            continue

        for pair_row in rows_of(generation_table):
            if 'Generation' in pair_row.get_text(strip=True):
                continue

            games_table = inner_table(pair_row)
            if not games_table:
                continue

            for games_row in rows_of(games_table):
                for header in games_row.find_all('th'):
                    places_table = inner_table(games_row)
                    if not places_table:
                        continue

                    places_body = places_table.find('tbody', recursive=False)
                    for cell in places_body.find_all('td'):
                        title = header.get_text(strip=True)
                        text = cell.get_text()
                        game_locations[title] = text
                        print(f'{title}: {text}')

    events_anchor = soup.find('span', id='In_events')
    if events_anchor:
        tables = {}
        variant = ""
        for sibling in events_anchor.parent.find_next_siblings():
            tag_name = sibling.name
            if tag_name == 'h4':
                break
            if tag_name == 'h5':
                variant = sibling.get_text(strip=True)
            elif tag_name == 'table':
                tables[variant] = sibling
        for heading in tables:
            print(heading)

    return game_locations
|
|
|
|
|
|
|
|
|
|
|
def check_alternative_sources(pokemon, cache): |
|
|
|
# This function will check alternative sources for Pokémon with "Unknown" encounter types |
|
|
|
species_data = get_species_data(pokemon['base_name'], cache) |
|
|
|
@ -551,7 +649,7 @@ def handle_unknown_encounters(pokemon_list, cache): |
|
|
|
if __name__ == "__main__": |
|
|
|
get_cached_data() |
|
|
|
|
|
|
|
pokemon_list = read_pokemon_list('pokemon_home_list.csv', limit=3000) |
|
|
|
pokemon_list = read_pokemon_list('pokemon_home_list.csv', limit=151) |
|
|
|
pokemon_list_with_games = determine_earliest_games(pokemon_list, cache) |
|
|
|
pokemon_list_adjusted = adjust_for_evolution(pokemon_list_with_games, cache) |
|
|
|
pokemon_list_with_locations = add_encounter_locations(pokemon_list_adjusted, cache) |
|
|
|
|