Browse Source

- Work on improving the location extraction

master
Quildra 1 year ago
parent
commit
e31c22ec60
  1. 70
      DataGatherers/DetermineOriginGame.py
  2. 135
      DataGatherers/update_location_information.py

70
DataGatherers/DetermineOriginGame.py

@ -386,19 +386,27 @@ def split_td_contents(td):
def parse_form_information(html_content):
# Parses an HTML fragment and returns a (main_form, breed) tuple; returns
# (None, None) when no usable form text is found.
# NOTE(review): this listing has lost its indentation and appears to hold two
# consecutive variants of the body (an unconditional "small" lookup followed by
# a "small > b" lookup) -- presumably the removed and added sides of a diff
# hunk. Confirm which variant is live before relying on the notes below.
soup = BeautifulSoup(html_content, 'html.parser')
# Variant 1: use the whole text of the first <small> tag.
form_info = soup.find('small')
if form_info:
form_text = form_info.get_text(strip=True)
# Remove parentheses
form_text = form_text.strip('()')
# Split the text into main form and breed (if present)
parts = form_text.split('(')
main_form = parts[0].strip()
# Anything after the first "(" is taken as the breed, minus trailing ")".
breed = parts[1].strip(')') if len(parts) > 1 else None
return main_form, breed
# Variant 2: only the bold text inside the first <small> tag is considered.
small_tag = soup.find('small')
# Form info is in bold inside a small tag.
if small_tag:
bold_tag = small_tag.find('b')
if bold_tag:
form_text = bold_tag.get_text(strip=True)
# Remove parentheses
form_text = form_text.strip('()')
# Split the text into main form and breed (if present)
parts = form_text.split('(')
main_form = parts[0].strip()
# "Factor"s are not actual forms, they are properties of the pokemon you can encounter.
if main_form and "factor" in main_form.lower():
return None, None
# Anything after the first "(" is taken as the breed, minus trailing ")".
breed = parts[1].strip(')') if len(parts) > 1 else None
return main_form, breed
# No <small> tag (or, in variant 2, no <b> inside it): no form information.
return None, None
@ -608,15 +616,15 @@ def get_locations_from_bulbapedia(pokemon_name, form, cache: CacheManager):
if form is None:
for raw_location in raw_locations:
raw_text = raw_location.get_text()
raw_text = raw_text.replace(" and ", ",")
locations = raw_text.split(',')
for location in locations:
location = location.strip()
if location == "":
continue
if raw_game not in game_locations:
game_locations[raw_game] = []
game_locations[raw_game].append(location.strip())
main_form, sub_form = parse_form_information(str(raw_location))
if main_form and (main_form != "All Forms" and main_form != "Kantonian Form"):
continue
if raw_game not in game_locations:
game_locations[raw_game] = []
info = {}
info["location"] = raw_text
info["tag"] = str(raw_location)
game_locations[raw_game].append(info)
else:
for raw_location in raw_locations:
main_form, sub_form = parse_form_information(str(raw_location))
@ -630,14 +638,13 @@ def get_locations_from_bulbapedia(pokemon_name, form, cache: CacheManager):
sub_form_match = False if not sub_form else fuzz.partial_ratio(form.lower(), sub_form.lower()) >= 80
if main_form_match or sub_form_match:
locations = raw_location.get_text().replace(' and ', ',').replace('#', '').split(',')
for location in locations:
location = location.strip()
if location == "":
continue
if raw_game not in game_locations:
game_locations[raw_game] = []
game_locations[raw_game].append(location.strip())
raw_text = raw_location.get_text()
if raw_game not in game_locations:
game_locations[raw_game] = []
info = {}
info["location"] = raw_text
info["tag"] = str(raw_location)
game_locations[raw_game].append(info)
# For Later
for variant in event_tables:
@ -655,6 +662,9 @@ def get_locations_from_bulbapedia(pokemon_name, form, cache: CacheManager):
return game_locations
def split_outside_brackets(str):
    """Split a string on every comma that is not inside parentheses.

    A comma is left alone when only non-parenthesis characters separate it
    from a following ")". Nested parentheses are not tracked. The pieces are
    returned untrimmed.
    """
    # NOTE: the parameter name shadows the builtin `str`; it is kept so that
    # existing keyword callers continue to work.
    top_level_comma = re.compile(r',(?![^()]*\))')
    return top_level_comma.split(str)
def handle_unown(pokemon, encounter_data):
if not pokemon.name == "Unown":
return

135
DataGatherers/update_location_information.py

@ -1,6 +1,8 @@
import sqlite3
from cache_manager import CacheManager
from DetermineOriginGame import get_locations_from_bulbapedia
from bs4 import BeautifulSoup, Tag
import re
def create_encounters_table():
conn = sqlite3.connect('pokemon_forms.db')
@ -14,6 +16,106 @@ def create_encounters_table():
#conn.commit()
return conn
def extract_routes(s):
    """Pull the route numbers out of the first "Route(s) ..." phrase in *s*.

    Returns a ``(numbers, remaining)`` pair. ``numbers`` is the list of digit
    strings found in the first matching phrase (numbers may be separated by
    commas, whitespace and "and"). ``remaining`` is *s* with that phrase cut
    out and any leading commas/spaces stripped from the text that followed it.
    When no phrase is found the result is ``([], s)``.
    """
    found = re.search(
        r'Routes?\s+((?:\d+(?:,?\s+(?:and\s+)?)?)+)', s, re.IGNORECASE
    )
    if found is None:
        return [], s

    # Every run of digits inside the captured list is one route number.
    route_numbers = re.findall(r'\d+', found.group(1))

    # Stitch the text before and after the phrase back together.
    before = s[:found.start()]
    after = s[found.end():].lstrip(', ')
    return route_numbers, before + after
# Weekday abbreviations that <sup> annotations are checked against.
days = [
    "Mo",
    "Tu",
    "We",
    "Th",
    "Fr",
    "Sa",
    "Su",
]

# Time-of-day labels that <sup> annotations are checked against.
times = [
    "Morning",
    "Day",
    "Night",
]

# Game / expansion / source titles recognised in bracketed annotations.
# The order is significant to anything that scans this list front to back
# (e.g. "Black 2" is listed before "Black"), so do not sort it.
all_games = [
    "Yellow",
    "Red",
    "Blue",
    "Crystal",
    "Gold",
    "Silver",
    "Emerald",
    "FireRed",
    "LeafGreen",
    "Ruby",
    "Sapphire",
    "Platinum",
    "HeartGold",
    "SoulSilver",
    "Diamond",
    "Pearl",
    "Black 2",
    "White 2",
    "Black",
    "White",
    "X",
    "Y",
    "Omega Ruby",
    "Alpha Sapphire",
    "Ultra Sun",
    "Ultra Moon",
    "Sun",
    "Moon",
    "Sword",
    "Shield",
    "Expansion Pass",
    "Brilliant Diamond",
    "Shining Pearl",
    "Legends: Arceus",
    "Scarlet",
    "Violet",
    "The Teal Mask",
    "The Hidden Treasure of Area Zero",
    "The Hidden Treasure of Area Zero (Scarlet)",
    "The Hidden Treasure of Area Zero (Violet)",
    "The Teal Mask (Scarlet)",
    "The Teal Mask (Violet)",
    "Unknown",
    "Pokémon Home",
    "Pokémon Go",
]
def find_match(search_string, string_array):
    """Return the first entry of *string_array* that contains *search_string*.

    The check is a substring test (``search_string in entry``), evaluated in
    list order. Returns ``None`` when no entry contains it.
    """
    for candidate in string_array:
        if search_string in candidate:
            return candidate
    return None
def find_all_matches_from_array(string, array):
    """Return every entry of *array* that occurs as a substring of *string*.

    Entries are returned in their original order; the result is a new list.
    """
    return list(filter(lambda candidate: candidate in string, array))
def extract_bracketed_text(string):
    """Return the contents of every parenthesised group in *string*.

    A group may contain one level of nested parentheses; the nested
    parentheses are kept in the returned text, only the outermost pair is
    removed. Groups nested more deeply than that are matched from the
    innermost two levels outward, and unclosed groups are ignored.

    Returns a list of strings in order of appearance (empty when there are
    no groups).
    """
    # "(" plain-text, then any number of "(nested)" + plain-text runs, ")".
    # This is the unrolled form of `\((?:[^()]*|\([^()]*\))*\)`: it accepts
    # exactly the same strings, but without a star nested inside a star, so
    # an unclosed "(" followed by a long run of text fails in linear time
    # instead of backtracking exponentially.
    pattern = r'\([^()]*(?:\([^()]*\)[^()]*)*\)'
    # Drop the outer parentheses from each match.
    return [group[1:-1] for group in re.findall(pattern, string)]
def extract_additional_information(s):
    """Parse one location HTML fragment into display text plus encounter details.

    Args:
        s: HTML markup for a single location entry.

    Returns:
        A ``(text, details)`` pair. ``text`` is the fragment's stripped text
        with the content of every ``<sup>`` tag left out. ``details`` is a
        dict with the keys ``days``, ``times``, ``dual_slot``, ``only_one``,
        ``static_encounter``, ``only_two``, ``extra_text`` and ``stars``.
    """
    soup = BeautifulSoup(s, 'html.parser')
    # Bracketed annotations are read from the complete text, superscripts
    # included, so capture it before any <sup> tag is detached below.
    full_text = soup.get_text(strip=True)

    details = {
        "days": [],
        "times": [],
        "dual_slot": None,
        "only_one": False,
        "static_encounter": False,
        "only_two": False,
        "extra_text": None,
        "stars": None,
    }

    sup_tags = soup.find_all('sup')
    for sup_tag in sup_tags:
        sup_text = sup_tag.get_text(strip=True)
        if not sup_text:
            # "" is a substring of every entry, so an empty superscript would
            # otherwise be recorded as both a day and a time.
            continue
        if find_match(sup_text, days):
            details["days"].append(sup_text)
        if find_match(sup_text, times):
            details["times"].append(sup_text)

    bracket_text = extract_bracketed_text(full_text)
    if bracket_text:
        # A leading "(Game)" annotation names the dual-slot game.
        if bracket_text[0] in all_games:
            details["dual_slot"] = bracket_text[0]

        if "Only One" in bracket_text:
            details["only_one"] = True
            details["static_encounter"] = True

        if "Only Two" in bracket_text:
            details["only_two"] = True
            details["static_encounter"] = True

        # As written this is true only when one bracket group is empty, i.e.
        # the text contains "()".
        # NOTE(review): the key is "stars", so the literal may have lost a
        # star character somewhere -- confirm against the repository.
        if "" in bracket_text:
            details["stars"] = bracket_text

        details["extra_text"] = " ".join(bracket_text)

    # Detach every <sup> element and re-read the text. Replacing the
    # superscript text as a substring only handled the last <sup> tag and
    # also deleted the same letters wherever else they occurred in the
    # location name (a "Mo" superscript turned "Mt. Moon" into "Mt. on").
    for sup_tag in sup_tags:
        sup_tag.extract()
    return soup.get_text(strip=True), details
if __name__ == "__main__":
cache = CacheManager()
@ -33,16 +135,35 @@ if __name__ == "__main__":
gender = form
form = None
encounters_we_aren_t_interested_in = ["Trade", "Time Capsule", "Unobtainable"]
encounters_to_ignore = ["trade", "time capsule", "unobtainable", "evolve", "tradeversion", "poké transfer", "friend safari"]
encounter_data = get_locations_from_bulbapedia(name, form, cache)
for encounter in encounter_data:
print(f"Found in {encounter}:")
if len(encounter_data[encounter]) == 0:
continue
print_encounter = True
for location in encounter_data[encounter]:
if location in encounters_we_aren_t_interested_in:
if location == "":
continue
if "Evolve" in location:
test_location = location["location"].strip().lower()
ignore_location = False
for ignore in encounters_to_ignore:
if ignore in test_location:
ignore_location = True
break
if ignore_location:
continue
if "TradeVersion" in location:
continue
print(f" {location}")
if print_encounter:
print(f"Found in {encounter}:")
print_encounter = False
routes, remaining = extract_routes(location["location"].strip())
print(f"Routes: {routes}")
remaining_locations, details = extract_additional_information(location["tag"])
print(f"Remaining: {remaining_locations}")
print(f"Details: {details}")

Loading…
Cancel
Save