- Work on improving the location extraction

1 year ago · e31c22ec60
2 changed files with 168 additions and 37 deletions
--- a/DataGatherers/DetermineOriginGame.py
+++ b/DataGatherers/DetermineOriginGame.py
@@ -386,19 +386,27 @@ def split_td_contents(td):

 def parse_form_information(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
-    form_info = soup.find('small')
-    
-    if form_info:
-        form_text = form_info.get_text(strip=True)
-        # Remove parentheses
-        form_text = form_text.strip('()')
-        
-        # Split the text into main form and breed (if present)
-        parts = form_text.split('(')
-        main_form = parts[0].strip()
-        breed = parts[1].strip(')') if len(parts) > 1 else None
-        
-        return main_form, breed
+    small_tag = soup.find('small')
+    
+    # Form info is in bold inside a small tag.
+    if small_tag:
+        bold_tag = small_tag.find('b')
+        if bold_tag:
+            form_text = bold_tag.get_text(strip=True)
+            # Remove parentheses
+            form_text = form_text.strip('()')
+            
+            # Split the text into main form and breed (if present)
+            parts = form_text.split('(')
+            main_form = parts[0].strip()
+
+            # "Factor"s are not actual forms; they are properties of the pokemon you can encounter.
+            if main_form and "factor" in main_form.lower():
+                return None, None
+
+            breed = parts[1].strip(')') if len(parts) > 1 else None
+            
+            return main_form, breed
    
    return None, None

@@ -608,15 +616,15 @@ def get_locations_from_bulbapedia(pokemon_name, form, cache: CacheManager):
        if form is None:
            for raw_location in raw_locations:
                raw_text = raw_location.get_text()
-                raw_text = raw_text.replace(" and ", ",")
-                locations = raw_text.split(',')
-                for location in locations:
-                    location = location.strip()
-                    if location == "":
-                        continue
-                    if raw_game not in game_locations:
-                        game_locations[raw_game] = []
-                    game_locations[raw_game].append(location.strip())
+                main_form, sub_form = parse_form_information(str(raw_location))
+                if main_form and (main_form != "All Forms" and main_form != "Kantonian Form"):
+                    continue
+                if raw_game not in game_locations:
+                    game_locations[raw_game] = []
+                info = {}
+                info["location"] = raw_text
+                info["tag"] = str(raw_location)
+                game_locations[raw_game].append(info)
        else:
            for raw_location in raw_locations:
                main_form, sub_form = parse_form_information(str(raw_location))
@@ -630,14 +638,13 @@ def get_locations_from_bulbapedia(pokemon_name, form, cache: CacheManager):
                sub_form_match = False if not sub_form else fuzz.partial_ratio(form.lower(), sub_form.lower()) >= 80

                if main_form_match or sub_form_match:
-                    locations = raw_location.get_text().replace(' and ', ',').replace('#', '').split(',')
-                    for location in locations:
-                        location = location.strip()
-                        if location == "":
-                            continue
-                        if raw_game not in game_locations:
-                            game_locations[raw_game] = []
-                        game_locations[raw_game].append(location.strip())
+                    raw_text = raw_location.get_text()
+                    if raw_game not in game_locations:
+                        game_locations[raw_game] = []
+                    info = {}
+                    info["location"] = raw_text
+                    info["tag"] = str(raw_location)
+                    game_locations[raw_game].append(info)

    # For Later
    for variant in event_tables:
@@ -655,6 +662,9 @@ def get_locations_from_bulbapedia(pokemon_name, form, cache: CacheManager):

    return game_locations

+def split_outside_brackets(str):
+    return re.split(r',(?![^()]*\))', str)
+
 def handle_unown(pokemon, encounter_data):
    if not pokemon.name == "Unown":
        return
--- a/DataGatherers/update_location_information.py
+++ b/DataGatherers/update_location_information.py
@@ -1,6 +1,8 @@
 import sqlite3
 from cache_manager import CacheManager
 from DetermineOriginGame import get_locations_from_bulbapedia
+from bs4 import BeautifulSoup, Tag
+import re

 def create_encounters_table():
    conn = sqlite3.connect('pokemon_forms.db')
@@ -14,6 +16,106 @@ def create_encounters_table():
    #conn.commit()
    return conn

+def extract_routes(s):
+    # Find all route numbers, including those after "and" or separated by commas
+    route_pattern = r'Routes?\s+((?:\d+(?:,?\s+(?:and\s+)?)?)+)'
+    route_match = re.search(route_pattern, s, re.IGNORECASE)
+    
+    if route_match:
+        # Extract all numbers from the matched group
+        numbers = re.findall(r'\d+', route_match.group(1))
+        
+        # Remove the extracted part from the original string
+        remaining = s[:route_match.start()] + s[route_match.end():].lstrip(', ')
+        
+        return numbers, remaining
+    else:
+        return [], s
+    
+days = ["Mo", "Tu", "We", "Th", "Fr", "Sa", "Su"]
+times = ["Morning", "Day", "Night"]
+
+all_games = [
+    "Yellow", "Red", "Blue",
+    "Crystal", "Gold", "Silver",
+    "Emerald", "FireRed", "LeafGreen", "Ruby", "Sapphire",
+    "Platinum", "HeartGold", "SoulSilver", "Diamond", "Pearl",
+    "Black 2", "White 2", "Black", "White",
+    "X", "Y", "Omega Ruby", "Alpha Sapphire",
+    "Ultra Sun", "Ultra Moon", "Sun", "Moon",
+    "Sword", "Shield", "Expansion Pass",
+    "Brilliant Diamond", "Shining Pearl",
+    "Legends: Arceus",
+    "Scarlet", "Violet", "The Teal Mask", "The Hidden Treasure of Area Zero", "The Hidden Treasure of Area Zero (Scarlet)", "The Hidden Treasure of Area Zero (Violet)", "The Teal Mask (Scarlet)", "The Teal Mask (Violet)",
+    "Unknown",
+    "Pokémon Home",
+    "Pokémon Go",
+]
+
+def find_match(search_string, string_array):
+    return next((item for item in string_array if search_string in item), None)
+
+def find_all_matches_from_array(string, array):
+    return [item for item in array if item in string]
+
+def extract_bracketed_text(string):
+    # This pattern matches text within parentheses, including one level of nested parentheses
+    pattern = r'\((?:[^()]*|\([^()]*\))*\)'
+    
+    # Find all matches
+    matches = re.findall(pattern, string)
+    
+    # Remove the outer parentheses from each match
+    return [match[1:-1] for match in matches]
+    
+def extract_additional_information(s):
+    soup = BeautifulSoup(s, 'html.parser')
+    full_text = soup.get_text(strip=True)
+    sup_tags = soup.find_all('sup')
+    sup_text = None
+
+    details = {}
+    details["days"] = []
+    details["times"] = []
+    details["dual_slot"] = None
+    details["only_one"] = False
+    details["static_encounter"] = False
+    details["only_two"] = False
+    details["extra_text"] = None
+    details["stars"] = None
+
+    for sup_tag in sup_tags:
+        sup_text = sup_tag.get_text(strip=True)
+
+        if find_match(sup_text, days):
+            details["days"].append(sup_text)
+
+        if find_match(sup_text, times):
+            details["times"].append(sup_text)
+
+    bracket_text = extract_bracketed_text(full_text)
+    if len(bracket_text) > 0:
+        if bracket_text[0] in all_games:
+            details["dual_slot"] = bracket_text[0]
+
+        if "Only One" in bracket_text:
+            details["only_one"] = True
+            details["static_encounter"] = True
+
+        if "Only Two" in bracket_text:
+            details["only_two"] = True
+            details["static_encounter"] = True
+
+        if "★" in bracket_text:
+            details["stars"] = bracket_text
+
+    details["extra_text"] = " ".join(bracket_text)
+
+    if sup_text:
+        return full_text.replace(sup_text, ""), details
+    else:
+        return full_text, details
+
 if __name__ == "__main__":
    cache = CacheManager()

@@ -33,16 +135,35 @@ if __name__ == "__main__":
            gender = form
            form = None

-        encounters_we_aren_t_interested_in = ["Trade", "Time Capsule", "Unobtainable"]
+        encounters_to_ignore = ["trade", "time capsule", "unobtainable", "evolve", "tradeversion", "poké transfer", "friend safari"]

        encounter_data = get_locations_from_bulbapedia(name, form, cache)
        for encounter in encounter_data:
-            print(f"Found in {encounter}:")
+            if len(encounter_data[encounter]) == 0:
+                continue
+
+            print_encounter = True
+
            for location in encounter_data[encounter]:
-                if location in encounters_we_aren_t_interested_in:
+                if location == "":
                    continue
-                if "Evolve" in location:
+                test_location = location["location"].strip().lower()    
+                
+                ignore_location = False
+                for ignore in encounters_to_ignore:
+                    if ignore in test_location:
+                        ignore_location = True
+                        break
+                
+                if ignore_location:
                    continue
-                if "TradeVersion" in location:
-                    continue
-                print(f"    {location}")
+                
+                if print_encounter:
+                    print(f"Found in {encounter}:")
+                    print_encounter = False
+
+                routes, remaining = extract_routes(location["location"].strip())
+                print(f"Routes: {routes}")
+                remaining_locations, details = extract_additional_information(location["tag"])
+                print(f"Remaining: {remaining_locations}")
+                print(f"Details: {details}")