From 242ddc6184bf72d3293793978954b701f92e4259 Mon Sep 17 00:00:00 2001 From: Dan Date: Fri, 11 Oct 2024 16:00:31 +0100 Subject: [PATCH] location parsing progress --- DataGatherers/DetermineOriginGame.py | 5 +- DataGatherers/cache_manager.py | 1 + DataGatherers/update_location_information.py | 184 ++++++++++++++----- 3 files changed, 146 insertions(+), 44 deletions(-) diff --git a/DataGatherers/DetermineOriginGame.py b/DataGatherers/DetermineOriginGame.py index 29daadf..6312cf7 100644 --- a/DataGatherers/DetermineOriginGame.py +++ b/DataGatherers/DetermineOriginGame.py @@ -658,7 +658,10 @@ def get_locations_from_bulbapedia(pokemon_name, form, cache: CacheManager): if game in games_string: if game not in game_locations: game_locations[game] = [] - game_locations[game].append("Event") + info = {} + info["location"] = "Event" + info["tag"] = None + game_locations[game].append(info) return game_locations diff --git a/DataGatherers/cache_manager.py b/DataGatherers/cache_manager.py index 6bf1dc9..29a233d 100644 --- a/DataGatherers/cache_manager.py +++ b/DataGatherers/cache_manager.py @@ -54,6 +54,7 @@ class CacheManager: 'content': content, 'timestamp': time.time() }) + time.sleep(1) return content return None diff --git a/DataGatherers/update_location_information.py b/DataGatherers/update_location_information.py index 0b8fbc2..debf21a 100644 --- a/DataGatherers/update_location_information.py +++ b/DataGatherers/update_location_information.py @@ -3,17 +3,31 @@ from cache_manager import CacheManager from DetermineOriginGame import get_locations_from_bulbapedia from bs4 import BeautifulSoup, Tag import re +import time +import unicodedata def create_encounters_table(): conn = sqlite3.connect('pokemon_forms.db') - #cursor = conn.cursor() - #cursor.execute(''' - #CREATE TABLE IF NOT EXISTS encounters ( - # pfic TEXT, - # - #) - #''') - #conn.commit() + cursor = conn.cursor() + cursor.execute(''' + CREATE TABLE IF NOT EXISTS encounters ( + pfic TEXT, + game TEXT, + location TEXT, + day TEXT, + time TEXT, + dual_slot TEXT, + static_encounter_count INTEGER, + static_encounter BOOLEAN, + only_two BOOLEAN, + extra_text TEXT, + stars TEXT, + fishing BOOLEAN, + fishing_rod_needed TEXT, + PRIMARY KEY (pfic, game, location) + ) + ''') + conn.commit() return conn def extract_routes(s): @@ -34,6 +48,7 @@ def extract_routes(s): days = ["Mo", "Tu", "We", "Th", "Fr", "Sa", "Su"] times = ["Morning", "Day", "Night"] +rods = ["Old Rod", "Good Rod", "Super Rod"] all_games = [ "Yellow", "Red", "Blue", @@ -53,36 +68,62 @@ all_games = [ ] def find_match(search_string, string_array): - return next((item for item in string_array if search_string in item), None) + return next((item for item in string_array if item.lower() == search_string.lower()), None) def find_all_matches_from_array(string, array): - return [item for item in array if item in string] + return [item for item in array if item.lower() == string.lower()] + +# This pattern matches text within parentheses, including nested parentheses +pattern = r'\((?:[^()]*|\([^()]*\))*\)' +in_brackets = re.compile(pattern) -def extract_bracketed_text(string): - # This pattern matches text within parentheses, including nested parentheses - pattern = r'\((?:[^()]*|\([^()]*\))*\)' +def extract_bracketed_text(string, timeout=1): + results = [] + start_time = time.time() + stack = [] + start_index = -1 - # Find all matches - matches = re.findall(pattern, string) + for i, char in enumerate(string): + if char == '(': + if not stack: + start_index = i + stack.append(i) + elif char == ')': + if stack: + stack.pop() + if not stack: + results.append(string[start_index + 1:i]) + start_index = -1 + else: + print(f"Warning: Unmatched closing parenthesis at position {i}") - # Remove the outer parentheses from each match - return [match[1:-1] for match in matches] + # Handle any remaining unclosed brackets + if stack: + print(f"Warning: {len(stack)} unmatched opening parentheses") + for unmatched_start in stack: + results.append(string[unmatched_start + 1:]) -def extract_additional_information(s): - soup = BeautifulSoup(s, 'html.parser') - full_text = soup.get_text(strip=True) - sup_tags = soup.find_all('sup') - sup_text = None + return results +def extract_additional_information(s): details = {} details["days"] = [] details["times"] = [] details["dual_slot"] = None - details["only_one"] = False + details["static_encounter_count"] = 0 details["static_encounter"] = False - details["only_two"] = False - details["extra_text"] = None - details["stars"] = None + details["extra_text"] = [] + details["stars"] = [] + details["Fishing"] = False + details["Rods"] = [] + + if s is None: + return "", details + + soup = BeautifulSoup(s, 'html.parser') + full_text = soup.get_text() + sup_tags = soup.find_all('sup') + sup_text = None for sup_tag in sup_tags: sup_text = sup_tag.get_text(strip=True) @@ -93,38 +134,79 @@ def extract_additional_information(s): if find_match(sup_text, times): details["times"].append(sup_text) - bracket_text = extract_bracketed_text(full_text) - if len(bracket_text) > 0: - if bracket_text[0] in all_games: - details["dual_slot"] = bracket_text[0] + bracket_text = extract_bracketed_text(full_text, 2) - if "Only One" in bracket_text: - details["only_one"] = True - details["static_encounter"] = True + for text in bracket_text: + text = text.strip() + text_lower = text.lower() + + if text_lower in all_games: + details["dual_slot"] = text - if "Only Two" in bracket_text: - details["only_two"] = True + if "only one" in text_lower: + details["static_encounter_count"] = 1 details["static_encounter"] = True + text = re.sub(r'only one', '', text_lower, flags=re.IGNORECASE).strip() + elif "only two" in text_lower: + details["static_encounter_count"] = 2 + details["static_encounter"] = True + text = re.sub(r'only two', '', text_lower, flags=re.IGNORECASE).strip() + #elif "rod" in text_lower: + # details["static_encounter_count"] = 2 + # details["static_encounter"] = True + # text = re.sub(r'only two', '', text_lower, flags=re.IGNORECASE).strip() - if "★" in bracket_text: - details["stars"] = bracket_text + if "★" in text: + star_parts = re.findall(r'\d★,*', text) + for part in star_parts: + details["stars"].append(part.replace(',', '').strip()) + text = re.sub(r'\d★,*', '', text).strip() - details["extra_text"] = " ".join(bracket_text) + if text: + details["extra_text"].append(text) if sup_text: return full_text.replace(sup_text, ""), details else: return full_text, details + +def save_encounter(conn, pfic, game, location, days, times, dual_slot,static_encounter, static_encounter_count, extra_text, stars): + cursor = conn.cursor() + if len(days) > 0: + for day in days: + cursor.execute(''' + INSERT OR REPLACE INTO encounters + (pfic, game, location, day, time, dual_slot, static_encounter_count, static_encounter, extra_text, stars) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ''', (pfic, game, location, day, None, dual_slot, static_encounter_count, static_encounter, ' '.join(extra_text), ','.join(stars))) + elif len(times) > 0: + for time in times: + cursor.execute(''' + INSERT OR REPLACE INTO encounters + (pfic, game, location, day, time, dual_slot, static_encounter_count, static_encounter, extra_text, stars) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ''', (pfic, game, location, None, time, dual_slot, static_encounter_count, static_encounter, ' '.join(extra_text), ','.join(stars))) + else: + cursor.execute(''' + INSERT OR REPLACE INTO encounters + (pfic, game, location, day, time, dual_slot, static_encounter_count, static_encounter, extra_text, stars) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ''', (pfic, game, location, None, None, dual_slot, static_encounter_count, static_encounter, ' '.join(extra_text), ','.join(stars))) + conn.commit() if __name__ == "__main__": cache = CacheManager() conn = create_encounters_table() cursor = conn.cursor() - cursor.execute('SELECT DISTINCT name, form_name FROM pokemon_forms') + cursor.execute(''' + SELECT pf.PFIC, pf.name, pf.form_name, pf.national_dex + FROM pokemon_forms pf + ORDER BY pf.national_dex, pf.form_name + ''') pokemon_forms = cursor.fetchall() - for name, form in pokemon_forms: + for pfic, name, form, national_dex in pokemon_forms: print(f"Processing {name} {form if form else ''}") if form and name in form: @@ -138,6 +220,9 @@ if __name__ == "__main__": encounters_to_ignore = ["trade", "time capsule", "unobtainable", "evolve", "tradeversion", "poké transfer", "friend safari"] encounter_data = get_locations_from_bulbapedia(name, form, cache) + if encounter_data == None: + continue + for encounter in encounter_data: if len(encounter_data[encounter]) == 0: continue @@ -162,8 +247,21 @@ if __name__ == "__main__": print(f"Found in {encounter}:") print_encounter = False - routes, remaining = extract_routes(location["location"].strip()) + remaining, details = extract_additional_information(location["tag"]) + routes, remaining = extract_routes(remaining) print(f"Routes: {routes}") - remaining_locations, details = extract_additional_information(location["tag"]) - print(f"Remaining: {remaining_locations}") + print(f"Remaining: {remaining.strip()}") print(f"Details: {details}") + + if len(details["days"]) > 0 and len(details["times"]) > 0: + print("Stupid Data") + + for route in routes: + route_name = f"Route {route}" + save_encounter(conn, pfic, encounter, route_name, details["days"], details["times"], details["dual_slot"], details["static_encounter"], details["static_encounter_count"], details["extra_text"], details["stars"]) + + if remaining != "": + remaining_locations = remaining.replace(" and ", ",").split(",") + for remaining_location in remaining_locations: + save_encounter(conn, pfic, encounter, remaining_location.strip(), details["days"], details["times"], details["dual_slot"], details["static_encounter"], details["static_encounter_count"], details["extra_text"], details["stars"]) + conn.close()