You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
111 lines
3.7 KiB
111 lines
3.7 KiB
import requests
|
|
from bs4 import BeautifulSoup
|
|
import csv
|
|
import os
|
|
import time
|
|
import re
|
|
|
|
def scrape_serebii_region_pokemon(url):
    """Scrape one Serebii Pokémon HOME regional dex page.

    Args:
        url: Full URL of a serebii.net "pokemonhome" region page.

    Returns:
        A list of dicts with keys 'number', 'name' and 'image_url' for
        every Pokémon on the page that is depositable in HOME. Returns
        an empty list when the expected 'dextable' table is not found.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.Timeout: if the server does not respond within 30s.
    """
    # Fail fast instead of hanging forever: the original call had no
    # timeout, and silently parsed error pages (no status check).
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    pokemon_list = []

    # Find the main table containing Pokémon data
    table = soup.find('table', class_='dextable')

    if table:
        rows = table.find_all('tr')[2:]  # Skip the header row and the game intro row
        for row in rows:
            cells = row.find_all('td')
            # Rows with 5 or fewer cells describe Pokémon that are not
            # depositable in any game — skip them.
            if len(cells) <= 5:
                continue

            number = cells[0].text.strip().lstrip('#')
            name = cells[2].text.strip()

            # Get the image URL. Guard against rows without an <img>
            # tag: the original `find('img')['src']` raised TypeError
            # on such rows and aborted the whole scrape.
            img = cells[1].find('img')
            if img is None or not img.get('src'):
                continue
            full_img_url = f"https://www.serebii.net{img['src']}"

            pokemon_list.append({
                'number': number,
                'name': name,
                'image_url': full_img_url,
            })

    return pokemon_list
def download_image(url, filename):
    """Download *url* and write the raw bytes to *filename*.

    Best-effort: a non-200 response is silently ignored and no file is
    written, matching the original behaviour.
    """
    # Timeout prevents one slow/dead image URL from hanging the whole
    # download loop (the original call could block indefinitely).
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        with open(filename, 'wb') as f:
            f.write(response.content)
def sanitize_filename(filename):
|
|
# Define a dictionary of symbol replacements
|
|
symbol_replacements = {
|
|
'?': 'questionmark',
|
|
'*': 'asterisk',
|
|
':': 'colon',
|
|
'/': 'slash',
|
|
'\\': 'backslash',
|
|
'|': 'pipe',
|
|
'<': 'lessthan',
|
|
'>': 'greaterthan',
|
|
'"': 'quote',
|
|
' ': '_'
|
|
}
|
|
|
|
# Replace symbols with their word equivalents
|
|
for symbol, word in symbol_replacements.items():
|
|
filename = filename.replace(symbol, word)
|
|
|
|
# Remove any remaining invalid characters
|
|
return re.sub(r'[<>:"/\\|?*]', '', filename)
|
|
|
|
def scrape_all_regions():
    """Scrape every regional Pokémon HOME dex page on serebii.net.

    Returns:
        One flat list containing the Pokémon dicts from every region,
        in region order.
    """
    base_url = "https://www.serebii.net/pokemonhome/"
    regions = ["kanto", "johto", "hoenn", "sinnoh", "unova", "kalos", "alola", "galar", "paldea", "hisui", "unknown"]
    all_pokemon = []

    for region in regions:
        page_url = f"{base_url}{region}pokemon.shtml"
        scraped = scrape_serebii_region_pokemon(page_url)
        all_pokemon += scraped
        print(f"Scraped {len(scraped)} Pokémon from {region.capitalize()} region")
        time.sleep(1)  # Be nice to the server

    return all_pokemon
def save_to_csv(pokemon_list, filename='pokemon_home_list.csv'):
    """Write the number and name columns of *pokemon_list* to a CSV file.

    Args:
        pokemon_list: dicts that each contain at least the keys
            'number' and 'name'; any other keys (e.g. 'image_url')
            are not written.
        filename: destination CSV path.
    """
    columns = ['number', 'name']
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        # Project each row down to just the declared columns before
        # handing the whole stream to writerows.
        writer.writerows({key: row[key] for key in columns} for row in pokemon_list)
if __name__ == "__main__":
    # Scrape every region, persist the number/name list, then download
    # each Pokémon's sprite into the local 'images' directory.
    all_pokemon = scrape_all_regions()
    save_to_csv(all_pokemon)
    print(f"Scraped a total of {len(all_pokemon)} Pokémon and saved to pokemon_home_list.csv")

    # Create 'images' directory if it doesn't exist. exist_ok avoids
    # the check-then-create race of the original os.path.exists guard.
    os.makedirs('images', exist_ok=True)

    # Download images
    for pokemon in all_pokemon:
        sanitized_name = sanitize_filename(pokemon['name'])
        filename = f"images/{pokemon['number']}_{sanitized_name}.png"

        if os.path.exists(filename):
            # Re-runs of the script skip anything already on disk.
            print(f"Image for {pokemon['name']} already exists, skipping download")
        else:
            download_image(pokemon['image_url'], filename)
            print(f"Downloaded image for {pokemon['name']}")
            time.sleep(0.5)  # Be nice to the server

    print("All images downloaded successfully.")