# Scrape the Serebii Pokémon HOME dex pages for every region, save the
# combined Pokémon list to a CSV file, and download each Pokémon's sprite.
import requests
from bs4 import BeautifulSoup
import csv
import os
import time
import re
def scrape_serebii_region_pokemon(url):
    """Scrape one Serebii Pokémon HOME region page.

    Parameters:
        url: Full URL of a region listing page (see scrape_all_regions).

    Returns:
        list[dict]: One dict per depositable Pokémon with keys
        'number', 'name', and 'image_url'.

    Raises:
        requests.HTTPError: If the page request fails.
    """
    # timeout prevents the scraper hanging forever on a stalled connection;
    # raise_for_status stops us from parsing an HTML error page as a dex table.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    pokemon_list = []
    # Find the main table containing Pokémon data
    table = soup.find('table', class_='dextable')
    if table:
        rows = table.find_all('tr')[2:]  # Skip the header row and the game intro row
        for row in rows:
            cells = row.find_all('td')
            # Ensure we have enough cells to check depositability:
            # if only 5 cells, the Pokémon is not depositable in any game.
            if len(cells) <= 5:
                continue
            number = cells[0].text.strip().lstrip('#')
            name = cells[2].text.strip()
            # Get the image URL; guard against rows without an <img>,
            # which would otherwise raise a TypeError on subscripting None.
            img = cells[1].find('img')
            if img is None or not img.get('src'):
                continue
            full_img_url = f"https://www.serebii.net{img['src']}"
            pokemon_list.append({
                'number': number,
                'name': name,
                'image_url': full_img_url
            })
    return pokemon_list
def download_image(url, filename):
    """Download *url* to *filename* in binary mode.

    Best effort: non-200 responses are silently skipped so one missing
    sprite does not abort the whole batch.
    """
    # timeout keeps a single stalled download from hanging the entire run
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        with open(filename, 'wb') as f:
            f.write(response.content)
def sanitize_filename(filename):
    """Return *filename* with filesystem-unsafe characters spelled out.

    Each reserved symbol becomes a word (e.g. '?' -> 'questionmark') and
    spaces become underscores; any reserved character that somehow survives
    is stripped as a final safety net.
    """
    # Single-pass translation table: each unsafe character -> word equivalent.
    translation = str.maketrans({
        '?': 'questionmark',
        '*': 'asterisk',
        ':': 'colon',
        '/': 'slash',
        '\\': 'backslash',
        '|': 'pipe',
        '<': 'lessthan',
        '>': 'greaterthan',
        '"': 'quote',
        ' ': '_',
    })
    translated = filename.translate(translation)
    # Remove any remaining invalid characters
    return re.sub(r'[<>:"/\\|?*]', '', translated)
def scrape_all_regions():
    """Scrape every region page of Serebii's Pokémon HOME dex.

    Returns:
        list[dict]: The concatenated Pokémon entries from all regions.
    """
    base_url = "https://www.serebii.net/pokemonhome/"
    regions = ["kanto", "johto", "hoenn", "sinnoh", "unova", "kalos", "alola", "galar", "paldea", "hisui", "unknown"]
    all_pokemon = []
    for region in regions:
        page_url = f"{base_url}{region}pokemon.shtml"
        scraped = scrape_serebii_region_pokemon(page_url)
        all_pokemon.extend(scraped)
        print(f"Scraped {len(scraped)} Pokémon from {region.capitalize()} region")
        time.sleep(1)  # Be nice to the server
    return all_pokemon
def save_to_csv(pokemon_list, filename='pokemon_home_list.csv'):
    """Write the 'number' and 'name' columns of *pokemon_list* to a CSV file.

    Extra keys in each entry (e.g. 'image_url') are deliberately dropped.
    """
    columns = ['number', 'name']
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        writer.writerows({col: entry[col] for col in columns} for entry in pokemon_list)
if __name__ == "__main__":
    all_pokemon = scrape_all_regions()
    save_to_csv(all_pokemon)
    print(f"Scraped a total of {len(all_pokemon)} Pokémon and saved to pokemon_home_list.csv")
    # Create 'images' directory if it doesn't exist; exist_ok avoids the
    # racy exists()-then-makedirs() pattern.
    os.makedirs('images', exist_ok=True)
    # Download images, skipping any that are already on disk
    for pokemon in all_pokemon:
        sanitized_name = sanitize_filename(pokemon['name'])
        filename = f"images/{pokemon['number']}_{sanitized_name}.png"
        if os.path.exists(filename):
            print(f"Image for {pokemon['name']} already exists, skipping download")
        else:
            download_image(pokemon['image_url'], filename)
            print(f"Downloaded image for {pokemon['name']}")
            time.sleep(0.5)  # Be nice to the server
    print("All images downloaded successfully.")