You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
111 lines
3.7 KiB
111 lines
3.7 KiB
import requests
|
|
from bs4 import BeautifulSoup
|
|
import csv
|
|
import os
|
|
import time
|
|
import re
|
|
|
|
def scrape_serebii_region_pokemon(url):
    """Scrape one Serebii Pokémon HOME regional dex page.

    Args:
        url: Full URL of a serebii.net "pokemonhome" region page.

    Returns:
        A list of dicts with keys 'number', 'name' and 'image_url' for
        every Pokémon on the page that is depositable in HOME. Returns
        an empty list when the expected 'dextable' table is not found.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.Timeout: if the server does not respond within 30s.
    """
    # Fail fast instead of hanging forever: the original call had no
    # timeout, and silently parsed error pages (no status check).
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    pokemon_list = []

    # Find the main table containing Pokémon data
    table = soup.find('table', class_='dextable')

    if table:
        rows = table.find_all('tr')[2:]  # Skip the header row and the game intro row
        for row in rows:
            cells = row.find_all('td')
            # Rows with 5 or fewer cells describe Pokémon that are not
            # depositable in any game — skip them.
            if len(cells) <= 5:
                continue

            number = cells[0].text.strip().lstrip('#')
            name = cells[2].text.strip()

            # Get the image URL. Guard against rows without an <img>
            # tag: the original `find('img')['src']` raised TypeError
            # on such rows and aborted the whole scrape.
            img = cells[1].find('img')
            if img is None or not img.get('src'):
                continue
            full_img_url = f"https://www.serebii.net{img['src']}"

            pokemon_list.append({
                'number': number,
                'name': name,
                'image_url': full_img_url,
            })

    return pokemon_list
def download_image(url, filename):
    """Download *url* and write the raw bytes to *filename*.

    Best-effort: a non-200 response is silently ignored and no file is
    written, matching the original behaviour.
    """
    # Timeout prevents one slow/dead image URL from hanging the whole
    # download loop (the original call could block indefinitely).
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        with open(filename, 'wb') as f:
            f.write(response.content)
def sanitize_filename(filename):
|
|
# Define a dictionary of symbol replacements
|
|
symbol_replacements = {
|
|
'?': 'questionmark',
|
|
'*': 'asterisk',
|
|
':': 'colon',
|
|
'/': 'slash',
|
|
'\\': 'backslash',
|
|
'|': 'pipe',
|
|
'<': 'lessthan',
|
|
'>': 'greaterthan',
|
|
'"': 'quote',
|
|
' ': '_'
|
|
}
|
|
|
|
# Replace symbols with their word equivalents
|
|
for symbol, word in symbol_replacements.items():
|
|
filename = filename.replace(symbol, word)
|
|
|
|
# Remove any remaining invalid characters
|
|
return re.sub(r'[<>:"/\\|?*]', '', filename)
|
|
|
|
def scrape_all_regions():
    """Scrape every regional Pokémon HOME dex page on serebii.net.

    Returns:
        One flat list containing the Pokémon dicts from every region,
        in region order.
    """
    base_url = "https://www.serebii.net/pokemonhome/"
    regions = ["kanto", "johto", "hoenn", "sinnoh", "unova", "kalos", "alola", "galar", "paldea", "hisui", "unknown"]
    all_pokemon = []

    for region in regions:
        page_url = f"{base_url}{region}pokemon.shtml"
        scraped = scrape_serebii_region_pokemon(page_url)
        all_pokemon += scraped
        print(f"Scraped {len(scraped)} Pokémon from {region.capitalize()} region")
        time.sleep(1)  # Be nice to the server

    return all_pokemon
def save_to_csv(pokemon_list, filename='pokemon_home_list.csv'):
    """Write the number and name columns of *pokemon_list* to a CSV file.

    Args:
        pokemon_list: dicts that each contain at least the keys
            'number' and 'name'; any other keys (e.g. 'image_url')
            are not written.
        filename: destination CSV path.
    """
    columns = ['number', 'name']
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        # Project each row down to just the declared columns before
        # handing the whole stream to writerows.
        writer.writerows({key: row[key] for key in columns} for row in pokemon_list)
if __name__ == "__main__":
    # Scrape every region, persist the number/name list, then download
    # each Pokémon's sprite into the local 'images' directory.
    all_pokemon = scrape_all_regions()
    save_to_csv(all_pokemon)
    print(f"Scraped a total of {len(all_pokemon)} Pokémon and saved to pokemon_home_list.csv")

    # Create 'images' directory if it doesn't exist. exist_ok avoids
    # the check-then-create race of the original os.path.exists guard.
    os.makedirs('images', exist_ok=True)

    # Download images
    for pokemon in all_pokemon:
        sanitized_name = sanitize_filename(pokemon['name'])
        filename = f"images/{pokemon['number']}_{sanitized_name}.png"

        if os.path.exists(filename):
            # Re-runs of the script skip anything already on disk.
            print(f"Image for {pokemon['name']} already exists, skipping download")
        else:
            download_image(pokemon['image_url'], filename)
            print(f"Downloaded image for {pokemon['name']}")
            time.sleep(0.5)  # Be nice to the server

    print("All images downloaded successfully.")