pathfinder-2-sqlite-MIRROR/src/weapons/scrape.py

113 lines
2.1 KiB
Python
Raw Normal View History

2019-08-09 01:34:26 -04:00
#!/usr/bin/env python
# coding: utf-8
# # Scrape data from aon2e and generate csvs to import in to sqlite
# In[1]:
# Dependencies
import pandas as pd
from bs4 import BeautifulSoup as bs
import os
from splinter import Browser
# Setting up Selenium
chrome_driver = os.path.join('..', 'chromedriver.exe')
executable_path = {'executable_path': chrome_driver}
browser = Browser('chrome', **executable_path, headless=False)
# Pandas config
pd.set_option('display.max_columns', None)
# In[2]:
# url that contains all the links
url_weapon = 'https://2e.aonprd.com/Weapons.aspx?ID='
# Number of weapons
number_of_weapons = 83
# Empty lists to store the scraped values
name_list = []
description_list = []
print(f'Beginning Data Retrieval')
print(f'------------------------')
# Loop from 1 to the value in weapon_number
for weapon in range(1, number_of_weapons+1):
url = url_weapon + str(weapon)
browser.visit(url)
html = browser.html
soup = bs(html, 'html.parser')
# Select only the content section
content = soup.find(id='ctl00_MainContent_DetailedOutput')
try:
# Store the name and description
name = content.find('a').text.strip()
except:
name = f'weapon: {weapon}'
try:
description = content.find('hr').next.text.strip()
except:
description = content.find('hr').next.strip()
print(f'Processing Weapon {weapon} of {number_of_weapons} | {name}')
# Append values to our empty lists
name_list.append(name)
description_list.append(description)
print(f'------------------------')
print(f'Data Retrieval Complete')
# In[3]:
# Directory of csv files which are taken from https://2e.aonprd.com/Weapons.aspx
melee = pd.read_csv('melee.csv')
ranged = pd.read_csv('ranged.csv')
# In[13]:
data = {'Name': name_list, 'description': description_list}
scrape = pd.DataFrame(data)
# In[16]:
melee = melee.merge(scrape, how='left', on='Name')
# In[17]:
ranged = ranged.merge(scrape, how='left', on='Name')
# In[18]:
melee.to_csv('melee.csv')
ranged.to_csv('ranged.csv')
# In[ ]: