diff --git a/src/weapons/scrape.py b/src/weapons/scrape.py
new file mode 100644
index 0000000..f16fb57
--- /dev/null
+++ b/src/weapons/scrape.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# # Scrape data from aon2e and generate CSVs to import into SQLite
+
+# In[1]:
+
+
+# Dependencies
+import pandas as pd
+from bs4 import BeautifulSoup as bs
+import os
+from splinter import Browser
+
+# Set up the Splinter browser (Chrome driven by Selenium)
+chrome_driver = os.path.join('..', 'chromedriver.exe')
+executable_path = {'executable_path': chrome_driver}
+browser = Browser('chrome', **executable_path, headless=False)
+
+# Pandas config
+pd.set_option('display.max_columns', None)
+
+
+# In[2]:
+
+
+# Base url for each weapon page; the weapon ID is appended in the loop below
+url_weapon = 'https://2e.aonprd.com/Weapons.aspx?ID='
+
+# Number of weapons
+number_of_weapons = 83
+
+# Empty lists to store the scraped values
+name_list = []
+description_list = []
+
+print('Beginning Data Retrieval')
+print('------------------------')
+
+# Loop over weapon IDs from 1 to number_of_weapons
+for weapon in range(1, number_of_weapons+1):
+
+    url = url_weapon + str(weapon)
+    browser.visit(url)
+    html = browser.html
+    soup = bs(html, 'html.parser')
+
+    # Select only the content section
+    content = soup.find(id='ctl00_MainContent_DetailedOutput')
+
+    try:
+        # The weapon name is the first anchor in the content section
+        name = content.find('a').text.strip()
+
+    except AttributeError:
+        # No anchor found; fall back to a placeholder name
+        name = f'weapon: {weapon}'
+
+    try:
+        # The description is the first node after the <hr> separator
+        description = content.find('hr').next.text.strip()
+
+    except AttributeError:
+        # That node may be a plain string with no .text attribute
+        description = content.find('hr').next.strip()
+
+    print(f'Processing Weapon {weapon} of {number_of_weapons} | {name}')
+
+    # Append the scraped values to the lists
+    name_list.append(name)
+    description_list.append(description)
+
+print('------------------------')
+print('Data Retrieval Complete')
+
+# Close the browser now that scraping is done
+browser.quit()
+
+
+# In[3]:
+
+
+# CSV exports downloaded from https://2e.aonprd.com/Weapons.aspx
+melee = pd.read_csv('melee.csv')
+ranged = pd.read_csv('ranged.csv')
+
+
+# In[13]:
+
+
+data = {'Name': name_list, 'description': description_list}
+scrape = pd.DataFrame(data)
+
+
+# In[16]:
+
+
+melee = melee.merge(scrape, how='left', on='Name')
+
+
+# In[17]:
+
+
+ranged = ranged.merge(scrape, how='left', on='Name')
+
+
+# In[18]:
+
+
+# index=False keeps the DataFrame index out of the output files
+melee.to_csv('melee.csv', index=False)
+ranged.to_csv('ranged.csv', index=False)
+
+
+# In[ ]:
+
+
+
+
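
Note: the notebook heading mentions importing the generated CSVs into SQLite, but that step is not part of this file. Below is a minimal sketch of how the import could look using pandas and the standard sqlite3 module; the database filename (weapons.sqlite) and table names (melee, ranged) are illustrative assumptions, not anything defined in this PR.

import sqlite3
import pandas as pd

# Load the CSVs written by scrape.py
melee = pd.read_csv('melee.csv')
ranged = pd.read_csv('ranged.csv')

# weapons.sqlite is an assumed filename for the target database
conn = sqlite3.connect('weapons.sqlite')

# if_exists='replace' drops and recreates each table on every run
melee.to_sql('melee', conn, if_exists='replace', index=False)
ranged.to_sql('ranged', conn, if_exists='replace', index=False)

conn.close()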