Add readmes, explanation and link for chromedriver, clear notebook outputs

merge-requests/25/head^2
Brian 2019-08-09 14:28:18 +00:00 committed by James Miller
parent dfefa0151c
commit 328066a03f
5 changed files with 141 additions and 103 deletions

9
src/README.md 100644
View File

@ -0,0 +1,9 @@
# This directory contains the code that scrapes the [Archives of Nethys](https://2e.aonprd.com)
## Requirements
1. Python 3.6.8
2. pandas==0.24.2
3. splinter==0.11.0
4. beautifulsoup4==4.8.0
5. selenium==3.141.0
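6. Download the [chrome WebDriver](https://splinter.readthedocs.io/en/latest/drivers/chrome.html) and place it in this directory (the scraping scripts load it from `../chromedriver.exe`). Selenium and splinter need it to drive a real Chrome browser; a browser is used because scraping with Requests was prone to failure.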

View File

@ -0,0 +1,4 @@
pandas==0.24.2
splinter==0.11.0
beautifulsoup4==4.8.0
selenium==3.141.0

View File

@ -0,0 +1,7 @@
# This directory scrapes the weapons from the [Archives of Nethys](https://2e.aonprd.com/Weapons.aspx)
## Steps to scrape the weapons
1. Install the requirements from [the previous readme](../README.md)
2. Generate the .csv files by copying and pasting the tables from [here](https://2e.aonprd.com/Weapons.aspx), and save them in this directory as `melee.csv` and `ranged.csv`
3. Set the `number_of_weapons` variable to the number of weapons in the database (currently 83)
4. Run the [python file](scrape.py) or [Jupyter Notebook](scrape.ipynb)

View File

@ -9,7 +9,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -20,7 +20,7 @@
"from splinter import Browser\n",
"\n",
"# Setting up Selenium\n",
"chrome_driver = os.path.join('..', 'resources', 'chromedriver.exe')\n",
"chrome_driver = os.path.join('..', 'chromedriver.exe')\n",
"executable_path = {'executable_path': chrome_driver}\n",
"browser = Browser('chrome', **executable_path, headless=False)\n",
"\n",
@ -30,103 +30,9 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Beginning Data Retrieval\n",
"------------------------\n",
"Processing Weapon 1 of 83 | Fist\n",
"Processing Weapon 2 of 83 | Club\n",
"Processing Weapon 3 of 83 | Dagger\n",
"Processing Weapon 4 of 83 | Gauntlet\n",
"Processing Weapon 5 of 83 | Light Mace\n",
"Processing Weapon 6 of 83 | Longspear\n",
"Processing Weapon 7 of 83 | Mace\n",
"Processing Weapon 8 of 83 | Morningstar\n",
"Processing Weapon 9 of 83 | Sickle\n",
"Processing Weapon 10 of 83 | Spear\n",
"Processing Weapon 11 of 83 | Spiked Gauntlet\n",
"Processing Weapon 12 of 83 | Staff\n",
"Processing Weapon 13 of 83 | Clan Dagger\n",
"Processing Weapon 14 of 83 | Katar\n",
"Processing Weapon 15 of 83 | Bastard Sword\n",
"Processing Weapon 16 of 83 | Battle Axe\n",
"Processing Weapon 17 of 83 | Bo Staff\n",
"Processing Weapon 18 of 83 | Falchion\n",
"Processing Weapon 19 of 83 | Flail\n",
"Processing Weapon 20 of 83 | Glaive\n",
"Processing Weapon 21 of 83 | Greataxe\n",
"Processing Weapon 22 of 83 | Greatclub\n",
"Processing Weapon 23 of 83 | Greatpick\n",
"Processing Weapon 24 of 83 | Greatsword\n",
"Processing Weapon 25 of 83 | Guisarme\n",
"Processing Weapon 26 of 83 | Halberd\n",
"Processing Weapon 27 of 83 | Hatchet\n",
"Processing Weapon 28 of 83 | Lance\n",
"Processing Weapon 29 of 83 | Light Hammer\n",
"Processing Weapon 30 of 83 | Light Pick\n",
"Processing Weapon 31 of 83 | Longsword\n",
"Processing Weapon 32 of 83 | Main-gauche\n",
"Processing Weapon 33 of 83 | Maul\n",
"Processing Weapon 34 of 83 | Pick\n",
"Processing Weapon 35 of 83 | Ranseur\n",
"Processing Weapon 36 of 83 | Rapier\n",
"Processing Weapon 37 of 83 | Sap\n",
"Processing Weapon 38 of 83 | Scimitar\n",
"Processing Weapon 39 of 83 | Scythe\n",
"Processing Weapon 40 of 83 | Shield Bash\n",
"Processing Weapon 41 of 83 | Shield Boss\n",
"Processing Weapon 42 of 83 | Shield Spikes\n",
"Processing Weapon 43 of 83 | Shortsword\n",
"Processing Weapon 44 of 83 | Starknife\n",
"Processing Weapon 45 of 83 | Trident\n",
"Processing Weapon 46 of 83 | War Flail\n",
"Processing Weapon 47 of 83 | Warhammer\n",
"Processing Weapon 48 of 83 | Whip\n",
"Processing Weapon 49 of 83 | Dogslicer\n",
"Processing Weapon 50 of 83 | Elven Curve Blade\n",
"Processing Weapon 51 of 83 | Filcher's Fork\n",
"Processing Weapon 52 of 83 | Gnome Hooked Hammer\n",
"Processing Weapon 53 of 83 | Horsechopper\n",
"Processing Weapon 54 of 83 | Kama\n",
"Processing Weapon 55 of 83 | Katana\n",
"Processing Weapon 56 of 83 | Kukri\n",
"Processing Weapon 57 of 83 | Nunchaku\n",
"Processing Weapon 58 of 83 | Orc Knuckle Dagger\n",
"Processing Weapon 59 of 83 | Sai\n",
"Processing Weapon 60 of 83 | Spiked Chain\n",
"Processing Weapon 61 of 83 | Temple Sword\n",
"Processing Weapon 62 of 83 | Dwarven War Axe\n",
"Processing Weapon 63 of 83 | Gnome Flickmace\n",
"Processing Weapon 64 of 83 | Orc Necksplitter\n",
"Processing Weapon 65 of 83 | Sawtooth Saber\n",
"Processing Weapon 66 of 83 | Blowgun\n",
"Processing Weapon 67 of 83 | Crossbow\n",
"Processing Weapon 68 of 83 | Dart\n",
"Processing Weapon 69 of 83 | Hand Crossbow\n",
"Processing Weapon 70 of 83 | Heavy Crossbow\n",
"Processing Weapon 71 of 83 | Javelin\n",
"Processing Weapon 72 of 83 | Sling\n",
"Processing Weapon 73 of 83 | Alchemical Bomb\n",
"Processing Weapon 74 of 83 | Composite Longbow\n",
"Processing Weapon 75 of 83 | Composite Shortbow\n",
"Processing Weapon 76 of 83 | Longbow\n",
"Processing Weapon 77 of 83 | Shortbow\n",
"Processing Weapon 78 of 83 | Halfling Sling Staff\n",
"Processing Weapon 79 of 83 | Shuriken\n",
"Processing Weapon 80 of 83 | Blowgun Darts\n",
"Processing Weapon 81 of 83 | Bolts\n",
"Processing Weapon 82 of 83 | Sling Bullets\n",
"Processing Weapon 83 of 83 | Arrows\n",
"------------------------\n",
"Data Retrieval Complete\n"
]
}
],
"outputs": [],
"source": [
"# url that contains all the links\n",
"url_weapon = 'https://2e.aonprd.com/Weapons.aspx?ID='\n",
@ -177,7 +83,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -188,7 +94,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -198,7 +104,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -207,7 +113,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -216,7 +122,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [

View File

@ -0,0 +1,112 @@
#!/usr/bin/env python
# coding: utf-8
# # Scrape data from aon2e and generate csvs to import in to sqlite
# In[ ]:
# Dependencies
import pandas as pd
from bs4 import BeautifulSoup as bs
import os
from splinter import Browser
# Pandas config: pass None so 'display.max_columns' is not capped
pd.set_option('display.max_columns', None)
# Selenium / splinter setup: the chromedriver binary is looked up one
# directory above the current working directory
chrome_driver = os.path.join('..', 'chromedriver.exe')
executable_path = dict(executable_path=chrome_driver)
# headless=False opens a visible Chrome window that the scraper drives
browser = Browser('chrome', headless=False, **executable_path)
# In[ ]:
# Base url of a weapon detail page; the numeric weapon ID is appended to it
url_weapon = 'https://2e.aonprd.com/Weapons.aspx?ID='
# Number of weapons (IDs 1..number_of_weapons are visited)
number_of_weapons = 83
# Empty lists to store the scraped values; both always grow in lockstep so
# they can be turned into DataFrame columns afterwards
name_list = []
description_list = []

print('Beginning Data Retrieval')
print('------------------------')
try:
    # Loop from 1 to the value in number_of_weapons (inclusive)
    for weapon in range(1, number_of_weapons + 1):
        url = url_weapon + str(weapon)
        browser.visit(url)
        html = browser.html
        soup = bs(html, 'html.parser')

        # Select only the content section
        content = soup.find(id='ctl00_MainContent_DetailedOutput')

        # Fallbacks used when the page (or part of it) is missing, so that
        # one bad page does not abort the whole run
        name = f'weapon: {weapon}'
        description = ''

        if content is not None:
            # The first link inside the content section holds the name
            link = content.find('a')
            if link is not None:
                name = link.text.strip()

            # The description is whatever directly follows the first <hr>
            rule = content.find('hr')
            following = rule.next_element if rule is not None else None
            if following is not None:
                if isinstance(following, str):
                    # Plain text node (NavigableString is a str subclass)
                    description = following.strip()
                else:
                    # Nested tag: use its text content
                    description = following.text.strip()

        print(f'Processing Weapon {weapon} of {number_of_weapons} | {name}')

        # Append values to our lists
        name_list.append(name)
        description_list.append(description)
finally:
    # Always close the Chrome window / driver process, even if a page
    # visit raised part-way through
    browser.quit()

print('------------------------')
print('Data Retrieval Complete')
# In[ ]:
# Weapon tables copied from https://2e.aonprd.com/Weapons.aspx and saved as
# csv files in the current working directory
melee = pd.read_csv('melee.csv')
ranged = pd.read_csv('ranged.csv')
# In[ ]:
# Build a Name -> description lookup table from the scraped values
data = {'Name': name_list, 'description': description_list}
scrape = pd.DataFrame(data)
# In[ ]:
# Left merge keeps every row of the weapon table; weapons without a scraped
# match simply get an empty description
melee = melee.merge(scrape, how='left', on='Name')
# In[ ]:
ranged = ranged.merge(scrape, how='left', on='Name')
# In[ ]:
# index=False: do not write the DataFrame index. These files are read back by
# this script, and a written index would reappear as a spurious 'Unnamed: 0'
# column on every re-run.
melee.to_csv('melee.csv', index=False)
ranged.to_csv('ranged.csv', index=False)
# In[ ]: