Final executable

parent dfbf07e6d9
commit bb5d781aaa

@@ -1,162 +0,0 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Scrape data from aon2e and generate csvs to import in to sqlite"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dependencies\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup as bs\n",
    "import os\n",
    "from splinter import Browser\n",
    "\n",
    "# Setting up Selenium\n",
    "chrome_driver = os.path.join('..', 'chromedriver.exe')\n",
    "executable_path = {'executable_path': chrome_driver}\n",
    "browser = Browser('chrome', **executable_path, headless=False)\n",
    "\n",
    "# Pandas config\n",
    "pd.set_option('display.max_columns', None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# url that contains all the links\n",
    "url_weapon = 'https://2e.aonprd.com/Weapons.aspx?ID='\n",
    "\n",
    "# Number of weapons\n",
    "number_of_weapons = 83\n",
    "\n",
    "# Empty lists to store the scraped values\n",
    "name_list = []\n",
    "description_list = []\n",
    "\n",
    "print(f'Beginning Data Retrieval')\n",
    "print(f'------------------------')\n",
    "\n",
    "# Loop from 1 to the value in weapon_number\n",
    "for weapon in range(1, number_of_weapons+1):\n",
    "\n",
    "    url = url_weapon + str(weapon)\n",
    "    browser.visit(url)\n",
    "    html = browser.html\n",
    "    soup = bs(html, 'html.parser')\n",
    "\n",
    "    # Select only the content section\n",
    "    content = soup.find(id='ctl00_MainContent_DetailedOutput')\n",
    "\n",
    "    try:\n",
    "        # Store the name and description\n",
    "        name = content.find('a').text.strip()\n",
    "\n",
    "    except:\n",
    "        name = f'weapon: {weapon}'\n",
    "\n",
    "    try:\n",
    "        description = content.find('hr').next.text.strip()\n",
    "\n",
    "    except:\n",
    "        description = content.find('hr').next.strip()\n",
    "\n",
    "    print(f'Processing Weapon {weapon} of {number_of_weapons} | {name}')\n",
    "\n",
    "    # Append values to our empty lists\n",
    "    name_list.append(name)\n",
    "    description_list.append(description)\n",
    "\n",
    "print(f'------------------------')\n",
    "print(f'Data Retrieval Complete')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory of csv files which are taken from https://2e.aonprd.com/Weapons.aspx\n",
    "melee = pd.read_csv('melee.csv')\n",
    "ranged = pd.read_csv('ranged.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = {'Name': name_list, 'description': description_list}\n",
    "scrape = pd.DataFrame(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "melee = melee.merge(scrape, how='left', on='Name')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ranged = ranged.merge(scrape, how='left', on='Name')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "melee.to_csv('melee.csv')\n",
    "ranged.to_csv('ranged.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
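
The description logic is the heart of this change: the removed notebook takes only the single node immediately after the <hr> (content.find('hr').next), so any description that mixes plain text with inline tags gets truncated, while the replacement below walks every sibling. A standalone sketch of the difference, using hypothetical markup rather than a real aon2e page:

from bs4 import BeautifulSoup as bs, Tag, NavigableString

# Hypothetical markup standing in for an aon2e detail pane
html = '<div><h1>Dagger</h1><hr/>A short blade. <i>Agile</i> and light.</div>'
content = bs(html, 'html.parser').div

# Old approach: only the first node after the <hr>
print(content.find('hr').next)    # 'A short blade. '

# New approach: walk every sibling after the <hr>
description = ''
for e in content.find('hr').next_siblings:
    if isinstance(e, Tag):
        description += e.text.strip()
    elif isinstance(e, NavigableString):
        description += e
print(description)                # 'A short blade. Agile and light.'
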
@@ -0,0 +1,173 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Scrape data from aon2e and generate csvs to import into sqlite"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dependencies\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup as bs, Tag, NavigableString\n",
    "import os\n",
    "from splinter import Browser\n",
    "\n",
    "# Setting up Selenium\n",
    "chrome_driver = os.path.join('..', 'chromedriver.exe')\n",
    "executable_path = {'executable_path': chrome_driver}\n",
    "browser = Browser('chrome', **executable_path, headless=False)\n",
    "\n",
    "# Pandas config\n",
    "pd.set_option('display.max_columns', None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def scrape_description(url, id_number):\n",
    "\n",
    "    # Empty lists to store the scraped values\n",
    "    name_list = []\n",
    "    description_list = []\n",
    "\n",
    "    print(f'Beginning Data Retrieval')\n",
    "    print(f'------------------------')\n",
    "\n",
    "    # Loop from 1 to the value in id_number\n",
    "    for page in range(1, id_number+1):\n",
    "\n",
    "        browser.visit(url + str(page))\n",
    "        html = browser.html\n",
    "        soup = bs(html, 'html.parser')\n",
    "\n",
    "        # Select only the content section\n",
    "        content = soup.find(id='ctl00_MainContent_DetailedOutput')\n",
    "\n",
    "        try:\n",
    "            # Store the name: the title text, minus its nested <span>\n",
    "            name = content.find('h1', class_='title')\n",
    "            name.span.decompose()\n",
    "            name = name.text\n",
    "\n",
    "        except Exception:\n",
    "            name = f'name: {page}'\n",
    "\n",
    "        try:\n",
    "            # Concatenate every node that follows the <hr> into one description\n",
    "            description = ''\n",
    "            start = content.find('hr')\n",
    "            for e in start.next_siblings:\n",
    "                if isinstance(e, Tag):\n",
    "                    description = description + e.text.strip()\n",
    "                elif isinstance(e, NavigableString):\n",
    "                    description = description + e\n",
    "\n",
    "        except Exception:\n",
    "            description = f'description: {page}'\n",
    "\n",
    "        print(f'{page} of {id_number} | {name}')\n",
    "\n",
    "        # Append values to our empty lists\n",
    "        name_list.append(name)\n",
    "        description_list.append(description)\n",
    "\n",
    "    print(f'------------------------')\n",
    "    print(f'Data Retrieval Complete')\n",
    "\n",
    "    # Create df with the scraped data\n",
    "    data = {'Name': name_list, 'description': description_list}\n",
    "\n",
    "    # Return a data frame\n",
    "    return pd.DataFrame(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# scrape the descriptions\n",
    "url_gear = 'https://2e.aonprd.com/Equipment.aspx?ID='\n",
    "number_gear = 65  # pages to scrape\n",
    "\n",
    "gear_description = scrape_description(url_gear, number_gear)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# csv file exported from https://2e.aonprd.com/Equipment.aspx\n",
    "gear = pd.read_csv('gear.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gear_description"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gear = gear.merge(gear_description, how='left', on='Name')\n",
    "gear['Level'].fillna(0, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gear.to_csv('gear.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
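
One payoff of factoring the loop into scrape_description(url, id_number) is that the weapons scrape from the removed notebook collapses to a single call. A sketch, not part of the commit, reusing the URL and count (83) from the deleted file; melee.csv and ranged.csv are the same exports its later cells read:

url_weapon = 'https://2e.aonprd.com/Weapons.aspx?ID='
number_of_weapons = 83

# Scrape name/description pairs for every weapon page
weapon_description = scrape_description(url_weapon, number_of_weapons)

# Merge into the csv exports, as the deleted notebook did cell by cell
melee = pd.read_csv('melee.csv').merge(weapon_description, how='left', on='Name')
ranged = pd.read_csv('ranged.csv').merge(weapon_description, how='left', on='Name')
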
@@ -0,0 +1,123 @@
#!/usr/bin/env python
# coding: utf-8

# # Scrape data from aon2e and generate csvs to import into sqlite

# In[ ]:


# Dependencies
import pandas as pd
from bs4 import BeautifulSoup as bs, Tag, NavigableString
import os
from splinter import Browser

# Setting up Selenium
chrome_driver = os.path.join('..', 'chromedriver.exe')
executable_path = {'executable_path': chrome_driver}
browser = Browser('chrome', **executable_path, headless=False)

# Pandas config
pd.set_option('display.max_columns', None)


# In[ ]:


def scrape_description(url, id_number):

    # Empty lists to store the scraped values
    name_list = []
    description_list = []

    print(f'Beginning Data Retrieval')
    print(f'------------------------')

    # Loop from 1 to the value in id_number
    for page in range(1, id_number+1):

        browser.visit(url + str(page))
        html = browser.html
        soup = bs(html, 'html.parser')

        # Select only the content section
        content = soup.find(id='ctl00_MainContent_DetailedOutput')

        try:
            # Store the name: the title text, minus its nested <span>
            name = content.find('h1', class_='title')
            name.span.decompose()
            name = name.text

        except Exception:
            name = f'name: {page}'

        try:
            # Concatenate every node that follows the <hr> into one description
            description = ''
            start = content.find('hr')
            for e in start.next_siblings:
                if isinstance(e, Tag):
                    description = description + e.text.strip()
                elif isinstance(e, NavigableString):
                    description = description + e

        except Exception:
            description = f'description: {page}'

        print(f'{page} of {id_number} | {name}')

        # Append values to our empty lists
        name_list.append(name)
        description_list.append(description)

    print(f'------------------------')
    print(f'Data Retrieval Complete')

    # Create df with the scraped data
    data = {'Name': name_list, 'description': description_list}

    # Return a data frame
    return pd.DataFrame(data)


# In[ ]:


# scrape the descriptions
url_gear = 'https://2e.aonprd.com/Equipment.aspx?ID='
number_gear = 65  # pages to scrape

gear_description = scrape_description(url_gear, number_gear)


# In[ ]:


# csv file exported from https://2e.aonprd.com/Equipment.aspx
gear = pd.read_csv('gear.csv')


# In[ ]:


gear_description


# In[ ]:


gear = gear.merge(gear_description, how='left', on='Name')
gear['Level'].fillna(0, inplace=True)


# In[ ]:


gear.to_csv('gear.csv')


# In[ ]:
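
Neither file performs the sqlite import the title promises, and the splinter session is never closed (browser.quit() would release the chromedriver). A minimal sketch of the import step, assuming the three csvs produced above exist; the database name aon2e.db and the table names are placeholders:

import sqlite3

import pandas as pd

con = sqlite3.connect('aon2e.db')  # placeholder database file
for table in ('melee', 'ranged', 'gear'):
    # Each csv becomes one table; overwrite any previous run's table
    pd.read_csv(f'{table}.csv').to_sql(table, con, if_exists='replace', index=False)
con.close()
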