#!/usr/bin/env python
# coding: utf-8
"""Scrape item descriptions from aon2e and merge them into ``gear.csv``.

The merged csv is meant to be imported into sqlite.
"""

# Dependencies
import os

import pandas as pd
from bs4 import BeautifulSoup as bs, Tag, NavigableString
from splinter import Browser

# Setting up Selenium
chrome_driver = os.path.join('..', 'chromedriver.exe')
executable_path = {'executable_path': chrome_driver}
browser = Browser('chrome', **executable_path, headless=False)

# Pandas config
pd.set_option('display.max_columns', None)


def _extract_name(content, page):
    """Return the stripped title text of a detail section, or 'name: <page>'."""
    title = content.find('h1', class_='title') if content is not None else None
    if title is None:
        return f'name: {page}'

    # The first <span> inside the title is not part of the name and is
    # removed before reading the text. Titles without a span are still valid
    # names, so only remove it when it exists.
    if title.span is not None:
        title.span.decompose()

    # 'Name' is the merge key against gear.csv, so surrounding whitespace
    # must not leak into it.
    return title.text.strip() or f'name: {page}'


def _extract_description(content):
    """Return the text following the first <hr> of a detail section ('' if none)."""
    start = content.find('hr') if content is not None else None
    if start is None:
        # Left empty rather than filled with a placeholder so that a missing
        # description stays visibly missing in the csv.
        return ''

    pieces = []
    for node in start.next_siblings:
        if isinstance(node, Tag):
            pieces.append(node.text.strip())
        elif isinstance(node, NavigableString):
            pieces.append(str(node))

    return ''.join(pieces).strip()


def scrape_description(url, id_number):
    """Scrape the name and description of every page from 1 to ``id_number``.

    Each page is loaded through the module-level ``browser`` at
    ``url + str(page)`` and parsed from the element with the id
    ``ctl00_MainContent_DetailedOutput``.

    Args:
        url: Base url to which the page id is appended.
        id_number: Highest page id to visit (inclusive); ids start at 1.

    Returns:
        A DataFrame with one row per visited page and the columns ``Name``
        and ``description``. Pages without a title get the name
        ``'name: <page>'``; pages without a description get ``''``.
    """
    # Lists to store the scraped values
    name_list = []
    description_list = []

    print('Beginning Data Retrieval')
    print('------------------------')

    # Loop from 1 to id_number (inclusive)
    for page in range(1, id_number + 1):

        browser.visit(url + str(page))
        soup = bs(browser.html, 'html.parser')

        # Select only the content section (None when the page has none)
        content = soup.find(id='ctl00_MainContent_DetailedOutput')

        name = _extract_name(content, page)
        description = _extract_description(content)

        print(f'{page} of {id_number} | {name}')

        name_list.append(name)
        description_list.append(description)

    print('------------------------')
    print('Data Retrieval Complete')

    # Returns a data frame with the scraped data
    return pd.DataFrame({'Name': name_list, 'description': description_list})


# scrape the descriptions
url_gear = 'https://2e.aonprd.com/Equipment.aspx?ID='
number_gear = 65  # 65 to scrape

try:
    gear_description = scrape_description(url_gear, number_gear)
finally:
    # Always release the browser/driver process, even if a page load fails
    browser.quit()

# Existing gear table to enrich.
# NOTE(review): presumably exported from the Equipment listing on aon2e; the
# previous comment pointed at Weapons.aspx, which looks like a leftover -
# confirm the source.
gear = pd.read_csv('gear.csv')

gear = gear.merge(gear_description, how='left', on='Name')

# Assign the result back instead of calling fillna(inplace=True) on the column
# selection, which does not reliably update `gear` in newer pandas versions.
gear['Level'] = gear['Level'].fillna(0)

# gear.csv is read back on the next run; writing the index would add another
# 'Unnamed: 0' column every time the script is executed.
gear.to_csv('gear.csv', index=False)