diff --git a/src/gear/README.md b/src/gear/README.md new file mode 100644 index 0000000..f7f5ea0 --- /dev/null +++ b/src/gear/README.md @@ -0,0 +1,7 @@ +# This directory scrapes the weapons from the [Archives of Nethys](https://2e.aonprd.com/Weapons.aspx) + +## Steps to scrape the weapons +1. Install the requirements from [the previous readme](../README.md) +2. Generate .csv files from copy pasting the tables from [here](https://2e.aonprd.com/Weapons.aspx) and save them in this directory +3. Set the number_of_weapons variable to the number of weapons in the database it is currently 83 +4. Run the [python file](scrape.py) or [Jupyter Notebook](scrape.ipynb) \ No newline at end of file diff --git a/src/gear/scrape.ipynb b/src/gear/scrape.ipynb new file mode 100644 index 0000000..2ca34e0 --- /dev/null +++ b/src/gear/scrape.ipynb @@ -0,0 +1,162 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Scrape data from aon2e and generate csvs to import in to sqlite" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Dependencies\n", + "import pandas as pd\n", + "from bs4 import BeautifulSoup as bs\n", + "import os\n", + "from splinter import Browser\n", + "\n", + "# Setting up Selenium\n", + "chrome_driver = os.path.join('..', 'chromedriver.exe')\n", + "executable_path = {'executable_path': chrome_driver}\n", + "browser = Browser('chrome', **executable_path, headless=False)\n", + "\n", + "# Pandas config\n", + "pd.set_option('display.max_columns', None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# url that contains all the links\n", + "url_weapon = 'https://2e.aonprd.com/Weapons.aspx?ID='\n", + "\n", + "# Number of weapons\n", + "number_of_weapons = 83\n", + "\n", + "# Empty lists to store the scraped values\n", + "name_list = []\n", + "description_list = []\n", + "\n", + "print(f'Beginning Data Retrieval')\n", + "print(f'------------------------')\n", + "\n", + "# Loop from 1 to the value in weapon_number\n", + "for weapon in range(1, number_of_weapons+1):\n", + " \n", + " url = url_weapon + str(weapon)\n", + " browser.visit(url)\n", + " html = browser.html\n", + " soup = bs(html, 'html.parser')\n", + "\n", + " # Select only the content section\n", + " content = soup.find(id='ctl00_MainContent_DetailedOutput')\n", + "\n", + " try:\n", + " # Store the name and description\n", + " name = content.find('a').text.strip()\n", + " \n", + " except:\n", + " name = f'weapon: {weapon}'\n", + "\n", + " try:\n", + " description = content.find('hr').next.text.strip()\n", + " \n", + " except:\n", + " description = content.find('hr').next.strip()\n", + " \n", + " print(f'Processing Weapon {weapon} of {number_of_weapons} | {name}')\n", + "\n", + " # Append values to our empty lists\n", + " name_list.append(name) \n", + " description_list.append(description)\n", + "\n", + "print(f'------------------------')\n", + "print(f'Data Retrieval Complete')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Directory of csv files which are taken from https://2e.aonprd.com/Weapons.aspx\n", + "melee = pd.read_csv('melee.csv')\n", + "ranged = pd.read_csv('ranged.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = {'Name': name_list, 'description': description_list}\n", + "scrape = pd.DataFrame(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "melee = melee.merge(scrape, how='left', on='Name')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ranged = ranged.merge(scrape, how='left', on='Name')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "melee.to_csv('melee.csv')\n", + "ranged.to_csv('ranged.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}