{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Scrape weapon descriptions from aon2e and generate CSVs to import into SQLite\n",
    "\n",
    "Visits every weapon detail page on https://2e.aonprd.com, collects the weapon name and description, and adds the descriptions to `melee.csv` and `ranged.csv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dependencies\n",
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup as bs\n",
    "from splinter import Browser\n",
    "\n",
    "# Setting up the Chrome browser driven by splinter\n",
    "chrome_driver = os.path.join('..', 'chromedriver.exe')\n",
    "executable_path = {'executable_path': chrome_driver}\n",
    "browser = Browser('chrome', **executable_path, headless=False)\n",
    "\n",
    "# Pandas config\n",
    "pd.set_option('display.max_columns', None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scrape weapon names and descriptions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Base url of a weapon detail page; the numeric weapon ID is appended to it\n",
    "url_weapon = 'https://2e.aonprd.com/Weapons.aspx?ID='\n",
    "\n",
    "# Number of weapons (IDs 1..number_of_weapons are visited)\n",
    "number_of_weapons = 83\n",
    "\n",
    "\n",
    "def parse_weapon(html, weapon):\n",
    "    \"\"\"Extract the name and description from one weapon detail page.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    html : str\n",
    "        Page source of the weapon detail page.\n",
    "    weapon : int\n",
    "        Weapon ID, used only to build a placeholder name.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple\n",
    "        (name, description). name falls back to 'weapon: <ID>' when the\n",
    "        page has no content section or no link in it. description is None\n",
    "        when the content section or its <hr> is missing.\n",
    "    \"\"\"\n",
    "    soup = bs(html, 'html.parser')\n",
    "\n",
    "    # Select only the content section\n",
    "    content = soup.find(id='ctl00_MainContent_DetailedOutput')\n",
    "\n",
    "    # The name is the text of the first link in the content section.\n",
    "    # A placeholder keeps the row so failed pages are easy to spot later.\n",
    "    link = content.find('a') if content is not None else None\n",
    "    name = link.text.strip() if link is not None else f'weapon: {weapon}'\n",
    "\n",
    "    # The description is the first node that follows the first <hr>\n",
    "    rule = content.find('hr') if content is not None else None\n",
    "    node = rule.next_element if rule is not None else None\n",
    "    if node is None:\n",
    "        return name, None\n",
    "\n",
    "    try:\n",
    "        description = node.text.strip()\n",
    "    except AttributeError:\n",
    "        # Bare text node without a .text attribute: strip the string itself\n",
    "        description = node.strip()\n",
    "\n",
    "    return name, description"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lists to store the scraped values (reset on every run of this cell)\n",
    "name_list = []\n",
    "description_list = []\n",
    "\n",
    "print('Beginning Data Retrieval')\n",
    "print('------------------------')\n",
    "\n",
    "# Loop over weapon IDs from 1 to number_of_weapons (inclusive)\n",
    "for weapon in range(1, number_of_weapons + 1):\n",
    "    browser.visit(url_weapon + str(weapon))\n",
    "    name, description = parse_weapon(browser.html, weapon)\n",
    "\n",
    "    print(f'Processing Weapon {weapon} of {number_of_weapons} | {name}')\n",
    "\n",
    "    # Append values to the result lists\n",
    "    name_list.append(name)\n",
    "    description_list.append(description)\n",
    "\n",
    "print('------------------------')\n",
    "print('Data Retrieval Complete')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Close the Chrome window; the scraped values are already held in memory.\n",
    "# Re-run the setup cell before scraping again.\n",
    "browser.quit()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Merge the descriptions into the weapon tables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_weapon_table(path):\n",
    "    \"\"\"Read a weapon CSV, dropping columns left behind by an earlier run.\n",
    "\n",
    "    This notebook writes its result back to the same file, including an\n",
    "    unlabeled index column (read back by pandas as 'Unnamed: 0') and the\n",
    "    merged 'description' column. Dropping both here keeps the notebook\n",
    "    re-runnable: without it a second run yields description_x /\n",
    "    description_y and a growing stack of index columns.\n",
    "    NOTE(review): assumes the original export has no unlabeled column of\n",
    "    its own - confirm against the source files.\n",
    "    \"\"\"\n",
    "    table = pd.read_csv(path)\n",
    "    stale = [\n",
    "        column for column in table.columns\n",
    "        if column == 'description' or str(column).startswith('Unnamed:')\n",
    "    ]\n",
    "    return table.drop(columns=stale)\n",
    "\n",
    "\n",
    "# csv files which are taken from https://2e.aonprd.com/Weapons.aspx\n",
    "melee_raw = load_weapon_table('melee.csv')\n",
    "ranged_raw = load_weapon_table('ranged.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per scraped weapon page\n",
    "scrape = pd.DataFrame({'Name': name_list, 'description': description_list})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Left merges keep every weapon of the table; weapons without a scraped\n",
    "# match get a missing description. Merging from the *_raw frames keeps\n",
    "# this cell idempotent when it is re-run.\n",
    "melee = melee_raw.merge(scrape, how='left', on='Name')\n",
    "ranged = ranged_raw.merge(scrape, how='left', on='Name')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Overwrites the input files with the description column added\n",
    "melee.to_csv('melee.csv')\n",
    "ranged.to_csv('ranged.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}