From 6ad5787a9d7e523a58f9f0e7e6bafd4c3a2f458d Mon Sep 17 00:00:00 2001
From: Brian Haley
Date: Fri, 16 Aug 2019 18:09:02 -0400
Subject: [PATCH] Create scrape_background.ipynb

---
 src/backgrounds/scrape_background.ipynb | 144 ++++++++++++++++++++++++
 1 file changed, 144 insertions(+)
 create mode 100644 src/backgrounds/scrape_background.ipynb

diff --git a/src/backgrounds/scrape_background.ipynb b/src/backgrounds/scrape_background.ipynb
new file mode 100644
index 0000000..22c032a
--- /dev/null
+++ b/src/backgrounds/scrape_background.ipynb
@@ -0,0 +1,144 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Scrape data from aon2e and generate CSVs to import into sqlite"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Dependencies\n",
+    "import pandas as pd\n",
+    "from bs4 import BeautifulSoup as bs, Tag, NavigableString\n",
+    "import os\n",
+    "from splinter import Browser\n",
+    "\n",
+    "# Set up the Splinter browser (Selenium-backed Chrome)\n",
+    "chrome_driver = os.path.join('..', 'chromedriver.exe')\n",
+    "executable_path = {'executable_path': chrome_driver}\n",
+    "browser = Browser('chrome', **executable_path, headless=False)\n",
+    "\n",
+    "# Pandas config\n",
+    "pd.set_option('display.max_columns', None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def scrape_description(url, id_number):\n",
+    "\n",
+    "    # Empty lists to store the scraped values\n",
+    "    name_list = []\n",
+    "    description_list = []\n",
+    "\n",
+    "    print('Beginning Data Retrieval')\n",
+    "    print('------------------------')\n",
+    "\n",
+    "    # Loop from 1 to the value in id_number\n",
+    "    for page in range(1, id_number+1):\n",
+    "\n",
+    "        browser.visit(url + str(page))\n",
+    "        html = browser.html\n",
+    "        soup = bs(html, 'html.parser')\n",
+    "\n",
+    "        # Select only the content section\n",
+    "        content = soup.find(id='ctl00_MainContent_DetailedOutput')\n",
+    "\n",
+    "        try:\n",
+    "            # Store the name, dropping the trailing span from the title\n",
+    "            name = content.find('h1', class_='title')\n",
+    "            name.span.decompose()\n",
+    "            name = name.text\n",
+    "\n",
+    "        except AttributeError:\n",
+    "            name = f'name: {page}'\n",
+    "\n",
+    "        try:\n",
+    "            description = ''\n",
+    "            start = content.find('hr')\n",
+    "            for e in start.next_siblings:\n",
+    "                if isinstance(e, Tag):\n",
+    "                    description = description + e.text.strip()\n",
+    "                elif isinstance(e, NavigableString):\n",
+    "                    description = description + e\n",
+    "\n",
+    "        except AttributeError:\n",
+    "            description = f'description: {page}'\n",
+    "\n",
+    "        print(f'{page} of {id_number} | {name}')\n",
+    "\n",
+    "        # Append values to our empty lists\n",
+    "        name_list.append(name)\n",
+    "        description_list.append(description)\n",
+    "\n",
+    "    print('------------------------')\n",
+    "    print('Data Retrieval Complete')\n",
+    "\n",
+    "    # Create df with the scraped data\n",
+    "    data = {'Name': name_list, 'description': description_list}\n",
+    "\n",
+    "    # Return a DataFrame\n",
+    "    return pd.DataFrame(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Scrape the descriptions\n",
+    "url_background = 'https://2e.aonprd.com/Backgrounds.aspx?ID='\n",
+    "number_background = 65 # 65 backgrounds to scrape\n",
+    "\n",
+    "description_background = scrape_description(url_background, number_background)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "description_background.to_csv('background.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}