Create scrape_background.ipynb

2019-08-16 18:09:02 -04:00 · 2019-08-16 18:09:02 -04:00 · 6ad5787a9d
parent fb7e89e5a4
commit 6ad5787a9d
1 changed files with 144 additions and 0 deletions
--- a/src/backgrounds/scrape_background.ipynb
+++ b/src/backgrounds/scrape_background.ipynb
@ -0,0 +1,144 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Scrape data from aon2e and generate csvs to import in to sqlite"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Dependencies\n",
+    "import pandas as pd\n",
+    "from bs4 import BeautifulSoup as bs, Tag, NavigableString\n",
+    "import os\n",
+    "from splinter import Browser\n",
+    "\n",
+    "# Setting up Selenium\n",
+    "chrome_driver = os.path.join('..', 'chromedriver.exe')\n",
+    "executable_path = {'executable_path': chrome_driver}\n",
+    "browser = Browser('chrome', **executable_path, headless=False)\n",
+    "\n",
+    "# Pandas config\n",
+    "pd.set_option('display.max_columns', None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def scrape_description(url, id_number):\n",
+    "\n",
+    "    # Empty lists to store the scraped values\n",
+    "    name_list = []\n",
+    "    description_list = []\n",
+    "\n",
+    "    print(f'Beginning Data Retrieval')\n",
+    "    print(f'------------------------')\n",
+    "\n",
+    "    # Loop from 1 to the value in weapon_number\n",
+    "    for page in range(1, id_number+1):\n",
+    "\n",
+    "        browser.visit(url + str(page))\n",
+    "        html = browser.html\n",
+    "        soup = bs(html, 'html.parser')\n",
+    "\n",
+    "        # Select only the content section\n",
+    "        content = soup.find(id='ctl00_MainContent_DetailedOutput')\n",
+    "\n",
+    "        try:\n",
+    "            # Store the name and description\n",
+    "            name = content.find('h1', class_='title')\n",
+    "            name.span.decompose()\n",
+    "            name = name.text\n",
+    "\n",
+    "        except:\n",
+    "            name = f'name: {page}'\n",
+    "\n",
+    "        try:\n",
+    "            description = ''\n",
+    "            start = content.find('hr')\n",
+    "            for e in start.next_siblings:\n",
+    "                if isinstance(e, Tag):\n",
+    "                    description = description + e.text.strip()\n",
+    "                elif isinstance(e, NavigableString):\n",
+    "                    description = description + e\n",
+    "\n",
+    "        except:\n",
+    "            description = f'name: {page}'\n",
+    "\n",
+    "        print(f'{page} of {id_number} | {name}')\n",
+    "\n",
+    "        # Append values to our empty lists\n",
+    "        name_list.append(name)      \n",
+    "        description_list.append(description)\n",
+    "\n",
+    "    print(f'------------------------')\n",
+    "    print(f'Data Retrieval Complete')\n",
+    "    \n",
+    "    # Create df with the scraped data\n",
+    "    data = {'Name': name_list, 'description': description_list}\n",
+    "    \n",
+    "    # Returns a data frame\n",
+    "    return pd.DataFrame(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# scrape the descriptions\n",
+    "url_background = 'https://2e.aonprd.com/Equipment.aspx?ID='\n",
+    "number_background = 65 #65 to scrape\n",
+    "\n",
+    "description_background = scrape_description(url_gear, number_gear)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gear.to_csv('background.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}