Final executable

merge-requests/31/head
Brian Haley 2019-08-09 20:51:07 -04:00 committed by James Miller
parent dfbf07e6d9
commit bb5d781aaa
3 changed files with 296 additions and 162 deletions


@@ -1,162 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Scrape data from aon2e and generate csvs to import in to sqlite"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Dependencies\n",
"import pandas as pd\n",
"from bs4 import BeautifulSoup as bs\n",
"import os\n",
"from splinter import Browser\n",
"\n",
"# Setting up Selenium\n",
"chrome_driver = os.path.join('..', 'chromedriver.exe')\n",
"executable_path = {'executable_path': chrome_driver}\n",
"browser = Browser('chrome', **executable_path, headless=False)\n",
"\n",
"# Pandas config\n",
"pd.set_option('display.max_columns', None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# url that contains all the links\n",
"url_weapon = 'https://2e.aonprd.com/Weapons.aspx?ID='\n",
"\n",
"# Number of weapons\n",
"number_of_weapons = 83\n",
"\n",
"# Empty lists to store the scraped values\n",
"name_list = []\n",
"description_list = []\n",
"\n",
"print(f'Beginning Data Retrieval')\n",
"print(f'------------------------')\n",
"\n",
"# Loop from 1 to the value in weapon_number\n",
"for weapon in range(1, number_of_weapons+1):\n",
" \n",
" url = url_weapon + str(weapon)\n",
" browser.visit(url)\n",
" html = browser.html\n",
" soup = bs(html, 'html.parser')\n",
"\n",
" # Select only the content section\n",
" content = soup.find(id='ctl00_MainContent_DetailedOutput')\n",
"\n",
" try:\n",
" # Store the name and description\n",
" name = content.find('a').text.strip()\n",
" \n",
" except:\n",
" name = f'weapon: {weapon}'\n",
"\n",
" try:\n",
" description = content.find('hr').next.text.strip()\n",
" \n",
" except:\n",
" description = content.find('hr').next.strip()\n",
" \n",
" print(f'Processing Weapon {weapon} of {number_of_weapons} | {name}')\n",
"\n",
" # Append values to our empty lists\n",
" name_list.append(name) \n",
" description_list.append(description)\n",
"\n",
"print(f'------------------------')\n",
"print(f'Data Retrieval Complete')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Directory of csv files which are taken from https://2e.aonprd.com/Weapons.aspx\n",
"melee = pd.read_csv('melee.csv')\n",
"ranged = pd.read_csv('ranged.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = {'Name': name_list, 'description': description_list}\n",
"scrape = pd.DataFrame(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"melee = melee.merge(scrape, how='left', on='Name')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ranged = ranged.merge(scrape, how='left', on='Name')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"melee.to_csv('melee.csv')\n",
"ranged.to_csv('ranged.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
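The cell removed above grabbed only content.find('hr').next, so any description that ran past the first node after the <hr> was cut off; the replacement below walks every following sibling instead. A minimal standalone illustration of the difference (the HTML snippet here is invented for the example, not taken from aonprd):

# Why .next truncates: it returns only the first node after the <hr>,
# while iterating next_siblings also yields every later paragraph.
from bs4 import BeautifulSoup, Tag, NavigableString

html = '<div><hr/>First sentence.<p>Rest of the description.</p></div>'
content = BeautifulSoup(html, 'html.parser').find('div')

old_way = content.find('hr').next            # 'First sentence.' only
new_way = ''
for e in content.find('hr').next_siblings:
    if isinstance(e, Tag):
        new_way += e.text.strip()
    elif isinstance(e, NavigableString):
        new_way += str(e)

print(old_way)  # First sentence.
print(new_way)  # First sentence.Rest of the description.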


@@ -0,0 +1,173 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Scrape data from aon2e and generate csvs to import in to sqlite"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Dependencies\n",
"import pandas as pd\n",
"from bs4 import BeautifulSoup as bs, Tag, NavigableString\n",
"import os\n",
"from splinter import Browser\n",
"\n",
"# Setting up Selenium\n",
"chrome_driver = os.path.join('..', 'chromedriver.exe')\n",
"executable_path = {'executable_path': chrome_driver}\n",
"browser = Browser('chrome', **executable_path, headless=False)\n",
"\n",
"# Pandas config\n",
"pd.set_option('display.max_columns', None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def scrape_description(url, id_number):\n",
"\n",
" # Empty lists to store the scraped values\n",
" name_list = []\n",
" description_list = []\n",
"\n",
" print(f'Beginning Data Retrieval')\n",
" print(f'------------------------')\n",
"\n",
" # Loop from 1 to the value in weapon_number\n",
" for page in range(1, id_number+1):\n",
"\n",
" browser.visit(url + str(page))\n",
" html = browser.html\n",
" soup = bs(html, 'html.parser')\n",
"\n",
" # Select only the content section\n",
" content = soup.find(id='ctl00_MainContent_DetailedOutput')\n",
"\n",
" try:\n",
" # Store the name and description\n",
" name = content.find('h1', class_='title')\n",
" name.span.decompose()\n",
" name = name.text\n",
"\n",
" except:\n",
" name = f'name: {page}'\n",
"\n",
" try:\n",
" description = ''\n",
" start = content.find('hr')\n",
" for e in start.next_siblings:\n",
" if isinstance(e, Tag):\n",
" description = description + e.text.strip()\n",
" elif isinstance(e, NavigableString):\n",
" description = description + e\n",
"\n",
" except:\n",
" description = f'name: {page}'\n",
"\n",
" print(f'{page} of {id_number} | {name}')\n",
"\n",
" # Append values to our empty lists\n",
" name_list.append(name) \n",
" description_list.append(description)\n",
"\n",
" print(f'------------------------')\n",
" print(f'Data Retrieval Complete')\n",
" \n",
" # Create df with the scraped data\n",
" data = {'Name': name_list, 'description': description_list}\n",
" \n",
" # Returns a data frame\n",
" return pd.DataFrame(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# scrape the descriptions\n",
"url_gear = 'https://2e.aonprd.com/Equipment.aspx?ID='\n",
"number_gear = 65 #65 to scrape\n",
"\n",
"gear_description = scrape_description(url_gear, number_gear)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Directory of csv files which are taken from https://2e.aonprd.com/Weapons.aspx\n",
"gear = pd.read_csv('gear.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"gear_description"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"gear = gear.merge(gear_description, how='left', on='Name')\n",
"gear['Level'].fillna(0, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"gear.to_csv('gear.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
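The notebooks stop at writing csvs; the sqlite import named in the title is not part of this commit. A minimal sketch of that final step, assuming a local database file and a table name that are placeholders here, not taken from the repo:

# Hypothetical follow-up step: load a generated csv into sqlite.
# The database file name (aon2e.db) and table name are assumptions.
import sqlite3
import pandas as pd

gear = pd.read_csv('gear.csv')
with sqlite3.connect('aon2e.db') as conn:
    gear.to_sql('gear', conn, if_exists='replace', index=False)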


@@ -0,0 +1,123 @@
#!/usr/bin/env python
# coding: utf-8

# # Scrape data from aon2e and generate csvs to import into sqlite

# In[ ]:


# Dependencies
import pandas as pd
from bs4 import BeautifulSoup as bs, Tag, NavigableString
import os
from splinter import Browser

# Setting up Selenium
chrome_driver = os.path.join('..', 'chromedriver.exe')
executable_path = {'executable_path': chrome_driver}
browser = Browser('chrome', **executable_path, headless=False)

# Pandas config
pd.set_option('display.max_columns', None)


# In[ ]:


def scrape_description(url, id_number):

    # Empty lists to store the scraped values
    name_list = []
    description_list = []

    print('Beginning Data Retrieval')
    print('------------------------')

    # Loop from 1 to id_number
    for page in range(1, id_number+1):

        browser.visit(url + str(page))
        html = browser.html
        soup = bs(html, 'html.parser')

        # Select only the content section
        content = soup.find(id='ctl00_MainContent_DetailedOutput')

        try:
            # Store the name
            name = content.find('h1', class_='title')
            name.span.decompose()
            name = name.text
        except Exception:
            name = f'name: {page}'

        try:
            # Collect everything after the <hr> into one description string
            description = ''
            start = content.find('hr')
            for e in start.next_siblings:
                if isinstance(e, Tag):
                    description = description + e.text.strip()
                elif isinstance(e, NavigableString):
                    description = description + e
        except Exception:
            description = f'description: {page}'

        print(f'{page} of {id_number} | {name}')

        # Append values to our empty lists
        name_list.append(name)
        description_list.append(description)

    print('------------------------')
    print('Data Retrieval Complete')

    # Create df with the scraped data
    data = {'Name': name_list, 'description': description_list}

    # Returns a data frame
    return pd.DataFrame(data)


# In[ ]:


# Scrape the descriptions
url_gear = 'https://2e.aonprd.com/Equipment.aspx?ID='
number_gear = 65  # 65 pages to scrape

gear_description = scrape_description(url_gear, number_gear)


# In[ ]:


# gear.csv is taken from https://2e.aonprd.com/Equipment.aspx
gear = pd.read_csv('gear.csv')


# In[ ]:


gear_description


# In[ ]:


gear = gear.merge(gear_description, how='left', on='Name')
gear['Level'].fillna(0, inplace=True)


# In[ ]:


gear.to_csv('gear.csv')


# In[ ]:
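One possible simplification, not attempted in this commit: the WebForms id (ctl00_MainContent_DetailedOutput) suggests the detail pages may be rendered server-side, in which case plain HTTP requests could replace the Chrome/Splinter setup entirely. A hedged sketch, assuming the content is present in the raw HTML:

# Hypothetical alternative to driving a browser: fetch pages with requests.
# Only valid if the detailed output is server-rendered (an assumption).
import requests
from bs4 import BeautifulSoup as bs

def fetch_content(url, page):
    resp = requests.get(url + str(page), timeout=30)
    resp.raise_for_status()
    soup = bs(resp.text, 'html.parser')
    return soup.find(id='ctl00_MainContent_DetailedOutput')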