Final executable

parent dfbf07e6d9
commit bb5d781aaa

@@ -1,162 +0,0 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Scrape data from aon2e and generate csvs to import in to sqlite"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dependencies\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup as bs\n",
    "import os\n",
    "from splinter import Browser\n",
    "\n",
    "# Setting up Selenium\n",
    "chrome_driver = os.path.join('..', 'chromedriver.exe')\n",
    "executable_path = {'executable_path': chrome_driver}\n",
    "browser = Browser('chrome', **executable_path, headless=False)\n",
    "\n",
    "# Pandas config\n",
    "pd.set_option('display.max_columns', None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# url that contains all the links\n",
    "url_weapon = 'https://2e.aonprd.com/Weapons.aspx?ID='\n",
    "\n",
    "# Number of weapons\n",
    "number_of_weapons = 83\n",
    "\n",
    "# Empty lists to store the scraped values\n",
    "name_list = []\n",
    "description_list = []\n",
    "\n",
    "print(f'Beginning Data Retrieval')\n",
    "print(f'------------------------')\n",
    "\n",
    "# Loop from 1 to the value in weapon_number\n",
    "for weapon in range(1, number_of_weapons+1):\n",
    "\n",
    "    url = url_weapon + str(weapon)\n",
    "    browser.visit(url)\n",
    "    html = browser.html\n",
    "    soup = bs(html, 'html.parser')\n",
    "\n",
    "    # Select only the content section\n",
    "    content = soup.find(id='ctl00_MainContent_DetailedOutput')\n",
    "\n",
    "    try:\n",
    "        # Store the name and description\n",
    "        name = content.find('a').text.strip()\n",
    "\n",
    "    except:\n",
    "        name = f'weapon: {weapon}'\n",
    "\n",
    "    try:\n",
    "        description = content.find('hr').next.text.strip()\n",
    "\n",
    "    except:\n",
    "        description = content.find('hr').next.strip()\n",
    "\n",
    "    print(f'Processing Weapon {weapon} of {number_of_weapons} | {name}')\n",
    "\n",
    "    # Append values to our empty lists\n",
    "    name_list.append(name)\n",
    "    description_list.append(description)\n",
    "\n",
    "print(f'------------------------')\n",
    "print(f'Data Retrieval Complete')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory of csv files which are taken from https://2e.aonprd.com/Weapons.aspx\n",
    "melee = pd.read_csv('melee.csv')\n",
    "ranged = pd.read_csv('ranged.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = {'Name': name_list, 'description': description_list}\n",
    "scrape = pd.DataFrame(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "melee = melee.merge(scrape, how='left', on='Name')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ranged = ranged.merge(scrape, how='left', on='Name')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "melee.to_csv('melee.csv')\n",
    "ranged.to_csv('ranged.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
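
The description logic is the heart of this change: the removed notebook takes only the single node immediately after the <hr> (content.find('hr').next), so any description that mixes plain text with inline tags gets truncated, while the replacement below walks every sibling. A standalone sketch of the difference, using hypothetical markup rather than a real aon2e page:

from bs4 import BeautifulSoup as bs, Tag, NavigableString

# Hypothetical markup standing in for an aon2e detail pane
html = '<div><h1>Dagger</h1><hr/>A short blade. <i>Agile</i> and light.</div>'
content = bs(html, 'html.parser').div

# Old approach: only the first node after the <hr>
print(content.find('hr').next)    # 'A short blade. '

# New approach: walk every sibling after the <hr>
description = ''
for e in content.find('hr').next_siblings:
    if isinstance(e, Tag):
        description += e.text.strip()
    elif isinstance(e, NavigableString):
        description += e
print(description)                # 'A short blade. Agile and light.'
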
@@ -0,0 +1,173 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Scrape data from aon2e and generate csvs to import into sqlite"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dependencies\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup as bs, Tag, NavigableString\n",
    "import os\n",
    "from splinter import Browser\n",
    "\n",
    "# Setting up Selenium\n",
    "chrome_driver = os.path.join('..', 'chromedriver.exe')\n",
    "executable_path = {'executable_path': chrome_driver}\n",
    "browser = Browser('chrome', **executable_path, headless=False)\n",
    "\n",
    "# Pandas config\n",
    "pd.set_option('display.max_columns', None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def scrape_description(url, id_number):\n",
    "\n",
    "    # Empty lists to store the scraped values\n",
    "    name_list = []\n",
    "    description_list = []\n",
    "\n",
    "    print(f'Beginning Data Retrieval')\n",
    "    print(f'------------------------')\n",
    "\n",
    "    # Loop from 1 to the value in id_number\n",
    "    for page in range(1, id_number+1):\n",
    "\n",
    "        browser.visit(url + str(page))\n",
    "        html = browser.html\n",
    "        soup = bs(html, 'html.parser')\n",
    "\n",
    "        # Select only the content section\n",
    "        content = soup.find(id='ctl00_MainContent_DetailedOutput')\n",
    "\n",
    "        try:\n",
    "            # Store the name: the title text, minus its nested <span>\n",
    "            name = content.find('h1', class_='title')\n",
    "            name.span.decompose()\n",
    "            name = name.text\n",
    "\n",
    "        except Exception:\n",
    "            name = f'name: {page}'\n",
    "\n",
    "        try:\n",
    "            # Concatenate every node that follows the <hr> into one description\n",
    "            description = ''\n",
    "            start = content.find('hr')\n",
    "            for e in start.next_siblings:\n",
    "                if isinstance(e, Tag):\n",
    "                    description = description + e.text.strip()\n",
    "                elif isinstance(e, NavigableString):\n",
    "                    description = description + e\n",
    "\n",
    "        except Exception:\n",
    "            description = f'description: {page}'\n",
    "\n",
    "        print(f'{page} of {id_number} | {name}')\n",
    "\n",
    "        # Append values to our empty lists\n",
    "        name_list.append(name)\n",
    "        description_list.append(description)\n",
    "\n",
    "    print(f'------------------------')\n",
    "    print(f'Data Retrieval Complete')\n",
    "\n",
    "    # Create df with the scraped data\n",
    "    data = {'Name': name_list, 'description': description_list}\n",
    "\n",
    "    # Return a data frame\n",
    "    return pd.DataFrame(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# scrape the descriptions\n",
    "url_gear = 'https://2e.aonprd.com/Equipment.aspx?ID='\n",
    "number_gear = 65  # pages to scrape\n",
    "\n",
    "gear_description = scrape_description(url_gear, number_gear)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# csv file exported from https://2e.aonprd.com/Equipment.aspx\n",
    "gear = pd.read_csv('gear.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gear_description"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gear = gear.merge(gear_description, how='left', on='Name')\n",
    "gear['Level'].fillna(0, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gear.to_csv('gear.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
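
One payoff of factoring the loop into scrape_description(url, id_number) is that the weapons scrape from the removed notebook collapses to a single call. A sketch, not part of the commit, reusing the URL and count (83) from the deleted file; melee.csv and ranged.csv are the same exports its later cells read:

url_weapon = 'https://2e.aonprd.com/Weapons.aspx?ID='
number_of_weapons = 83

# Scrape name/description pairs for every weapon page
weapon_description = scrape_description(url_weapon, number_of_weapons)

# Merge into the csv exports, as the deleted notebook did cell by cell
melee = pd.read_csv('melee.csv').merge(weapon_description, how='left', on='Name')
ranged = pd.read_csv('ranged.csv').merge(weapon_description, how='left', on='Name')
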
@@ -0,0 +1,123 @@
#!/usr/bin/env python
# coding: utf-8

# # Scrape data from aon2e and generate csvs to import into sqlite

# In[ ]:


# Dependencies
import pandas as pd
from bs4 import BeautifulSoup as bs, Tag, NavigableString
import os
from splinter import Browser

# Setting up Selenium
chrome_driver = os.path.join('..', 'chromedriver.exe')
executable_path = {'executable_path': chrome_driver}
browser = Browser('chrome', **executable_path, headless=False)

# Pandas config
pd.set_option('display.max_columns', None)


# In[ ]:


def scrape_description(url, id_number):

    # Empty lists to store the scraped values
    name_list = []
    description_list = []

    print(f'Beginning Data Retrieval')
    print(f'------------------------')

    # Loop from 1 to the value in id_number
    for page in range(1, id_number+1):

        browser.visit(url + str(page))
        html = browser.html
        soup = bs(html, 'html.parser')

        # Select only the content section
        content = soup.find(id='ctl00_MainContent_DetailedOutput')

        try:
            # Store the name: the title text, minus its nested <span>
            name = content.find('h1', class_='title')
            name.span.decompose()
            name = name.text

        except Exception:
            name = f'name: {page}'

        try:
            # Concatenate every node that follows the <hr> into one description
            description = ''
            start = content.find('hr')
            for e in start.next_siblings:
                if isinstance(e, Tag):
                    description = description + e.text.strip()
                elif isinstance(e, NavigableString):
                    description = description + e

        except Exception:
            description = f'description: {page}'

        print(f'{page} of {id_number} | {name}')

        # Append values to our empty lists
        name_list.append(name)
        description_list.append(description)

    print(f'------------------------')
    print(f'Data Retrieval Complete')

    # Create df with the scraped data
    data = {'Name': name_list, 'description': description_list}

    # Return a data frame
    return pd.DataFrame(data)


# In[ ]:


# scrape the descriptions
url_gear = 'https://2e.aonprd.com/Equipment.aspx?ID='
number_gear = 65  # pages to scrape

gear_description = scrape_description(url_gear, number_gear)


# In[ ]:


# csv file exported from https://2e.aonprd.com/Equipment.aspx
gear = pd.read_csv('gear.csv')


# In[ ]:


gear_description


# In[ ]:


gear = gear.merge(gear_description, how='left', on='Name')
gear['Level'].fillna(0, inplace=True)


# In[ ]:


gear.to_csv('gear.csv')


# In[ ]:
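
Neither file performs the sqlite import the title promises, and the splinter session is never closed (browser.quit() would release the chromedriver). A minimal sketch of the import step, assuming the three csvs produced above exist; the database name aon2e.db and the table names are placeholders:

import sqlite3

import pandas as pd

con = sqlite3.connect('aon2e.db')  # placeholder database file
for table in ('melee', 'ranged', 'gear'):
    # Each csv becomes one table; overwrite any previous run's table
    pd.read_csv(f'{table}.csv').to_sql(table, con, if_exists='replace', index=False)
con.close()
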