Create scrape_background.ipynb

merge-requests/34/head
Brian Haley 2019-08-16 18:09:02 -04:00
parent fb7e89e5a4
commit 6ad5787a9d
1 changed files with 144 additions and 0 deletions

View File

@ -0,0 +1,144 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Scrape data from aon2e and generate csvs to import in to sqlite"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Dependencies\n",
"import pandas as pd\n",
"from bs4 import BeautifulSoup as bs, Tag, NavigableString\n",
"import os\n",
"from splinter import Browser\n",
"\n",
"# Setting up Selenium\n",
"chrome_driver = os.path.join('..', 'chromedriver.exe')\n",
"executable_path = {'executable_path': chrome_driver}\n",
"browser = Browser('chrome', **executable_path, headless=False)\n",
"\n",
"# Pandas config\n",
"pd.set_option('display.max_columns', None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def scrape_description(url, id_number):\n",
"\n",
" # Empty lists to store the scraped values\n",
" name_list = []\n",
" description_list = []\n",
"\n",
" print(f'Beginning Data Retrieval')\n",
" print(f'------------------------')\n",
"\n",
" # Loop from 1 to the value in weapon_number\n",
" for page in range(1, id_number+1):\n",
"\n",
" browser.visit(url + str(page))\n",
" html = browser.html\n",
" soup = bs(html, 'html.parser')\n",
"\n",
" # Select only the content section\n",
" content = soup.find(id='ctl00_MainContent_DetailedOutput')\n",
"\n",
" try:\n",
" # Store the name and description\n",
" name = content.find('h1', class_='title')\n",
" name.span.decompose()\n",
" name = name.text\n",
"\n",
" except:\n",
" name = f'name: {page}'\n",
"\n",
" try:\n",
" description = ''\n",
" start = content.find('hr')\n",
" for e in start.next_siblings:\n",
" if isinstance(e, Tag):\n",
" description = description + e.text.strip()\n",
" elif isinstance(e, NavigableString):\n",
" description = description + e\n",
"\n",
" except:\n",
" description = f'name: {page}'\n",
"\n",
" print(f'{page} of {id_number} | {name}')\n",
"\n",
" # Append values to our empty lists\n",
" name_list.append(name) \n",
" description_list.append(description)\n",
"\n",
" print(f'------------------------')\n",
" print(f'Data Retrieval Complete')\n",
" \n",
" # Create df with the scraped data\n",
" data = {'Name': name_list, 'description': description_list}\n",
" \n",
" # Returns a data frame\n",
" return pd.DataFrame(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# scrape the descriptions\n",
"url_background = 'https://2e.aonprd.com/Equipment.aspx?ID='\n",
"number_background = 65 #65 to scrape\n",
"\n",
"description_background = scrape_description(url_gear, number_gear)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"gear.to_csv('background.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}