{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Scrape data from aon2e and generate csvs to import in to sqlite" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Dependencies\n", "import pandas as pd\n", "from bs4 import BeautifulSoup as bs, Tag, NavigableString\n", "import os\n", "from splinter import Browser\n", "\n", "# Setting up Selenium\n", "chrome_driver = os.path.join('..', 'chromedriver.exe')\n", "executable_path = {'executable_path': chrome_driver}\n", "browser = Browser('chrome', **executable_path, headless=False)\n", "\n", "# Pandas config\n", "pd.set_option('display.max_columns', None)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def scrape_description(url, id_number):\n", "\n", " # Empty lists to store the scraped values\n", " name_list = []\n", " description_list = []\n", "\n", " print(f'Beginning Data Retrieval')\n", " print(f'------------------------')\n", "\n", " # Loop from 1 to the value in weapon_number\n", " for page in range(1, id_number+1):\n", "\n", " browser.visit(url + str(page))\n", " html = browser.html\n", " soup = bs(html, 'html.parser')\n", "\n", " # Select only the content section\n", " content = soup.find(id='ctl00_MainContent_DetailedOutput')\n", "\n", " try:\n", " # Store the name\n", " name = content.find('h1', class_='title').a.text.strip()\n", " name\n", "\n", " except:\n", " name = f'name: {page}'\n", "\n", " try:\n", " # Start the loop after the link to the book\n", " start = content.find('a', class_='external-link').next_sibling\n", " description = ''\n", " for e in start.next_siblings:\n", " if isinstance(e, Tag):\n", " if e.name == 'br':\n", " if e.next_sibling.name == 'br':\n", " # If the next 2 elements are br skip this\n", " # loop it will be handled in the elif\n", " continue\n", " elif e.previous_sibling.name == 'br':\n", " # If this element and the previous are br\n", " # and the next is not append /n\n", " description = description + ' /n/n'\n", " else:\n", " # If there is just one br append /n\n", " description = description + ' /n'\n", " else:\n", " # Append the text inside the element\n", " description = description + e.text.strip()\n", " elif isinstance(e, NavigableString):\n", " # Since it is just a text append it\n", " description = description + e\n", "\n", " except:\n", " description = f'name: {page}'\n", "\n", " print(f'{page} of {id_number} | {name}')\n", "\n", " # Append values to our empty lists\n", " name_list.append(name) \n", " description_list.append(description)\n", "\n", " print(f'------------------------')\n", " print(f'Data Retrieval Complete')\n", " \n", " # Create df with the scraped data\n", " data = {'Name': name_list, 'description': description_list}\n", " \n", " # Returns a data frame\n", " return pd.DataFrame(data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# scrape the descriptions\n", "url_background = 'https://2e.aonprd.com/Backgrounds.aspx?ID='\n", "number_background = 50 # number to scrape\n", "\n", "description_background = scrape_description(url_background, number_background)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "description_background.to_csv('background.csv', encoding='UTF-8', index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 
}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 4 }