From 9bea77e449732107161c6ff25814fe886d3de457 Mon Sep 17 00:00:00 2001
From: Brian Haley
Date: Wed, 7 Aug 2019 16:41:55 -0400
Subject: [PATCH] Create scrape.ipynb

---
 scrape/scrape.ipynb | 448 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 448 insertions(+)
 create mode 100644 scrape/scrape.ipynb

diff --git a/scrape/scrape.ipynb b/scrape/scrape.ipynb
new file mode 100644
index 0000000..88cecf8
--- /dev/null
+++ b/scrape/scrape.ipynb
@@ -0,0 +1,448 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Scrape data from aon2e and generate CSVs to import into SQLite"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Dependencies\n",
+    "import pandas as pd\n",
+    "from bs4 import BeautifulSoup as bs\n",
+    "import requests\n",
+    "import time\n",
+    "import re\n",
+    "\n",
+    "# Pandas config\n",
+    "pd.set_option('display.max_columns', None)\n",
+    "\n",
+    "# Seconds to wait on aon2e before a request is abandoned\n",
+    "TIMEOUT = 30"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Ancestries TODO"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Base url of an ancestry page; the page ID is appended to it\n",
+    "url_ancestry = 'https://2e.aonprd.com/Ancestries.aspx?ID='\n",
+    "\n",
+    "# Empty list to store the ancestry data\n",
+    "ancestry = []\n",
+    "\n",
+    "# Request ancestry page 1, failing fast on a hang or an HTTP error\n",
+    "response_ancestry = requests.get(f'{url_ancestry}1', timeout=TIMEOUT)\n",
+    "response_ancestry.raise_for_status()\n",
+    "\n",
+    "# Use BS4 html parser to generate soup\n",
+    "soup_ancestry = bs(response_ancestry.text, 'html.parser')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Store the data needed from the soup\n",
+    "name = soup_ancestry.find(id='ctl00_MainContent_DetailedOutput').a.text\n",
+    "traits = [trait.a.text for trait in soup_ancestry.find_all(class_='trait')]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Raw description\n",
+    "description = soup_ancestry.find(id='ctl00_MainContent_DetailedOutput').text"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Animal Companions TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Animals (Rentals/Sales) TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Arcane Schools TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Arcane Thesis TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Archetypes TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Armor TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Backgrounds TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Bloodlines TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Champion Causes TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Champion Tenets TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Classes TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Class Kits TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Class Sample Builds TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Conditions TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Deities TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Doctrines TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Domains TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Druidic Orders TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Equipment TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Familiar Abilities TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Feats TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Hazards TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Hunter's Edges TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Instincts TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Languages TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Muses TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Rackets TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Research Fields TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Rituals TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Rules TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Shields TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Skills TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Skills (General) TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Spells TODO"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Base url of a spell page; the page ID is appended to it\n",
+    "url_spells = 'https://2e.aonprd.com/Spells.aspx?ID='\n",
+    "\n",
+    "# Number of spells taken from https://2e.aonprd.com/Sources.aspx?ID=1\n",
+    "spell_number = 343\n",
+    "\n",
+    "# Request the page with that ID, failing fast on a hang or an HTTP error\n",
+    "response_spells = requests.get(f'{url_spells}{spell_number}', timeout=TIMEOUT)\n",
+    "response_spells.raise_for_status()\n",
+    "\n",
+    "# Use BS4 html parser to generate soup\n",
+    "soup_spells = bs(response_spells.text, 'html.parser')\n",
+    "\n",
+    "# Select only the content\n",
+    "content = soup_spells.find(id='ctl00_MainContent_DetailedOutput')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The heading text is the spell name, the word 'Spell' and the level; split on\n",
+    "# the last 'Spell' only, so spaces inside the name are kept\n",
+    "name, level = [part.strip() for part in content.h1.text.rsplit('Spell', 1)]\n",
+    "traits = [trait.a.text.strip() for trait in content.find_all(class_='trait')]\n",
+    "source = content.find(class_='external-link').text.strip()\n",
+    "traditions = [tradition.text.strip() for tradition\n",
+    "              in content.find_all('a', href=re.compile(\"Tradition\"))]\n",
+    "\n",
+    "## Actions section\n",
+    "# Number of actions each action icon stands for, keyed by its alt text\n",
+    "# NOTE(review): 'Two Actions' is assumed from the other two labels - confirm\n",
+    "action_icons = {'Single Action': 1, 'Two Actions': 2, 'Three Actions': 3}\n",
+    "actions = []\n",
+    "# Text fragments found on the Cast line, reused below for the components\n",
+    "cast_text = []\n",
+    "# Start at cast and then iterate over the next elements on the line\n",
+    "for e in content.find('b', string='Cast').next_siblings:\n",
+    "    if isinstance(e, str):\n",
+    "        # No icon: either the 'to' joining two icons or a written cast time\n",
+    "        cast_text.append(e)\n",
+    "        cast_time = e.split(' (')[0].strip()\n",
+    "        if 'to' in e.split():\n",
+    "            # If to exists it must be 1 to 3 actions\n",
+    "            actions.append(2)\n",
+    "        elif cast_time:\n",
+    "            # A written cast time such as '1 minute'; empty text is skipped\n",
+    "            actions.append(cast_time)\n",
+    "    elif e.name in ('br', 'hr'):\n",
+    "        # Stop at the end of the line or at the horizontal rule\n",
+    "        break\n",
+    "    elif 'actiondark' in e.get('class', []) and e.get('alt') in action_icons:\n",
+    "        # Only icons carrying the actiondark class are counted\n",
+    "        actions.append(action_icons[e.get('alt')])\n",
+    "\n",
+    "# Read the components from every text fragment of the Cast line, so a line\n",
+    "# that starts with an icon instead of text is handled as well\n",
+    "components = [word for fragment in cast_text\n",
+    "              for word in re.split(r'[\\s(),;]+', fragment)\n",
+    "              if word in ('material', 'somatic', 'verbal')]\n",
+    "\n",
+    "# A spell without a Range entry (or without text after it) gets None\n",
+    "range_label = content.find('b', string='Range')\n",
+    "range_text = range_label.next_sibling if range_label is not None else None\n",
+    "spell_range = (range_text.replace(';', '').strip()\n",
+    "               if isinstance(range_text, str) else None)\n",
+    "\n",
+    "# NOTE(review): this is whatever sits right before the first horizontal rule,\n",
+    "# so it is the target only when the target is the last field listed - confirm\n",
+    "target = content.find('hr').previous.strip()\n",
+    "\n",
+    "# save TODO\n",
+    "\n",
+    "# duration TODO\n",
+    "\n",
+    "## Description section\n",
+    "# NOTE(review): only the first node after the horizontal rule is kept - confirm\n",
+    "description = content.find('hr').next\n",
+    "actions"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Traits TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Weapons TODO"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}