Create scrape.ipynb
							parent
							
								
									dfefa0151c
								
							
						
					
					
						commit
						8afe1dc662
					
				|  | @ -0,0 +1,448 @@ | |||
| { | ||||
|  "cells": [ | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "# Scrape data from aon2e and generate CSVs to import into SQLite" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 173, | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "# Dependencies\n", | ||||
|     "import pandas as pd\n", | ||||
|     "from bs4 import BeautifulSoup as bs\n", | ||||
|     "import requests\n", | ||||
|     "import time\n", | ||||
|     "import re\n", | ||||
|     "\n", | ||||
|     "# Pandas config\n", | ||||
|     "pd.set_option('display.max_columns', None)" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Ancestries TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": null, | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "# url that contains all the links\n", | ||||
|     "url_ancestry = 'https://2e.aonprd.com/Ancestries.aspx?ID='\n", | ||||
|     "\n", | ||||
|     "# Empty list to store the ancestry data\n", | ||||
|     "ancestry = []\n", | ||||
|     "\n", | ||||
|     "# Make the request to the aon2e\n", | ||||
|     "response_ancestry = requests.get(f'{url_ancestry}1')\n", | ||||
|     "\n", | ||||
|     "# Use BS4 html parser to generate soup\n", | ||||
|     "soup_ancestry = bs(response_ancestry.text, 'html.parser')" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": null, | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "# Store the data needed from the soup\n", | ||||
|     "name = soup_ancestry.find(id='ctl00_MainContent_DetailedOutput').a.text\n", | ||||
|     "traits = [trait.a.text for trait in soup_ancestry.find_all(class_='trait')]" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": null, | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "# Raw description\n", | ||||
|     "description = soup_ancestry.find(id='ctl00_MainContent_DetailedOutput').text" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Animal Companions TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Animals (Rentals/Sales) TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Arcane Schools TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Arcane Thesis TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Archetypes TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Armor TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Backgrounds TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Bloodlines TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Champion Causes TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Champion Tenets TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Classes TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Class Kits TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Class Sample Builds TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Conditions TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Deities TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Doctrines TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Domains TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Druidic Orders TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Equipment TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Familiar Abilities TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Feats TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Hazards TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Hunter's Edges TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Instincts TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Languages TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Muses TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Rackets TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Research Fields TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Rituals TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Rules TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Shields TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Skills TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Skills (General) TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Spells TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Traits TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "markdown", | ||||
|    "metadata": {}, | ||||
|    "source": [ | ||||
|     "## Weapons TODO" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 306, | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "# url that contains all the links\n", | ||||
|     "url_spells = 'https://2e.aonprd.com/Spells.aspx?ID='\n", | ||||
|     "\n", | ||||
|     "# Number of spells taken from https://2e.aonprd.com/Sources.aspx?ID=1\n", | ||||
|     "spell_number = 343\n", | ||||
|     "\n", | ||||
|     "# Make the request to the aon2e\n", | ||||
|     "response_spells = requests.get(f'{url_spells}{spell_number}')\n", | ||||
|     "\n", | ||||
|     "# Use BS4 html parser to generate soup\n", | ||||
|     "soup_spells = bs(response_spells.text, 'html.parser')\n", | ||||
|     "\n", | ||||
|     "# Select only the content\n", | ||||
|     "content = soup_spells.find(id='ctl00_MainContent_DetailedOutput')" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 323, | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "['1 minute']\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "name, level = content.h1.text.replace(' ', '').split('Spell')\n", | ||||
|     "traits = [trait.a.text.strip() for trait in content.find_all(class_='trait')]\n", | ||||
|     "source = content.find(class_='external-link').text.strip()\n", | ||||
|     "traditions = [tradition.text.strip() for tradition\n", | ||||
|     "              in content.find_all('a', href=re.compile(\"Tradition\"))]\n", | ||||
|     "\n", | ||||
|     "## Actions sections\n", | ||||
|     "actions = []\n", | ||||
|     "# Start at cast and then iterate over the next elements on the line\n", | ||||
|     "for e in content.find('b', text='Cast').next_siblings:\n", | ||||
|     "    if e.name == 'br':\n", | ||||
|     "        # If the end of the line is reached break the loop\n", | ||||
|     "        break\n", | ||||
|     "    elif e.name == 'hr':\n", | ||||
|     "        # If a horizontal line is reached\n", | ||||
|     "        break\n", | ||||
|     "    try:\n", | ||||
|     "        if e['alt']=='Single Action'  and 'actiondark' in e['class']:\n", | ||||
|     "            # If it's the single action icon\n", | ||||
|     "            actions.append(1)\n", | ||||
|     "        elif e['alt']=='Three Actions' and 'actiondark' in e['class']:\n", | ||||
|     "            # If it is the three action icon\n", | ||||
|     "            actions.append(3)\n", | ||||
|     "    except TypeError:\n", | ||||
|     "        # If there is no icon handle it as a string\n", | ||||
|     "        if 'to' in e:\n", | ||||
|     "            # If to exists it must be 1 to 3 actions\n", | ||||
|     "            actions.append(2)\n", | ||||
|     "        else:\n", | ||||
|     "            actions.append(e.split(' (')[0].strip())\n", | ||||
|     "        continue\n", | ||||
|     "    except KeyError:\n", | ||||
|     "        continue\n", | ||||
|     "        \n", | ||||
|     "components = content.find('b', text='Cast').next.next \\\n", | ||||
|     "    .replace('(', '').replace(')', ''). replace(',', '').split(' ')\n", | ||||
|     "components = [x for x in components if x\n", | ||||
|     "              in ['material', 'somatic', 'verbal']]\n", | ||||
|     "\n", | ||||
|     "try:\n", | ||||
|     "    # If the spell has a range\n", | ||||
|     "    spell_range = content.find('b', text='Range').next.next.replace(';', '')\n", | ||||
|     "except AttributeError:\n", | ||||
|     "    # If the spell doesn't have a range\n", | ||||
|     "    spell_range = None\n", | ||||
|     "\n", | ||||
|     "target = content.find('hr').previous.strip()\n", | ||||
|     "\n", | ||||
|     "# save TODO\n", | ||||
|     "\n", | ||||
|     "# duration TODO\n", | ||||
|     "\n", | ||||
|     "## Description section\n", | ||||
|     "description = content.find('hr').next\n", | ||||
|     "print(actions)" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": null, | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [] | ||||
|   } | ||||
|  ], | ||||
|  "metadata": { | ||||
|   "kernelspec": { | ||||
|    "display_name": "Python 3", | ||||
|    "language": "python", | ||||
|    "name": "python3" | ||||
|   }, | ||||
|   "language_info": { | ||||
|    "codemirror_mode": { | ||||
|     "name": "ipython", | ||||
|     "version": 3 | ||||
|    }, | ||||
|    "file_extension": ".py", | ||||
|    "mimetype": "text/x-python", | ||||
|    "name": "python", | ||||
|    "nbconvert_exporter": "python", | ||||
|    "pygments_lexer": "ipython3", | ||||
|    "version": "3.6.9" | ||||
|   } | ||||
|  }, | ||||
|  "nbformat": 4, | ||||
|  "nbformat_minor": 4 | ||||
| } | ||||
		Loading…
	
		Reference in New Issue