pathfinder-2-sqlite-MIRROR/scrape/scrape.ipynb

449 lines
9.0 KiB
Plaintext
Raw Normal View History

2019-08-07 16:41:55 -04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Scrape data from aon2e and generate csvs to import in to sqlite"
]
},
{
"cell_type": "code",
"execution_count": 173,
"metadata": {},
"outputs": [],
"source": [
"# Dependencies\n",
"import pandas as pd\n",
"from bs4 import BeautifulSoup as bs\n",
"import requests\n",
"import time\n",
"import re\n",
"\n",
"# Pandas config\n",
"pd.set_option('display.max_columns', None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Ancestries TODO"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# url that contains all the links\n",
"url_ancestry = 'https://2e.aonprd.com/Ancestries.aspx?ID='\n",
"\n",
"# Empty list to store the ancestry data\n",
"ancestry = []\n",
"\n",
"# Make the request to the aon2e\n",
"response_ancestry = requests.get(f'{url_ancestry}1')\n",
"\n",
"# Use BS4 html parser to generate soup\n",
"soup_ancestry = bs(response_ancestry.text, 'html.parser')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Store the data needed from the soup\n",
"name = soup_ancestry.find(id='ctl00_MainContent_DetailedOutput').a.text\n",
"traits = [trait.a.text for trait in soup_ancestry.find_all(class_='trait')]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Raw description\n",
"description = soup_ancestry.find(id='ctl00_MainContent_DetailedOutput').text"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Animal Companions TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Animals (Rentals/Sales) TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Arcane Schools TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Arcane Thesis TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Archetypes TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Armor TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Backgrounds TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Bloodlines TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Champion Causes TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Champion Tenets TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Classes TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Class Kits TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Class Sample Builds TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Conditions TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deities TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Doctrines TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Domains TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Druidic Orders TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Equipment TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Familiar Abilities TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Feats TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Hazards TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Hunter's Edges TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Instincts TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Languages TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Muses TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Rackets TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Research Fields TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Rituals TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Rules TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Shields TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Skills TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Skills (General) TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Spells TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Traits TODO"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Weapons TODO"
]
},
{
"cell_type": "code",
"execution_count": 306,
"metadata": {},
"outputs": [],
"source": [
"# url that contains all the links\n",
"url_spells = 'https://2e.aonprd.com/Spells.aspx?ID='\n",
"\n",
"# Number of spells taken from https://2e.aonprd.com/Sources.aspx?ID=1\n",
"spell_number = 343\n",
"\n",
"# Make the request to the aon2e\n",
"response_spells = requests.get(f'{url_spells}{spell_number}')\n",
"\n",
"# Use BS4 html parser to generate soup\n",
"soup_spells = bs(response_spells.text, 'html.parser')\n",
"\n",
"# Select only the content\n",
"content = soup_spells.find(id='ctl00_MainContent_DetailedOutput')"
]
},
{
"cell_type": "code",
"execution_count": 323,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['1 minute']\n"
]
}
],
"source": [
"name, level = content.h1.text.replace(' ', '').split('Spell')\n",
"traits = [trait.a.text.strip() for trait in content.find_all(class_='trait')]\n",
"source = content.find(class_='external-link').text.strip()\n",
"traditions = [tradition.text.strip() for tradition\n",
" in content.find_all('a', href=re.compile(\"Tradition\"))]\n",
"\n",
"## Actions sections\n",
"actions = []\n",
"# Start at cast and then iterate over the next elements on the line\n",
"for e in content.find('b', text='Cast').next_siblings:\n",
" if e.name == 'br':\n",
" # If the end of the line is reached break the loop\n",
" break\n",
" elif e.name == 'hr':\n",
" # If a horizontal line is reached\n",
" break\n",
" try:\n",
" if e['alt']=='Single Action' and 'actiondark' in e['class']:\n",
" # If it's the single action icon\n",
" actions.append(1)\n",
" elif e['alt']=='Three Actions' and 'actiondark' in e['class']:\n",
" # If it is the three action icon\n",
" actions.append(3)\n",
" except TypeError:\n",
" # If there is no icon handle it as a string\n",
" if 'to' in e:\n",
" # If to exists it must be 1 to 3 actions\n",
" actions.append(2)\n",
" else:\n",
" actions.append(e.split(' (')[0].strip())\n",
" continue\n",
" except KeyError:\n",
" continue\n",
" \n",
"components = content.find('b', text='Cast').next.next \\\n",
" .replace('(', '').replace(')', ''). replace(',', '').split(' ')\n",
"components = [x for x in components if x\n",
" in ['material', 'somatic', 'verbal']]\n",
"\n",
"try:\n",
" # If the spell has a range\n",
" spell_range = content.find('b', text='Range').next.next.replace(';', '')\n",
"except AttributeError:\n",
" # If the spell doesn't have a range\n",
" spell_range = None\n",
"\n",
"target = content.find('hr').previous.strip()\n",
"\n",
"# save TODO\n",
"\n",
"# duration TODO\n",
"\n",
"## Description section\n",
"description = content.find('hr').next\n",
"print(actions)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}