Create scrape.ipynb
parent
e0c87292ad
commit
9bea77e449
|
@ -0,0 +1,448 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Scrape data from aon2e and generate CSVs to import into SQLite"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 173,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Dependencies\n",
|
||||
"import pandas as pd\n",
|
||||
"from bs4 import BeautifulSoup as bs\n",
|
||||
"import requests\n",
|
||||
"import time\n",
|
||||
"import re\n",
|
||||
"\n",
|
||||
"# Pandas config\n",
|
||||
"pd.set_option('display.max_columns', None)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Ancestries TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Base URL of the ancestry pages; the ancestry ID is appended to it\n",
|
||||
"url_ancestry = 'https://2e.aonprd.com/Ancestries.aspx?ID='\n",
|
||||
"\n",
|
||||
"# Empty list to store the ancestry data\n",
|
||||
"ancestry = []\n",
|
||||
"\n",
|
||||
"# Make the request to aon2e\n",
|
||||
"response_ancestry = requests.get(f'{url_ancestry}1')\n",
|
||||
"\n",
|
||||
"# Use BS4 html parser to generate soup\n",
|
||||
"soup_ancestry = bs(response_ancestry.text, 'html.parser')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Store the data needed from the soup\n",
|
||||
"name = soup_ancestry.find(id='ctl00_MainContent_DetailedOutput').a.text\n",
|
||||
"traits = [trait.a.text for trait in soup_ancestry.find_all(class_='trait')]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Raw description\n",
|
||||
"description = soup_ancestry.find(id='ctl00_MainContent_DetailedOutput').text"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Animal Companions TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Animals (Rentals/Sales) TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Arcane Schools TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Arcane Thesis TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Archetypes TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Armor TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Backgrounds TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Bloodlines TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Champion Causes TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Champion Tenets TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Classes TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Class Kits TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Class Sample Builds TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Conditions TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Deities TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Doctrines TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Domains TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Druidic Orders TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Equipment TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Familiar Abilities TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Feats TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Hazards TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Hunter's Edges TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Instincts TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Languages TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Muses TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Rackets TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Research Fields TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Rituals TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Rules TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Shields TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Skills TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Skills (General) TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Spells TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Traits TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Weapons TODO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 306,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Base URL of the spell pages; the spell ID is appended to it\n",
|
||||
"url_spells = 'https://2e.aonprd.com/Spells.aspx?ID='\n",
|
||||
"\n",
|
||||
"# Number of spells (taken from https://2e.aonprd.com/Sources.aspx?ID=1); used below as the ID of the spell page to request\n",
|
||||
"spell_number = 343\n",
|
||||
"\n",
|
||||
"# Make the request to aon2e\n",
|
||||
"response_spells = requests.get(f'{url_spells}{spell_number}')\n",
|
||||
"\n",
|
||||
"# Use BS4 html parser to generate soup\n",
|
||||
"soup_spells = bs(response_spells.text, 'html.parser')\n",
|
||||
"\n",
|
||||
"# Select only the content\n",
|
||||
"content = soup_spells.find(id='ctl00_MainContent_DetailedOutput')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 323,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['1 minute']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"name, level = content.h1.text.replace(' ', '').split('Spell')\n",
|
||||
"traits = [trait.a.text.strip() for trait in content.find_all(class_='trait')]\n",
|
||||
"source = content.find(class_='external-link').text.strip()\n",
|
||||
"traditions = [tradition.text.strip() for tradition\n",
|
||||
" in content.find_all('a', href=re.compile(\"Tradition\"))]\n",
|
||||
"\n",
|
||||
"## Actions section\n",
|
||||
"actions = []\n",
|
||||
"# Start at the 'Cast' label and iterate over the elements that follow it on the same line\n",
|
||||
"for e in content.find('b', text='Cast').next_siblings:\n",
|
||||
" if e.name == 'br':\n",
|
||||
" # If the end of the line is reached break the loop\n",
|
||||
" break\n",
|
||||
" elif e.name == 'hr':\n",
|
||||
" # If a horizontal rule (<hr>) is reached, stop as well\n",
|
||||
" break\n",
|
||||
" try:\n",
|
||||
" if e['alt']=='Single Action' and 'actiondark' in e['class']:\n",
|
||||
" # If it's the single action icon\n",
|
||||
" actions.append(1)\n",
|
||||
" elif e['alt']=='Three Actions' and 'actiondark' in e['class']:\n",
|
||||
" # If it is the three action icon\n",
|
||||
" actions.append(3)\n",
|
||||
" except TypeError:\n",
|
||||
" # If there is no icon (indexing raised TypeError), handle the element as a string\n",
|
||||
" if 'to' in e:\n",
|
||||
" # If 'to' is present, the text must read '1 to 3 actions'\n",
|
||||
" actions.append(2)\n",
|
||||
" else:\n",
|
||||
" actions.append(e.split(' (')[0].strip())\n",
|
||||
" continue\n",
|
||||
" except KeyError:\n",
|
||||
" continue\n",
|
||||
" \n",
|
||||
"components = content.find('b', text='Cast').next.next \\\n",
|
||||
" .replace('(', '').replace(')', ''). replace(',', '').split(' ')\n",
|
||||
"components = [x for x in components if x\n",
|
||||
" in ['material', 'somatic', 'verbal']]\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" # If the spell has a range\n",
|
||||
" spell_range = content.find('b', text='Range').next.next.replace(';', '')\n",
|
||||
"except AttributeError:\n",
|
||||
" # If the spell doesn't have a range\n",
|
||||
" spell_range = None\n",
|
||||
"\n",
|
||||
"target = content.find('hr').previous.strip()\n",
|
||||
"\n",
|
||||
"# save TODO\n",
|
||||
"\n",
|
||||
"# duration TODO\n",
|
||||
"\n",
|
||||
"## Description section\n",
|
||||
"description = content.find('hr').next\n",
|
||||
"print(actions)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
Loading…
Reference in New Issue