import requests, bs4
import pandas as pd
from urllib.parse import urljoin
import plotly.express as px


def get_soup(url, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would freeze the whole crawl.

    Returns:
        The response body parsed with the ``lxml`` parser.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    return bs4.BeautifulSoup(res.text, "lxml")


def get_urls(page_url):
    """Collect the base URL of every located sub-area listed on a page.

    Every ``<div class="area">`` on the page is searched for an anchor whose
    class attribute is ``mappin located``. Areas without such an anchor (or
    whose anchor carries no ``href``) are skipped.

    Args:
        page_url: URL of the listing page to scan.

    Returns:
        A list of absolute URLs, one per located area, each with the last
        path segment of the anchor's target removed (so it ends in ``/``).
    """
    soup = get_soup(page_url)
    link_list = []
    for area in soup.find_all("div", {"class": "area"}):
        anchor = area.find('a', {"class": "mappin located"})
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue

        absolute = urljoin('https://www.thecrag.com', href)
        # Resolving '.' drops the final path segment and leaves the enclosing
        # "directory". The previous urljoin(..., ' ').strip() trick relied on a
        # lone space being parsed as a path; Python builds whose urlsplit
        # strips leading whitespace treat it as an empty reference and hand
        # the URL back unchanged.
        link_list.append(urljoin(absolute, '.'))
    return link_list


def get_lat_long(full_url):
    """Read the 'Lat / Long:' entry from an area's guide page.

    ``full_url`` is resolved against the relative reference ``guide`` and the
    resulting page is fetched. If several paragraphs contain the text
    ``Lat / Long:``, the last one on the page is used.

    Args:
        full_url: URL of (or inside) the area whose guide page is wanted.

    Returns:
        The text between the first and second colon of the matching
        paragraph, split on commas (raw, unstripped strings), or
        ``[None, None]`` when no paragraph matches.
    """
    guide_page = get_soup(urljoin(full_url, 'guide'))
    paragraphs = guide_page.find_all('p')

    # Walk backwards so the first hit is the last matching paragraph.
    marker = next(
        (p for p in reversed(paragraphs) if 'Lat / Long:' in p.text),
        None,
    )
    if not marker:
        return [None, None]

    after_colon = marker.text.split(':')[1]
    return after_colon.split(',')


def get_crags_from_reg(url):
    """Scrape one region page into a DataFrame with one row per located crag.

    Every ``<div class="area">`` on the page is a candidate. A candidate is
    skipped when it has no ``div.loc`` containing an ``a`` with class
    attribute ``mappin located`` (and an ``href``), or when it lacks a
    ``div.name`` or ``div.routes``. For the rest, the coordinates are fetched
    from the crag's guide page via ``get_lat_long``.

    Args:
        url: URL of the region listing page.

    Returns:
        A DataFrame with columns ``name``, ``Sport``, ``routes``, ``url``,
        ``lat`` and ``long``. Rows with any missing value (e.g. no
        coordinates found) are dropped. ``routes`` is an int with thousands
        separators removed, ``lat``/``long`` are floats. ``name`` is the raw
        text of the name div and is not cleaned. The index is the position of
        each crag among the page's area divs.
    """
    columns = ['name', 'Sport', 'routes', 'url', 'lat', 'long']
    soup = get_soup(url)

    rows = []
    positions = []
    for i, crag in enumerate(soup.find_all("div", {"class": "area"})):
        # Any link in the chain may be absent; a missing 'loc' div used to
        # surface as an uncaught AttributeError.
        loc = crag.find('div', {"class": "loc"})
        anchor = loc.find('a', {"class": "mappin located"}) if loc is not None else None
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue

        name_div = crag.find('div', {"class": "name"})
        routes_div = crag.find('div', {"class": "routes"})
        if name_div is None or routes_div is None:
            # Checked before the network call so incomplete entries cost nothing.
            continue

        full_link = f'https://www.thecrag.com{href}'
        lat_long = get_lat_long(urljoin(full_link, 'guide'))

        rows.append({
            'name': name_div.text,
            'Sport': 'Climbing',
            'routes': routes_div.text,
            # '.' drops the last path segment; the former urljoin(..., ' ')
            # trick only did so while a lone space was parsed as a path.
            'url': urljoin(full_link, '.'),
            'lat': lat_long[0],
            'long': lat_long[1],
        })
        positions.append(i)

    # Build the frame once instead of enlarging it cell by cell.
    df = pd.DataFrame(rows, index=positions, columns=columns)
    df['routes'] = df['routes'].str.replace(',', '')
    df = df.dropna()
    return df.astype({'name': str, 'Sport': str, 'routes': int, 'url': str, 'lat': float, 'long': float})


# Crawl: country page -> state pages -> region pages -> crags.
aus = 'https://www.thecrag.com/en/climbing/australia'
#state = 'https://www.thecrag.com/en/climbing/australia/western-australia'
states = get_urls(aus)

# Collect the per-region frames and concatenate once at the end; calling
# pd.concat inside the loop re-copies everything gathered so far each time.
frames = []
for state in states:
    regions = get_urls(state)
    for crag in regions:
        reg = get_crags_from_reg(crag)
        frames.append(reg)

non_empty = [frame for frame in frames if not frame.empty]
if non_empty:
    df = pd.concat(non_empty, axis=0)
elif frames:
    # Nothing scraped anywhere: keep the last (empty, but typed) frame.
    df = frames[-1]
else:
    df = pd.DataFrame()

# The index is written too, so the sheet keeps each crag's position within
# its region page; it comes back as the 'Unnamed: 0' column when reloaded.
df.to_excel("Aus.xlsx")


# Replace the repeating per-region index with a clean 0..n-1 range.
df = df.reset_index(drop=True)


# Reload the saved sheet (lets this cell run without repeating the crawl).
df = pd.read_excel('Aus.xlsx')

df


# Export the notebook to HTML; os.system returns the command's exit status.
import os
os.system('jupyter nbconvert --to html TheCragAPI.ipynb')

0

	Unnamed: 0	name	Sport	routes	url	lat	long
0	1	Arapiles / Djurite\ncrag\n	Climbing	3181	https://www.thecrag.com/en/climbing/australia/...	-36.756518	141.808342
1	2	\n\n Warning\nGrampians / Gariwerd\ncrag\n	Climbing	8751	https://www.thecrag.com/en/climbing/australia/...	-37.155923	142.758590
2	3	Mt Hope\ncrag\n	Climbing	64	https://www.thecrag.com/en/climbing/australia/...	-35.989768	144.200941
3	4	Mt Kooyoora\ncrag\n	Climbing	66	https://www.thecrag.com/en/climbing/australia/...	-36.585072	143.697828
4	5	Mt Cole\ncrag\n	Climbing	40	https://www.thecrag.com/en/climbing/australia/...	-37.226904	143.198301
...	...	...	...	...	...	...	...
479	1	Killiecrankie\ncrag\n	Climbing	106	https://www.thecrag.com/en/climbing/australia/...	-39.799621	147.863068
480	2	Mt Strzelecki\narea\n	Climbing	2	https://www.thecrag.com/en/climbing/australia/...	-40.203883	148.072358
481	1	2 tree buttress\ncrag\n	Climbing	4	https://www.thecrag.com/en/climbing/australia/...	-10.433873	105.666822
482	2	Red crab rock\narea\n	Climbing	3	https://www.thecrag.com/en/climbing/australia/...	-10.434245	105.666504
483	1	Lost world boulder\narea\n	Climbing	2	https://www.thecrag.com/en/climbing/australia/...	-10.535678	105.636355

Web scraping The Crag