Web Scraping

An example of web scraping using BeautifulSoup.

Code

# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import requests
from bs4 import BeautifulSoup
import re

My goal is to collect information on the Colorado 14ers, which are peaks in Colorado over 14,000 feet. There are 53 ranked 14ers, which means they have at least 300 feet of prominence, and an additional 5 unranked peaks.

The website I’ll be scraping from is here (https://www.14ers.com/14ers), and I plan to extract the following information:

Peaks - Peak Name - Link to Peak’s Page - Number of Routes - Elevation - Rank - Mountain Range - National Park - Forest - Latitude/Longitude - County - Towns - Member Ascents - Winter Ascents - Ski Descents - From a Peak’s Standard Route: - Class - Altitude Gained - Distance Traveled

Members

As a note, there is a member database containing personal statistics of individuals on the 14ers. This could be interesting to come back to.

Code

# main 14er list: contains a table with high level information
url = 'https://www.14ers.com/14ers'
# a timeout keeps a stalled connection from hanging the script indefinitely
page = requests.get(url, timeout=30)
# fail loudly on an HTTP error instead of silently parsing an error page
page.raise_for_status()
soup = BeautifulSoup(page.text, 'lxml')

# extract name, link, and number of routes from each row of the first table
# on the page; the first row is skipped (presumably the header row)
main_table = soup.find('table')
table_rows_html = main_table.find_all('tr')[1:]
url_start = 'https://www.14ers.com'
# peak name is the text of the row's first link, with non-breaking spaces removed
peak_names = [row.find('a').text.replace('\xa0', '') for row in table_rows_html]
# the href is appended to the site root to build the full address
peak_links = [f'{url_start}{row.find("a").get("href")}' for row in table_rows_html]
# the route count is read from the third-from-last cell of each row
num_routes = [int(row.find_all('td')[-3].text) for row in table_rows_html]

# column store for the final dataframe: the first three columns are known now,
# the remaining (empty) lists are filled in later, one entry per peak
basics = {
    'peak': peak_names, 'links': peak_links, 'routes': num_routes,
    'elevation': [], 'rank': [], 'mountain_range': [], 'nat_park': [],
    'forest': [], 'lat_long': [], 'county': [], 'towns': [],
    'member_ascents': [], 'winter_ascents': [], 'ski_descents': [],
}

Code

def _parse_member_count(text, label):
    """Return the integer that precedes ``label`` in a checklist entry.

    Thousands separators (commas) are removed before conversion.
    Raises ValueError if ``label`` does not occur in ``text``.
    """
    match = re.search(rf'(.*)( {label})', text)
    if match is None:
        raise ValueError(f'could not find "{label}" in checklist entry: {text!r}')
    return int(match.group(1).replace(',', ''))


def get_peak_stats(url, timeout=30):
    """Return key stats that are only listed on an individual peak's page.

    These stats are not available on the main 14er list page.

    Args:
        url: address of the peak's page.
        timeout: seconds to wait for the server before giving up
            (passed to ``requests.get``).

    Returns:
        dict with the keys
        ``elevation`` (int), ``rank`` (str), ``mountain_range`` (str),
        ``nat_park`` (str or None), ``forest`` (list of str or None),
        ``lat_long`` (list of float or None), ``county`` (list of str or None),
        ``towns`` (list of str or None), ``member_ascents`` (int),
        ``winter_ascents`` (int) and ``ski_descents`` (int).
        A value stays None when the matching entry is not found on the page.

    Raises:
        requests.HTTPError: the server answered with an error status.
        ValueError: a checklist entry does not contain the expected label.
    """
    # create return dictionary
    peak_stats = {'elevation': None, 'rank': None, 'mountain_range': None,
                  'nat_park': None, 'forest': None, 'lat_long': None,
                  'county': None, 'towns': None, 'member_ascents': None,
                  'winter_ascents': None, 'ski_descents': None}

    # go to page; fail early on a stalled connection or an HTTP error
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'lxml')

    # main stats
    stats_html = soup.find('div', class_='sidebar_content')
    all_stats_html = stats_html.find_all('div', class_='stat')

    # elevation, rank and range are read by position
    # NOTE(review): assumes entries 0, 2 and 3 are always elevation, rank and
    # range on every peak page - confirm against the site
    elevation_text = re.search(r'\d{2},\d{3}', all_stats_html[0].text).group()
    peak_stats['elevation'] = int(elevation_text.replace(',', ''))
    peak_stats['rank'] = all_stats_html[2].text.replace('CO 14er Rank', '')
    peak_stats['mountain_range'] = all_stats_html[3].text.replace('Range', '')

    # Nat Park and Forest(s) may or may not be present, and most of the next
    # entries have potential to be plural or singular.
    #
    # Inconsistencies after Range necessitate conditionals.
    #     - Nat Park (if any)
    #     - Forest vs Forests (if any)
    #     - Lat/Long
    #     - County vs Counties
    #     - Town vs. Towns (should always be "Towns", but just in case)
    for stat in all_stats_html:
        stat_text = stat.text
        # Nat Park
        if 'Nat Park' in stat_text:
            peak_stats['nat_park'] = stat_text.replace('Nat Park', '')
        # Forest(s)
        elif 'Forest' in stat_text:
            if 'Forests' in stat_text:
                # plural: one link per forest
                forest = [forest_link.text for forest_link in stat.find_all('a')]
            else:
                forest = [stat_text.replace('Forest', '')]
            peak_stats['forest'] = forest
        # Lat/Long
        elif 'Lat/Lon' in stat_text:
            coordinates = stat_text.replace('Lat/Lon', '').split(', ')
            peak_stats['lat_long'] = [float(loc) for loc in coordinates]
        # County or Counties
        elif 'Count' in stat_text:
            if 'County' in stat_text:
                county = [stat_text.replace('County', '')]
            else:
                # plural: the names are separated by <br/> tags inside the
                # value span, so split the raw HTML rather than the text
                county = str(stat.find('span', class_='value'))
                county = county.replace('<span class="value">', '')
                county = county.replace('</span>', '').split('<br/>')
            peak_stats['county'] = county
        # Town(s)
        elif 'Town' in stat_text:
            if 'Towns' in stat_text:
                towns = stat_text.replace('Towns', '').split(', ')
            else:
                # singular label: strip 'Town' (stripping 'Towns' here would
                # leave the label attached to the value)
                towns = [stat_text.replace('Town', '')]
            peak_stats['towns'] = towns

    # the checklists section contains ascent and descent data
    checklists_html = soup.find('ul', class_='bulleted')
    all_checklists_html = checklists_html.find_all('li')

    # the first three entries are read as member ascents, winter ascents
    # and ski descents, in that order
    peak_stats['member_ascents'] = _parse_member_count(
        all_checklists_html[0].text, 'Member Ascents')
    peak_stats['winter_ascents'] = _parse_member_count(
        all_checklists_html[1].text, 'Member Winter Ascents')
    peak_stats['ski_descents'] = _parse_member_count(
        all_checklists_html[2].text, 'Member Ski Descents')

    return peak_stats

Code

# scrape every peak's own page and append each returned value to the
# column of the same name in `basics`
for link in peak_links:
    for field, value in get_peak_stats(link).items():
        basics[field].append(value)

# every column now has one entry per peak, so the dict converts directly
df_basics = pd.DataFrame(basics)

df_basics.head()

	peak	links	routes	elevation	rank	mountain_range	nat_park	forest	lat_long	county	towns	member_ascents	winter_ascents	ski_descents
0	Mount Elbert	https://www.14ers.com/peaks/10001/mount-elbert	4	14438	1 of 53	Sawatch	None	[San Isabel]	[39.118075, -106.445417]	[Lake]	[Leadville, Twin Lakes, Aspen]	15906	601	278
1	Mount Massive	https://www.14ers.com/peaks/10002/mount-massive	3	14427	2 of 53	Sawatch	None	[San Isabel]	[39.187298, -106.475548]	[Lake]	[Leadville, Aspen]	11221	225	101
2	Mount Harvard	https://www.14ers.com/peaks/10003/mount-harvard	3	14424	3 of 53	Sawatch	None	[San Isabel]	[38.924328, -106.320618]	[Chaffee]	[Granite, Buena Vista, Leadville]	8717	101	61
3	Blanca Peak	https://www.14ers.com/peaks/10004/blanca-peak	3	14350	4 of 53	Sangre de Cristo	None	[Rio Grande, San Isabel]	[37.577473, -105.485443]	[Alamosa, Huerfano, Costilla]	[Fort Garland, Blanca, Alamosa]	5727	102	49
4	La Plata Peak	https://www.14ers.com/peaks/10005/la-plata-peak	4	14344	5 of 53	Sawatch	None	[San Isabel]	[39.029251, -106.473145]	[Chaffee]	[Twin Lakes, Leadville, Buena Vista, Aspen]	10825	378	150

Note that the forest, lat_long, county, and towns attributes are list variables. The number of entries within the lists could be an attribute of interest themselves, but would likely be most useful if unpacked.

Code

# Note that most peaks have multiple routes (in *routes* attribute), this links to information about the "standard" routes
url = 'https://www.14ers.com/routes_bydifficulty.php'
# a timeout keeps a stalled connection from hanging the script indefinitely
page = requests.get(url, timeout=30)
# fail loudly on an HTTP error instead of silently parsing an error page
page.raise_for_status()
soup = BeautifulSoup(page.text, 'lxml')

# return the html for all rows in the standard route tables
row_html = soup.find_all('tr')

fields = {'peak': [], 'class': [], 'altitude': [], 'distance': []}
# class label of the most recent single-cell row; None until one has been
# seen, so a route row appearing first cannot raise a NameError
current_class = None
for row in row_html:
    search = row.find_all('td')
    if len(search) == 1 and search[0].text:
        # a row with one non-empty cell carries the class label that applies
        # to the route rows following it
        current_class = search[0].text
    elif len(search) > 1:
        # the peak name is whatever sits between the opening <td> and the
        # first <div in the second cell's raw HTML
        fields['peak'].append(re.search(r'(<td>)(.*)(<div)', str(search[1])).groups()[1])
        fields['class'].append(current_class)
        # fourth cell: altitude, with the trailing ' (feet mark) removed
        fields['altitude'].append(int(search[3].text.replace("'", '')))
        # fifth cell: distance, with the ' mi' suffix removed
        fields['distance'].append(float(search[4].text.replace(' mi', '')))

# turn standard route information into a dataframe
df_difficulty = pd.DataFrame(fields)


def mt_to_mount(peak):
    """Turn 'Mt.' into 'Mount' for consistency in our two dataframes."""
    return peak.replace('Mt.', 'Mount')


df_difficulty['peak'] = df_difficulty['peak'].apply(mt_to_mount)

df_difficulty.head()

	peak	class	altitude	distance
0	Handies Peak	Class 1	2500	5.75
1	Grays Peak	Class 1	3000	7.50
2	Torreys Peak	Class 1	3000	7.75
3	Quandary Peak	Class 1	3450	6.75
4	Mount Elbert	Class 1	4500	9.75