High-Performance Web Scraper

In May 2024, I created a web scraper in Python with BeautifulSoup4 to extract all of the thesis projects listed on Wesleyan's Digital Collections website (https://digitalcollections.wesleyan.edu/). A key requirement of this project was to use Wesleyan's High Performance Computing Cluster to parallelize and optimize my code, which I did through a mixture of careful coding practices and supporting libraries, chiefly Pandas and Joblib. To further optimize the process and to test the efficacy of various optimization techniques, jobs were batched to the cluster using Slurm.
The libraries I tested or learned about were (a sketch of one such test follows the list):

  • NumPy
  • Numba
  • PyPy
  • Numba for CUDA
  • Cython
  • Swifter
  • Dask
  • NSQ
  • functools
  • gevent
  • Tornado
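
A sketch of what testing one of these might have looked like, using functools to memoize repeated page fetches. This is an illustrative assumption about the shape of such a test, not code from the project itself:

# Hypothetical functools test: cache page fetches so repeated
# requests for the same URL hit the network only once.
from functools import lru_cache
import requests

@lru_cache(maxsize=None)
def fetch(url):
    """Return page HTML, fetching each unique URL at most once."""
    return requests.get(url).text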

I achieved a nearly 100x speedup between the original and optimized versions of my web scraper.
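
For context, here is a minimal sketch of how runs might be batched to the cluster with Slurm. The sbatch flags are standard, but the job names, resource values, and script path are illustrative assumptions rather than the exact settings I used:

# Hypothetical Slurm submission wrapper: one batch job per worker count,
# so the payoff of parallelization can be compared across runs.
import subprocess

def submit_job(n_cpus, script="scraper.py"):
    """Submit one scraper run to Slurm via sbatch (assumed script name)."""
    subprocess.run(
        ["sbatch",
         f"--job-name=scrape-{n_cpus}",
         f"--cpus-per-task={n_cpus}", # cores available to joblib's workers
         "--time=00:30:00",           # assumed walltime limit
         "--wrap", f"python3 {script}"],
        check=True)

for n in (1, 2, 4, 8):
    submit_job(n)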

ORIGINAL

#!/usr/bin/python3
# VAHN KESSLER

# Modules
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
from timeit import default_timer as time

# Functions
def gatherData(soup):
    """
    Gather all the data from the website and return lists of the desired components.

    Parameters
    ----------
    soup : BeautifulSoup object
        Parsed webpage.

    Returns
    -------
    title : BeautifulSoup list
        Title of every thesis.
    date : BeautifulSoup list
        Date of every thesis.
    author : string list
        Author of every thesis.
    advisor : string list
        Advisor(s) of every thesis.
    link : string list
        Link for every thesis (taken from title).
    length : int
        Length of the title list. Because every thesis has a title, this is an accurate count of the number of theses on each page.

    """
    # Patterns
    author_pattern = re.compile(r'Author: (.*)|Creator: (.*)')
    author2_pattern = re.compile(r'Author: (.*)')
    
    # Components
    title = soup.findAll("div", attrs={"class":"node__title"})
    length = len(title)
    date = soup.findAll("div", attrs={"class":"field__item","property":"dcterms:issued"})
    author = [
        author_pattern.search(div.text).group(1) # None for "Creator" entries; corrected later in assembleDatabase
        for div in soup.findAll("div", attrs={"class": "field field--name-field-linked-agent field--type-typed-relation field--label-hidden field__items"})]
    advisor = [
        author2_pattern.sub('',div.text).replace('\n','')
        for div in soup.findAll("div", attrs={"class": "field field--name-field-linked-agent field--type-typed-relation field--label-hidden field__items"})]
    # Debug output: print each advisor as it is parsed
    for i in range(len(advisor)):
        print(i, advisor[i])
    link = [
        "https://digitalcollections.wesleyan.edu/"+link['href']
        for div in soup.findAll("div", attrs={"class":"node__title"}) for link in div.find_all('a')]
    
    return title, date, author, advisor, link, length

def assembleDatabase(title,date,author,advisor,link,length,dictionary):
    """
    Assemble the dictionary

    Parameters
    ----------
    title : BeautifulSoup list
        Title of every thesis.
    date : BeautifulSoup list
        Date for every thesis.
    author : string list
        Author for every thesis.
    advisor : string list
        Advisor(s) for every thesis.
    link : string list
        Link for every thesis (taken from title).
    length : int
        Number of theses on the page.
    dictionary : dict
        Dictionary to be updated with data.

    Returns
    -------
    dictionary : dict
        Dictionary updated with data.

    """
    
    # Variables
    creator_pattern = re.compile(r'Creator: (.*)')
    key = 0
    t_number = 0
    d_number = 0
    au_number = 0
    ad_number = 0
    l_number = 0
    
    while key < length: # One pass per thesis on the page
        # Specify relevant data
        t = title[t_number].text
        au = author[au_number]
        ad = advisor[ad_number]
        l = link[l_number]
        
        # Add "N/A" for the "Creator" case; these entries have no date listed
        creator_match = creator_pattern.search(ad)
        if creator_match:
            d = "N/A"
            au = creator_match.group(1)
            ad = "ttttttttttttttttN/A" # 16 padding characters so the slice below leaves "N/A"
        else:
            d = date[d_number].text
        
        # Update dictionary
        dictionary[key] = {'Title':t,"Author":au,"Advisor(s)":ad[16:],"Date":d,"Link":l} # ad[16:] strips the leading "Thesis advisor: "
        
        # Iterate through the component lists
        key += 1
        t_number += 1
        au_number += 1
        ad_number += 1
        l_number += 1
        if not creator_match: # Don't advance the date index on a "Creator" entry, or it falls out of sync
            d_number += 1
            
    return dictionary

if __name__=='__main__':
    
    # Setup
    # page_to_scrape = requests.get("https://digitalcollections.wesleyan.edu/islandora/object/wesleyanct-etd_hon_theses?f%5B0%5D=discipline%3AChemistry&items_per_page=50&page=%2C1")
    page_to_scrape = requests.get("https://digitalcollections.wesleyan.edu/islandora/object/wesleyanct-etd_hon_theses?items_per_page=50&f%5B0%5D=discipline%3AEnglish&page=%2C1")
    soup = BeautifulSoup(page_to_scrape.text, "html.parser")
    
    start = time()
    # Gather data
    (title, date, author, advisor, link, length) = gatherData(soup)
    
    # Create dictionary
    thesis_dictionary = dict()
    
    # Assemble dictionary
    thesis_dictionary = assembleDatabase(title, date, author, advisor, link, length, thesis_dictionary)
    
    print(time()-start)
    

OPTIMIZED (100X SPEEDUP)

#!/usr/bin/python3
# VAHN KESSLER
# Script optimized with list comprehensions, pandas DataFrames, and joblib parallelization

# Modules
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
from urllib.parse import unquote
from joblib import Parallel, delayed
from timeit import default_timer as time

def gatherData(soup):
    """
    Gather all the data from the website and return lists of the desired components.
    
    Parameters
    ----------
    soup : BeautifulSoup object
        Parsed webpage.
    
    Returns
    -------
    titles : string list
        Title of every thesis.
    dates : string list
        Date of every thesis.
    authors : string list
        Author of every thesis.
    advisors : string list
        Advisor(s) of every thesis.
    links : string list
        Link for every thesis (taken from title).
    """
    # Patterns
    author_pattern = re.compile(r'Author: (.*)|Creator: (.*)')
    author2_pattern = re.compile(r'Author: (.*)')
    
    # Components
    titles = [
        div.text
        for div in soup.findAll("div", attrs={"class":"node__title"})
    ]
    dates = [
        div.text
        for div in soup.findAll("div", attrs={"class":"field__item","property":"dcterms:issued"})
    ]
    authors = [
        author_pattern.search(div.text).group(1)
        for div in soup.findAll("div", attrs={"class": "field field--name-field-linked-agent field--type-typed-relation field--label-hidden field__items"})
    ]
    advisors = [
        author2_pattern.sub('',div.text).replace('\n','')[16:] if "Creator: " not in div.text else div.text.replace('\n','') # [16:] strips the leading "Thesis advisor: "
        for div in soup.findAll("div", attrs={"class": "field field--name-field-linked-agent field--type-typed-relation field--label-hidden field__items"})
    ]
    links = [
        "https://digitalcollections.wesleyan.edu/"+link['href']
        for div in soup.findAll("div", attrs={"class":"node__title"})
        for link in div.find_all('a')
    ]

    return titles, dates, authors, advisors, links

def fixLists(titles, dates, authors, advisors, links, soup):
    """
    Function to take care of exceptional cases

    Parameters
    ----------
    titles : string list
        Title of every thesis.
    dates : string list
        Date of every thesis.
    authors : string list
        Author of every thesis.
    advisors : string list
        Advisor(s) of every thesis.
    links : string list
        Link of every thesis.
    soup : BeautifulSoup object
        Parsed webpage.

    Returns
    -------
    dates : string list
        Date of every thesis.
    authors : string list
        Author of every thesis.
    advisors : string list
        Advisor(s) of every thesis.

    """

    # Variables
    length = len(titles)
    collection_list = []
    collection_attributes = [] # Stays empty when the page has no "items in Collection" rows
    i = 0
    advisor_pattern = re.compile(r', \d{4}-') # Matches trailing birth years such as ", 1960-"
    
    # Logic for "items in Collection". Gets the titles so the loop knows where to insert data.
    collection = soup.findAll("div", attrs={"class":"view__collection-count views-row"})
    if len(collection) != 0:
        collection_parents = [
            c.parent.parent.parent.parent.parent for c in collection
        ]
        collection_titles = [
            cp.findAll("div", attrs={"class":"node__title"}) for cp in collection_parents
        ]
        for c in collection_titles:
            # Flatten the nested lists of title divs
            for a in c:
                collection_list.append(a.text)
                
        # Case for when "items in Collection" but thesis advisor and author are still listed
        collection_attributes = [
            cp.findAll("div", attrs={"class": "field field--name-field-linked-agent field--type-typed-relation field--label-hidden field__items"}) for cp in collection_parents
        ]
                
    # Parse through lists to make sure everything lines up
    while i < length:
        # Add "N/A" for "Creator" case
        if "Creator: " in advisors[i]:
            if len(dates) != length:
                dates.insert(i, "N/A")
            authors[i] = advisors[i][9:]
            advisors[i] = "N/A"

        # Account for multiple thesis advisors
        if "Thesis advisor: " in advisors[i]:
            advisors[i] = advisors[i].replace("Thesis advisor: ", " and ")

        # Strip trailing birth years (e.g., ", 1960-") from advisor names
        if advisor_pattern.search(advisors[i]):
            advisors[i] = advisor_pattern.sub('', advisors[i])

        # Account for "items in Collection" case
        for c in collection_list:
            if c == titles[i]:
                if collection_attributes[0] == []:
                    author, advisor = scrapeSubpage(links[i])
                    authors.insert(i, author)
                    advisors.insert(i, advisor)
        
        i += 1
            
    return dates, authors, advisors

def scrapeSubpage(page):
    """
    Function to scrape a page and identify the author and advisor in the event of an "items in Collection" case

    Parameters
    ----------
    page : string
        Page to scrape.

    Returns
    -------
    author[0] : string
        Author of the thesis.
    advisor[0] : string
        Advisor(s) of the thesis.

    """
    # Set up soup
    page_to_scrape = requests.get(page)
    print(page) # Progress logging: show which subpage is being scraped
    soup = BeautifulSoup(page_to_scrape.text, "html.parser")
    
    # Patterns
    author_pattern = re.compile(r'Author: (.*?), Thesis')
    advisor_pattern = re.compile(r'Thesis advisor: (.*?), Degree')
    
    # Components
    author = [
        author_pattern.search(span.text).group(1)
        for span in soup.findAll("span", attrs={"class": "views-field views-field-field-linked-agent"})
    ]
    advisor = [
        advisor_pattern.search(span.text).group(1)
        for span in soup.findAll("span", attrs={"class": "views-field views-field-field-linked-agent"})
    ]   
    
    return author[0], advisor[0]

def createDatabase(page):
    """
    Function to create the database

    Parameters
    ----------
    page : string
        Page to be scraped.

    Returns
    -------
    df : DataFrame
        DataFrame of page.

    """
    # Set up soup
    page_to_scrape = requests.get(page)
    soup = BeautifulSoup(page_to_scrape.text, "html.parser")
    
    # Gather data
    (titles, dates, authors, advisors, links) = gatherData(soup)
    
    # Fix lists
    (dates, authors, advisors) = fixLists(titles, dates, authors, advisors, links, soup)
    
    # Add discipline to DataFrame. Needs to be separate from everything else because there's only 1 discipline per page.
    discipline_pattern = re.compile(r'discipline%3A(.*?)($|&)') 
    discipline = unquote(discipline_pattern.search(page).group(1))
    disciplines = [discipline] * len(titles)

    # Create DataFrame
    df = pd.DataFrame({
        'Discipline': disciplines,
        'Title': titles,
        'Date': dates,
        'Author': authors,
        'Advisor(s)': advisors,
        'Link': links
    }) 
    
    # Return the per-page DataFrame; concatenation happens in updateDatabase
    return df

def updateDatabase(pagelist):
    """
    Function to create the final database from the parallelized searches

    Parameters
    ----------
    pagelist : string list
        List of webpages.

    Returns
    -------
    df : DataFrame
        Final DataFrame.

    """
    # Using joblib to parallelize the creation of DataFrames for each page
    results = Parallel(n_jobs=4, verbose=10)(delayed(createDatabase)(page) for page in pagelist)
    
    # Concatenate all DataFrames into one
    df = pd.concat(results, ignore_index=True)

    return df

if __name__=='__main__':
    start = time()
    
    # Assemble webpage list from pages.txt
    with open('pages.txt', 'r') as file:
        pagelist = [line.strip() for line in file if line.strip()]

    # Build the final DataFrame
    theses = updateDatabase(pagelist)

    # Export to CSV
    theses.to_csv('theses.csv')

    print(time()-start)
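
For reference, pages.txt is assumed to hold one collection URL per line, each carrying the discipline filter that createDatabase parses out. An illustrative two-line example, using the same URL format as the scripts above:

https://digitalcollections.wesleyan.edu/islandora/object/wesleyanct-etd_hon_theses?items_per_page=50&f%5B0%5D=discipline%3AEnglish&page=%2C1
https://digitalcollections.wesleyan.edu/islandora/object/wesleyanct-etd_hon_theses?items_per_page=50&f%5B0%5D=discipline%3AChemistry&page=%2C1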