WebScraping Code

Outline

Importing Libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
        

Life Expectancy Data Collection and Preparation

Scraping Life Expectancy Data.

# URL of the webpage
url = <url>

# Send a request to the webpage; the timeout stops the script from hanging
# indefinitely on an unresponsive server
response = requests.get(url, timeout=30)

# Lists that collect the scraped data, one entry per matching div. They are
# created before the status check so the DataFrame below always exists.
countries = []
total_life_expectancy = []
males = []
females = []


def _text_after(tag):
    """Return the stripped text node that directly follows ``tag``.

    Returns None when ``tag`` is None, when it has no following sibling,
    when that sibling is another element rather than text, or when the
    text is blank.
    """
    sibling = tag.next_sibling if tag is not None else None
    # Text nodes are str instances; an element sibling has no usable .strip()
    if not isinstance(sibling, str):
        return None
    return sibling.strip() or None


# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the divs with the class 'The Class'
    data_divs = soup.find_all('div', class_='The Class')

    # Iterate through each div
    for div in data_divs:
        # Get the country name from the <a> tag (None if there is no link)
        country_tag = div.find('a')
        countries.append(
            country_tag.text.strip() if country_tag is not None else None
        )

        # The text after the 1st, 2nd and 3rd <strong> tags holds the total,
        # male and female figures respectively; pad with None when a div has
        # fewer than three <strong> tags so every list stays the same length
        figures = [_text_after(tag) for tag in div.find_all('strong')[:3]]
        figures += [None] * (3 - len(figures))
        total_value, male_value, female_value = figures

        total_life_expectancy.append(total_value)
        males.append(male_value)
        females.append(female_value)
else:
    print(f"Failed to retrieve data: {response.status_code}")

# Create a DataFrame (empty, but with the expected columns, if the request
# failed). The total column is named 'Total life_expectancy' because that is
# the name the cleaning steps look up.
df = pd.DataFrame({
    'Country': countries,
    'Total life_expectancy': total_life_expectancy,
    'Males': males,
    'Females': females
})
            

df

Country Total Life Expectancy Males Females
None None None None
Afghanistan 54.4 years (2024 est.) 52.8 years 56.1 years
Albania 79.9 years (2024 est.) 77.3 years 82.8 years
Algeria 77.9 years (2024 est.) 77.2 years 78.7 years
American Samoa 75.8 years (2024 est.) 73.4 years 78.5 years
... ... ... ...
None None None None
None None None None
None None None None
None None None None
Instagram None None None

237 rows × 4 columns


Cleaning the Data


Removing all rows where 'Country' or 'Total life_expectancy' has a value of 'None'

# Drop rows that lack a country or a total value. .copy() makes df_cleaned an
# independent frame, so the column assignments in the cleaning steps cannot
# trigger pandas' SettingWithCopyWarning.
df_cleaned = df.dropna(subset=['Country', 'Total life_expectancy']).copy()
df_cleaned.head()
            
Country Total Life Expectancy Males Females
Afghanistan 54.4 years (2024 est.) 52.8 years 56.1 years
Albania 79.9 years (2024 est.) 77.3 years 82.8 years
Algeria 77.9 years (2024 est.) 77.2 years 78.7 years
American Samoa 75.8 years (2024 est.) 73.4 years 78.5 years
Andorra 83.8 years (2024 est.) 81.6 years 86.2 years

We only need the numerical values for the Total life_expectancy, Males and Females columns.


For the Total life_expectancy we will get rid of everything that follows the first whitespace.

# Function to clean total life_expectancy data
def clean_total_life_expectancy(value):
    """Return the first whitespace-delimited token of ``value``.

    For example ``"54.4 years (2024 est.)"`` becomes ``"54.4"``.

    Args:
        value: The raw cell value. Non-string values are returned unchanged.

    Returns:
        The leading token as a string, ``None`` for an empty or
        whitespace-only string (there is no token to keep), or ``value``
        itself when it is not a string.
    """
    if not isinstance(value, str):
        return value
    # maxsplit=1: only the first token is needed
    tokens = value.split(maxsplit=1)
    return tokens[0] if tokens else None

# Reduce every 'Total life_expectancy' entry to its leading token
df_cleaned['Total life_expectancy'] = (
    df_cleaned['Total life_expectancy'].map(clean_total_life_expectancy)
)

df_cleaned.head()
            
Country Total Life Expectancy Males Females
Afghanistan 54.4 52.8 years 56.1 years
Albania 79.9 77.3 years 82.8 years
Algeria 77.9 77.2 years 78.7 years
American Samoa 75.8 73.4 years 78.5 years
Andorra 83.8 81.6 years 86.2 years

For the Males and Females columns we will strip the trailing non-numeric text (such as " years"), keeping only the number.

# Function to clean male and female data
def clean_life_expectancy(value):
    """Strip any trailing run of non-digit characters from ``value``.

    For example ``"52.8 years"`` becomes ``"52.8"``. Values that are not
    strings are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    # Matches every character after the last ASCII digit, up to the end
    trailing_non_digits = r'[^0-9]*$'
    return re.sub(trailing_non_digits, '', value)
        
# Strip the trailing unit text from both sex-specific columns
for column in ('Males', 'Females'):
    df_cleaned[column] = df_cleaned[column].apply(clean_life_expectancy)

df_cleaned.head()
            
Country Total Life Expectancy Males Females
Afghanistan 54.4 52.8 56.1
Albania 79.9 77.3 82.8
Algeria 77.9 77.2 78.7
American Samoa 75.8 73.4 78.5
Andorra 83.8 81.6 86.2

Finally we can save the data as a 'CSV' file

# Save the cleaned DataFrame to a CSV file in the working directory;
# index=False leaves the row index out of the file
df_cleaned.to_csv('cleaned_life_expectancy_data.csv', index=False)
            

National Dishes Data Collection and Preparation

Scraping National Dishes Data.

# NOTE(review): this self-assignment looks like a redacted placeholder —
# set `url` to the national-dishes page before running this cell
url = url
# The timeout stops the request from hanging on an unresponsive server
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as data
response.raise_for_status()
soup = BeautifulSoup(response.text, "lxml")
            

# Collect every <h3> element on the page and preview the first five
country_dishes = soup.find_all('h3')
country_dishes[:5]
            
            [<h3>Note</h3>,
            <h3>Afghanistan (Kabuli Pulao)</h3>,
            <h3>Algeria (Couscous)</h3>,
            <h3>Australia (Meat Pie)</h3>,
            <h3>Austria (Wiener Schnitzel)</h3>]
        

Extracting only the Text from the HTML H3 tags

# Keep only the text content of each heading, then preview the first five
country_n_dish = [heading.text for heading in country_dishes]
country_n_dish[:5]
            
            ['Note',
            'Afghanistan (Kabuli Pulao)',
            'Algeria (Couscous)',
            'Australia (Meat Pie)',
            'Austria (Wiener Schnitzel)']
        

Saving results to Dataframe

# One-column frame holding the raw heading strings
df = pd.DataFrame(country_n_dish, columns=['country + dish'])
df.head()
            
country + dish
0 Note
1 Afghanistan (Kabuli Pulao)
2 Algeria (Couscous)
3 Australia (Meat Pie)
4 Austria (Wiener Schnitzel)

df.tail()
            
country + dish
113 Yemen (Saltah)
114 Get Updates Right to Your Inbox
115 Further Insights
116 Search
117 Latest Articles

Removing all rows that do not contain the country with its national dish, by removing anything that does not contain a bracket "("

# Keep only headings that contain " (". The bracket is escaped because
# str.contains treats its pattern as a regular expression. .copy() makes
# the filtered result an independent frame, so adding columns to it later
# cannot trigger pandas' SettingWithCopyWarning.
df = df[df["country + dish"].str.contains(r" \(")].copy()
df.head()
            
country + dish
1 Afghanistan (Kabuli Pulao)
2 Algeria (Couscous)
3 Australia (Meat Pie)
4 Austria (Wiener Schnitzel)
5 Azerbaijan (Plov)

df.tail()
            
country + dish
109 Senegal (Thieboudienne)
110 Slovakia (Bryndzové halušky)
111 Uganda (Matoke)
112 Venezuela (Pabellón criollo)
113 Yemen (Saltah)

Separating the country and Dish into their own columns

# Split once (n=1) on the " (" that separates the country from the dish.
# A multi-character pattern is treated as a regular expression, so the
# bracket is escaped. Column 0 is the country; column 1 is the dish, still
# carrying its closing bracket.
values = df["country + dish"].str.split(r" \(", n=1, expand=True)
df["country"] = values[0]
df["dish"] = values[1]
        
df["country"].head()
            
            1    Afghanistan
            2        Algeria
            3      Australia
            4        Austria
            5     Azerbaijan
            Name: country, dtype: object
        

df["dish"].head()
            
            1        Kabuli Pulao)
            2            Couscous)
            3            Meat Pie)
            4    Wiener Schnitzel)
            5                Plov)
            Name: dish, dtype: object
        

Removing the brackets from the dish column using a lambda expression

# Remove the closing bracket (and any backslash) left in each dish name
df['dish'] = df['dish'].map(lambda dish: re.sub(r'[\\)]', '', dish))
df["dish"].head()
            
            1        Kabuli Pulao
            2            Couscous
            3            Meat Pie
            4    Wiener Schnitzel
            5                Plov
            Name: dish, dtype: object
        

df.head()
            
country + dish country dish
1 Afghanistan (Kabuli Pulao) Afghanistan Kabuli Pulao
2 Algeria (Couscous) Algeria Couscous
3 Australia (Meat Pie) Australia Meat Pie
4 Austria (Wiener Schnitzel) Austria Wiener Schnitzel
5 Azerbaijan (Plov) Azerbaijan Plov

The "country + dish" column serves no meaningful purpose so it can be dropped

# Remove the combined column in place now that it has been split
df.drop(columns=['country + dish'], inplace=True)
df.head()
            
country dish
1 Afghanistan Kabuli Pulao
2 Algeria Couscous
3 Australia Meat Pie
4 Austria Wiener Schnitzel
5 Azerbaijan Plov

Saving our results to a CSV file

# Write the country/dish table to a CSV file in the working directory;
# index=False leaves the row index out of the file
df.to_csv("national_dishes.csv", index=False)