Importing Libraries
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
Scraping Life Expectancy Data.
# Scrape per-country life-expectancy figures into a DataFrame.
# NOTE(review): the real URL was redacted from this export — restore it before running.
url = "<url>"

# Send a request to the webpage and only proceed on HTTP 200.
response = requests.get(url)
if response.status_code == 200:
    # Parse the HTML content.
    soup = BeautifulSoup(response.text, 'html.parser')

    # Each country's entry lives in a div with this class.
    # NOTE(review): 'The Class' is a placeholder left by the export — restore the real class name.
    data_divs = soup.find_all('div', class_='The Class')

    # Parallel lists collected per div; None marks a missing value so the
    # lists stay the same length and line up row-by-row in the DataFrame.
    countries = []
    total_life_expectancy = []
    males = []
    females = []

    for div in data_divs:
        # Country name comes from the first <a> tag.
        country_tag = div.find('a')
        countries.append(country_tag.text.strip() if country_tag else None)

        # The text node right after each <strong> tag holds the figure:
        # strong[0] -> total, strong[1] -> males, strong[2] -> females.
        strong_tags = div.find_all('strong')

        if strong_tags and strong_tags[0].next_sibling:
            total_life_expectancy.append(strong_tags[0].next_sibling.strip())
        else:
            total_life_expectancy.append(None)

        if len(strong_tags) > 1 and strong_tags[1].next_sibling:
            males.append(strong_tags[1].next_sibling.strip())
        else:
            males.append(None)

        if len(strong_tags) > 2 and strong_tags[2].next_sibling:
            females.append(strong_tags[2].next_sibling.strip())
        else:
            females.append(None)

    # Assemble the parallel lists into one DataFrame.
    df = pd.DataFrame({
        'Country': countries,
        'Total life expectancy': total_life_expectancy,
        'Males': males,
        'Females': females
    })
else:
    print(f"Failed to retrieve data: {response.status_code}")
# Display the scraped DataFrame.
df
Country | Total Life Expectancy | Males | Females |
---|---|---|---|
None | None | None | None |
Afghanistan | 54.4 years (2024 est.) | 52.8 years | 56.1 years |
Albania | 79.9 years (2024 est.) | 77.3 years | 82.8 years |
Algeria | 77.9 years (2024 est.) | 77.2 years | 78.7 years |
American Samoa | 75.8 years (2024 est.) | 73.4 years | 78.5 years |
... | ... | ... | ... |
None | None | None | None |
None | None | None | None |
None | None | None | None |
None | None | None | None |
None | None | None | None |
237 rows × 4 columns
Cleaning the Data
Removing all rows where 'Country' or 'Total life expectancy' has a value of 'None'
# Drop rows where 'Country' or 'Total life expectancy' is missing.
# Fix: the column is named 'Total life expectancy' (with spaces) in df,
# not 'Total life_expectancy' — the old key raises a KeyError in dropna.
# .copy() makes df_cleaned an independent frame so the later column
# assignments don't trigger a SettingWithCopyWarning.
df_cleaned = df.dropna(subset=['Country', 'Total life expectancy']).copy()
df_cleaned.head()
Country | Total Life Expectancy | Males | Females |
---|---|---|---|
Afghanistan | 54.4 years (2024 est.) | 52.8 years | 56.1 years |
Albania | 79.9 years (2024 est.) | 77.3 years | 82.8 years |
Algeria | 77.9 years (2024 est.) | 77.2 years | 78.7 years |
American Samoa | 75.8 years (2024 est.) | 73.4 years | 78.5 years |
Andorra | 83.8 years (2024 est.) | 81.6 years | 86.2 years |
We only need the numerical values for the 'Total life expectancy', 'Males' and 'Females' columns.
For the 'Total life expectancy' column we will get rid of everything that follows the first whitespace.
# Function to clean total life-expectancy strings, e.g.
# "54.4 years (2024 est.)" -> "54.4".
def clean_total_life_expectancy(value):
    """Return the token before the first whitespace; pass non-strings through."""
    if isinstance(value, str):
        parts = value.split()
        # Guard against empty/whitespace-only strings (split()[0] would raise).
        return parts[0] if parts else value
    return value


# Fix: the DataFrame column is 'Total life expectancy' (with spaces);
# the old 'Total life_expectancy' key raises a KeyError.
df_cleaned['Total life expectancy'] = df_cleaned['Total life expectancy'].apply(clean_total_life_expectancy)
df_cleaned.head()
Country | Total Life Expectancy | Males | Females |
---|---|---|---|
Afghanistan | 54.4 | 52.8 years | 56.1 years |
Albania | 79.9 | 77.3 years | 82.8 years |
Algeria | 77.9 | 77.2 years | 78.7 years |
American Samoa | 75.8 | 73.4 years | 78.5 years |
Andorra | 83.8 | 81.6 years | 86.2 years |
For the 'Males' and 'Females' columns we will keep the leading numeric value by stripping the trailing non-digit text (e.g. " years").
# Function to clean the Males/Females strings, e.g. "52.8 years" -> "52.8".
def clean_life_expectancy(value):
    """Strip the trailing run of non-digit characters; pass non-strings through."""
    if isinstance(value, str):
        # Removes everything after the last digit (e.g. " years").
        return re.sub(r'[^0-9]*$', '', value)
    return value


# Applying the function to both columns.
df_cleaned['Males'] = df_cleaned['Males'].apply(clean_life_expectancy)
df_cleaned['Females'] = df_cleaned['Females'].apply(clean_life_expectancy)
df_cleaned.head()
Country | Total Life Expectancy | Males | Females |
---|---|---|---|
Afghanistan | 54.4 | 52.8 | 56.1 |
Albania | 79.9 | 77.3 | 82.8 |
Algeria | 77.9 | 77.2 | 78.7 |
American Samoa | 75.8 | 73.4 | 78.5 |
Andorra | 83.8 | 81.6 | 86.2 |
Finally we can save the data as a 'CSV' file
# Save the cleaned DataFrame to a CSV file (index=False drops the row index).
df_cleaned.to_csv('cleaned_life_expectancy_data.csv', index=False)
Scraping National Dishes Data.
# Scrape the national-dishes page.
# Fix: `url = url` was a redaction artifact and a no-op — restore the real
# page URL before running.
url = "<url>"
response = requests.get(url)
# NOTE(review): "lxml" is a third-party parser and must be installed;
# the stdlib-backed 'html.parser' works as a drop-in alternative.
soup = BeautifulSoup(response.text, "lxml")
# Every "Country (Dish)" heading on the page is an <h3> element.
country_dishes = soup.find_all('h3')
country_dishes[:5]
[<h3>Note</h3>, <h3>Afghanistan (Kabuli Pulao)</h3>, <h3>Algeria (Couscous)</h3>, <h3>Australia (Meat Pie)</h3>, <h3>Austria (Wiener Schnitzel)</h3>]
Extracting only the Text from the HTML H3 tags
# Extract the plain text from each <h3> tag.
country_n_dish = [country_dish.text for country_dish in country_dishes]
country_n_dish[:5]
['Note', 'Afghanistan (Kabuli Pulao)', 'Algeria (Couscous)', 'Australia (Meat Pie)', 'Austria (Wiener Schnitzel)']
Saving results to Dataframe
# Store the raw heading strings in a single-column DataFrame.
df = pd.DataFrame({'country + dish': country_n_dish})
df.head()
country + dish | |
---|---|
0 | Note |
1 | Afghanistan (Kabuli Pulao) |
2 | Algeria (Couscous) |
3 | Australia (Meat Pie) |
4 | Austria (Wiener Schnitzel) |
# Inspect the last rows — page-footer headings get scraped too.
df.tail()
country + dish | |
---|---|
113 | Yemen (Saltah) |
114 | Get Updates Right to Your Inbox |
115 | Further Insights |
116 | Search |
117 | Latest Articles |
Removing all rows that do not contain a country with its national dish, by keeping only the rows whose text contains an opening bracket "("
# Keep only rows whose text contains " (" — i.e. "Country (Dish)" entries;
# footer headings like "Search" are dropped.
# Fix: the original line had an unterminated string literal. regex=False
# treats the pattern as a plain substring, so '(' needs no escaping.
df = df[df["country + dish"].str.contains(" (", regex=False)]
df.head()
country + dish | |
---|---|
1 | Afghanistan (Kabuli Pulao) |
2 | Algeria (Couscous) |
3 | Australia (Meat Pie) |
4 | Austria (Wiener Schnitzel) |
5 | Azerbaijan (Plov) |
# Confirm the footer rows are gone.
df.tail()
country + dish | |
---|---|
109 | Senegal (Thieboudienne) |
110 | Slovakia (Bryndzové halušky) |
111 | Uganda (Matoke) |
112 | Venezuela (Pabellón criollo) |
113 | Yemen (Saltah) |
Separating the country and Dish into their own columns
# Split "Country (Dish)" into two columns on the first " (".
# Fix: the original split pattern was an unterminated string literal.
# regex=False makes pandas split on the literal substring " (".
values = df["country + dish"].str.split(" (", n=1, expand=True, regex=False)
df["country"] = values[0]
df["dish"] = values[1]
df["country"].head()
1 Afghanistan 2 Algeria 3 Australia 4 Austria 5 Azerbaijan Name: country, dtype: object
# The dish values still carry a trailing ')'.
df["dish"].head()
1 Kabuli Pulao) 2 Couscous) 3 Meat Pie) 4 Wiener Schnitzel) 5 Plov) Name: dish, dtype: object
Removing the brackets from the dish column using a lambda expression
# Remove the closing bracket from the dish names using a lambda expression.
# Simplified: the regex character class [\\)] (backslash-or-')') was
# overbroad — a plain string replace of ')' does the job, since the dish
# names contain no backslashes.
df['dish'] = df['dish'].apply(lambda x: x.replace(')', ''))
df["dish"].head()
1 Kabuli Pulao 2 Couscous 3 Meat Pie 4 Wiener Schnitzel 5 Plov Name: dish, dtype: object
# Review the DataFrame with the new 'country' and 'dish' columns.
df.head()
country + dish | country | dish | |
---|---|---|---|
1 | Afghanistan (Kabuli Pulao) | Afghanistan | Kabuli Pulao |
2 | Algeria (Couscous) | Algeria | Couscous |
3 | Australia (Meat Pie) | Australia | Meat Pie |
4 | Austria (Wiener Schnitzel) | Austria | Wiener Schnitzel |
5 | Azerbaijan (Plov) | Azerbaijan | Plov |
The "country + dish" column serves no meaningful purpose so it can be dropped
# Drop the redundant combined column; `columns=` is the explicit
# equivalent of axis=1.
df.drop(columns=['country + dish'], inplace=True)
df.head()
country | dish | |
---|---|---|
1 | Afghanistan | Kabuli Pulao |
2 | Algeria | Couscous |
3 | Australia | Meat Pie |
4 | Austria | Wiener Schnitzel |
5 | Azerbaijan | Plov |
Saving our results to a CSV file
# Save the final country/dish table without the row index.
df.to_csv("national_dishes.csv", index=False)