workshops

import pandas as pd
import numpy as np
import math
# Load the gapminder dataset straight from its raw GitHub URL
gapminder_url = (
    'https://raw.githubusercontent.com/csc-ubc-okanagan/workshops/'
    'a091bc6eae8b9045866c28dbd1848c7e072db5b1/data/gapminder.csv'
)
gapminder = pd.read_csv(gapminder_url)
# Keep a local copy next to the notebook; the row index is not written out
gapminder.to_csv('gapminder.csv', index=False)

1. If Statement

# Value that the conditions below are tested against
x = 5

# Three separate `if` statements: each condition is evaluated on its own,
# so every condition that is True prints its message
if x < 6:
    print("x is less than 6")
if x == 5:
    print("x is equal to 5")
if x != 2:
    print("x is some other number than 2")
x is less than 6
x is equal to 5
x is some other number than 2

2. If-Else Statement

# Exactly one branch runs: the `if` body when x equals 2,
# otherwise the `else` body
if x == 2:
    print("x is equal to 2")
else:
    print("x is not equal to 2")
x is not equal to 2

This if-else structure lets the program take one action when x is equal to 2 and a different action when it is not.

3. If-Elif-Else Statement

# Conditions are tested from top to bottom and only the first True branch runs;
# the `else` body runs when neither comparison holds
if x < 5:
    print("x is less than 5")
elif x > 5:
    print("x is greater than 5")
else:
    print("x is equal to 5")
x is equal to 5

This structure is an efficient way to handle multiple related conditions: they are checked in sequence until one of them is met, and the else branch runs only if none of them is.

# Take an independent copy instead of binding a second name to the same
# DataFrame, so in-place changes to gapminder_cond cannot alter gapminder
gapminder_cond = gapminder.copy()

Creating Categorical Data in Pandas


# Work on an independent copy of the gapminder data
gapminder_cond = gapminder.copy()

# Placeholder column: every row holds pd.NA until the loop below fills it in
gapminder_cond['income_level'] = pd.NA

# Ordered categorical dtype: low-income < middle-income < high-income
income_levels = pd.CategoricalDtype(
    categories=["low-income", "middle-income", "high-income"],
    ordered=True,
)

# Visit every index label, pick the label that matches the row's GDP per
# capita, then write it into 'income_level'
for i in gapminder_cond.index:
    gdp = gapminder_cond.loc[i, 'gdpPercap']
    if gdp <= 10000:
        level = 'low-income'
    elif gdp <= 75000:
        level = 'middle-income'
    else:
        level = 'high-income'
    gapminder_cond.loc[i, 'income_level'] = level

# Cast the filled column to the ordered categorical dtype
gapminder_cond['income_level'] = gapminder_cond['income_level'].astype(income_levels)

# Summarise the 'income_level' column
print(gapminder_cond['income_level'].describe())

count           1704
unique             3
top       low-income
freq            1312
Name: income_level, dtype: object
# Vectorised version of the income-level labelling: no explicit loop
gdp_per_cap = gapminder_cond['gdpPercap']
# Inner choice first: rows above 10000 are either middle- or high-income
middle_or_high = np.where(gdp_per_cap <= 75000, 'middle-income', 'high-income')
# Outer choice: rows at or below 10000 are low-income, the rest keep the inner label
income_labels = np.where(gdp_per_cap <= 10000, 'low-income', middle_or_high)
gapminder_cond['income_level'] = pd.Categorical(
    income_labels,
    categories=['low-income', 'middle-income', 'high-income'],
    ordered=True,
)

print(gapminder_cond['income_level'].describe())
count           1704
unique             3
top       low-income
freq            1312
Name: income_level, dtype: object
# Label each row by life expectancy: 'below-average' when lifeExp is
# at most 59.47, 'above-average' when it is higher

# Collect one label per value of the 'lifeExp' column, in row order
lifeExp_cat = []
for x in gapminder_cond['lifeExp']:
    # A conditional expression picks the label for this value
    lifeExp_cat.append('below-average' if x <= 59.47 else 'above-average')

# Ordered categorical dtype: below-average < above-average
life_exp_cat_type = pd.CategoricalDtype(
    categories=["below-average", "above-average"],
    ordered=True,
)

# Store the labels as a new column, then cast it to the ordered categorical dtype
gapminder_cond['lifeExp_cat'] = lifeExp_cat
gapminder_cond['lifeExp_cat'] = gapminder_cond['lifeExp_cat'].astype(life_exp_cat_type)

# Summarise the 'lifeExp_cat' column
print(gapminder_cond['lifeExp_cat'].describe())

count              1704
unique                2
top       above-average
freq                895
Name: lifeExp_cat, dtype: object
# Vectorised version of the life-expectancy labelling: one np.where call
# chooses between the two labels for every row at once
life_exp_labels = np.where(
    gapminder_cond['lifeExp'] <= 59.47, 'below-average', 'above-average'
)
gapminder_cond['lifeExp_cat'] = pd.Categorical(
    life_exp_labels,
    categories=['below-average', 'above-average'],
    ordered=True,
)

print(gapminder_cond['lifeExp_cat'].describe())
count              1704
unique                2
top       above-average
freq                895
Name: lifeExp_cat, dtype: object
# build the whole numbers from 6 down to -4 as a list, equivalent to R's c(6:-4)
some_numbers = [*range(6, -5, -1)]
some_numbers
[6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4]
# use numpy to take the square root of every element at once; negative
# numbers come back as NaN and numpy reports them with a RuntimeWarning
sqrt_numbers = np.sqrt(some_numbers)
# print the resulting array
print(sqrt_numbers)
[2.44948974 2.23606798 2.         1.73205081 1.41421356 1.
 0.                nan        nan        nan        nan]


/tmp/ipykernel_141/2103455644.py:2: RuntimeWarning: invalid value encountered in sqrt
  sqrt_numbers = np.sqrt(some_numbers)
# a list comprehension with a conditional expression plays the role of R's ifelse():
# negative numbers map straight to NaN, everything else gets its square root
sqrt_numbers_ifelse = [np.nan if n < 0 else n**0.5 for n in some_numbers]
# show the resulting list
print(sqrt_numbers_ifelse)

[2.449489742783178, 2.23606797749979, 2.0, 1.7320508075688772, 1.4142135623730951, 1.0, 0.0, nan, nan, nan, nan]
# Walk the DataFrame with iterrows(), which yields (index label, row) pairs
for i, row in gapminder_cond.iterrows():
    gdp = row['gdpPercap']
    # Choose the income level from the row's 'gdpPercap' value
    if gdp <= 10000:
        level = 'low-income'
    elif gdp <= 75000:
        level = 'middle-income'
    else:
        level = 'high-income'
    # Write the chosen label into this row's 'income_level' cell
    gapminder_cond.at[i, 'income_level'] = level



# Using multiple conditions with numpy's select()
# np.select() checks the conditions in order and, for each row, uses the
# choice paired with the first condition that is True
conditions = [
    gapminder_cond['pop'] <= 1000000,
    gapminder_cond['pop'] <= 100000000,
    gapminder_cond['pop'] > 100000000
]

choices = ['small', 'medium', 'large']

# The default must be a string like the choices: combining string choices
# with a float default (np.nan) has no common dtype, and recent NumPy
# versions raise a TypeError for it. 'unknown' is not one of the categories,
# so pd.Categorical stores any such row as a missing value.
gapminder_cond['pop_size'] = pd.Categorical(
    np.select(conditions, choices, default='unknown'),
    categories=['small', 'medium', 'large'],
    ordered=True
)

print(gapminder_cond['pop_size'].describe())

count       1704
unique         3
top       medium
freq        1447
Name: pop_size, dtype: object

References

The provided content and techniques are based on documentation and resources from official Python, Pandas, and NumPy websites: