Pandas Intermediate

Use the Template to explore some further functionality of Pandas. Create new cells with # %% as necessary.

Use the Data Management section and the Pandas Documentation for help.

Template
# %%
# Import Pandas


# %%
# Read data from the CSV file (read_csv):
# https://gitlab.com/alping/python-data-science/-/raw/main/data/external/heart-disease.csv


# %%
# Inspect the data [.info, .describe]


# %%
# Get the number of males and females [.value_counts]


# %%
# Create a 2x2 table for the variables sex and exang (exercise-induced angina)


# %%
# Inspect the age distribution as a histogram [.hist] and
# adjust the number of bins


# %%
# Inspect the age distribution, stratified by sex


# %%
# Keep only observations with chol > 200 [.query]


# %%
# Change the sex variable to have the values male/female,
# instead of 1/0 [.assign, .replace]


# %%
# Create a new categorical age variable, binning ages in
# decades (0, 10, 20, ...) [.assign, pd.cut]


# %%
# In a new data variable, using method chaining:
# - Change the sex variable and create the age variable as above, but in one assign statement
# - Rename the column "exang" to "exercise_angina" [rename]
# - Keep only those with age between 18 and 50, inclusive [query]
# - Sort by "chol" [sort_values]
Solution
# %%
# Import Pandas
import pandas as pd

# %%
# Load the heart-disease dataset into a DataFrame [read_csv], fetching the CSV from:
# https://gitlab.com/alping/python-data-science/-/raw/main/data/external/heart-disease.csv
data = pd.read_csv(
    filepath_or_buffer="https://gitlab.com/alping/python-data-science/-/raw/main/data/external/heart-disease.csv"
)

# Evaluate the frame on its own so the cell shows it
data

# %%
# Inspect the data [.info, .describe]
# Two separate looks at the same frame: .info() is called first, then
# .describe(). Neither result is assigned, so `data` is left as it was.
data.info()
data.describe()

# %%
# Get the number of males and females [.value_counts]
# Select the "sex" column and count how many rows carry each distinct value
data["sex"].value_counts()

# %%
# Cross-tabulate sex (rows) against exang, exercise-induced angina (columns),
# to get a 2x2 table of counts
pd.crosstab(index=data["sex"], columns=data["exang"])

# Alternative: count every (sex, exang) combination, then move the last
# index level into the columns
data[["sex", "exang"]].value_counts().unstack()

# %%
# Plot the age distribution as a histogram [.hist]; the bins argument
# controls how many bins are drawn
data.hist(column="age", bins=30)

# %%
# Plot the age distribution separately for each value of sex
data.hist(column="age", by="sex")

# Alternative: group the rows by sex first, then draw a histogram of age
# for each group with density=True and alpha=0.5
data.groupby(by="sex")["age"].hist(density=True, alpha=0.5)

# %%
# Keep only observations with chol > 200 [.query]
# The four expressions below are alternative spellings of the same row
# filter. None of them is assigned, so `data` itself is not modified.
data.query("chol > 200")

# Or: index with a boolean mask built by the > operator

data[data["chol"] > 200]

# Or: build the same mask with the .gt() method

data[data["chol"].gt(200)]

# Or: pass the .gt() mask through .loc

data.loc[data["chol"].gt(200)]

# %%
# Recode the sex variable so it holds the labels male/female rather than
# the codes 1/0 [.assign, .replace]
data.assign(
    sex=lambda frame: frame["sex"].replace({1: "male", 0: "female"}),
)

# %%
# Add a categorical age column, age_c, by cutting age at the bin edges
# 0, 10, 20, ..., 100 [.assign, pd.cut]
data.assign(
    age_c=lambda frame: pd.cut(
        frame["age"],
        bins=range(0, 101, 10),
        include_lowest=True,
    )
)

# %%
# Build a transformed copy of the data, data_mod, in one method chain:
# - Recode sex from 1/0 to male/female and add the binned age_c column,
#   both in a single assign statement
# - Rename the column "exang" to "exercise_angina" [rename]
# - Keep only those with age between 18 and 50, inclusive [query]
# - Sort by "chol" [sort_values]
data_mod = (
    data.assign(
        sex=lambda x: x["sex"].replace({1: "male", 0: "female"}),
        age_c=lambda x: pd.cut(x["age"], bins=range(0, 101, 10), include_lowest=True),
    )
    .rename(columns={"exang": "exercise_angina"})
    .query("18 <= age <= 50")
    # Last step of the task list above; without it the rows would stay in
    # their original file order
    .sort_values("chol")
)

# Evaluate the result on its own so the cell shows it
data_mod