Pandas Intermediate

Use the Template to explore some further functionality of Pandas. Create new cells with # %% as necessary.

Use the Data Management section and the Pandas Documentation for help.

Template
# %%
# Import Pandas

# %%
# Read data from the CSV file (read_csv):
# https://gitlab.com/alping/python-data-science/-/raw/main/data/external/heart-disease.csv

# %%
# Inspect the data [.info, .describe]

# %%
# Get the number of males and females [.value_counts]

# %%
# Create a 2x2 table for the variables sex and exang (exercise-induced angina)

# %%
# Inspect the age distribution as a histogram [.hist] and
# adjust the number of bins

# %%
# Inspect the age distribution, stratified by sex

# %%
# Keep only observations with chol >200 [.query]

# %%
# Change the sex variable to have the values male/female,
# instead of 1/0 [.assign, .replace]

# %%
# Create a new categorical age variable, binning ages in
# decades (0, 10, 20, ...) [.assign, pd.cut]

# %%
# In a new data variable, using method chaining:
# - Change the sex variable and create the age variable as above, but in one assign statement
# - Rename the column "exang" to "exercise_angina" [rename]
# - Keep only those with age between 18 and 50, inclusive [query]
# - Sort by "chol" [sort_values]
Solution
# %%
# Import Pandas
import pandas as pd

# %%
# Read data from the CSV file (read_csv):
# https://gitlab.com/alping/python-data-science/-/raw/main/data/external/heart-disease.csv
data = pd.read_csv(
    "https://gitlab.com/alping/python-data-science/-/raw/main/data/external/heart-disease.csv"
)
data

# %%
# Inspect the data [.info, .describe]
data.info()
data.describe()

# %%
# Get the number of males and females [.value_counts]
data["sex"].value_counts()

# %%
# Create a 2x2 table for the variables sex and exang (exercise-induced angina)
pd.crosstab(data["sex"], data["exang"])

# Or
data[["sex", "exang"]].value_counts().unstack()

# %%
# Inspect the age distribution as a histogram [.hist] and
# adjust the number of bins
data.hist("age", bins=30)

# %%
# Inspect the age distribution, stratified by sex
data.hist("age", by="sex")

# Or (overlaid, normalised so the two groups are comparable)
data.groupby("sex")["age"].hist(density=True, alpha=0.5)

# %%
# Keep only observations with chol >200 [.query]
data.query("chol > 200")

# Or
data[data["chol"] > 200]

# Or
data[data["chol"].gt(200)]

# Or
data.loc[data["chol"].gt(200)]

# %%
# Change the sex variable to have the values male/female,
# instead of 1/0 [.assign, .replace]
data.assign(sex=lambda x: x["sex"].replace({1: "male", 0: "female"}))

# %%
# Create a new categorical age variable, binning ages in
# decades (0, 10, 20, ...) [.assign, pd.cut]
# NOTE: pd.cut bins are right-closed by default, so an age of exactly 50
# lands in (40, 50]; include_lowest=True makes the first bin include 0.
data.assign(
    age_c=lambda x: pd.cut(x["age"], bins=range(0, 101, 10), include_lowest=True)
)

# %%
# In a new data variable, using method chaining:
# - Change the sex variable and create the age variable as above, but in one assign statement
# - Rename the column "exang" to "exercise_angina" [rename]
# - Keep only those with age between 18 and 50, inclusive [query]
# - Sort by "chol" [sort_values]
data_mod = (
    data.assign(
        sex=lambda x: x["sex"].replace({1: "male", 0: "female"}),
        age_c=lambda x: pd.cut(x["age"], bins=range(0, 101, 10), include_lowest=True),
    )
    .rename(columns={"exang": "exercise_angina"})
    .query("18 <= age <= 50")
    # Final step required by the exercise: order rows by cholesterol.
    .sort_values("chol")
)
data_mod