Data Management
Copy the code from the exercise below and try out the basics of data management with Polars
.
Create new cells with # %%
as necessary.
Basic Data Management
Section titled “Basic Data Management”# %%# Import packages polars as pl and col from polars
# %%# Create a simple Series from a list
# %%# Create a simple DataFrame from a dictionary
# %%# Import the hosp/patients.csv.gz data using read_csv (change path as applicable)
# %%# Inspect the data# - head / tail# - describe# - value_counts
# %%# Manipulate rows# - filter# - sort# - unique
# %%# Manipulate columns# - select# - with_columns# - rename
# %%# Use different operations# add / sub# mul / truediv
# %%# Make comparisons# - eq / ne# - gt / ge# - lt / le# - is_between# - is_in# - is_null / is_not_null# - not_
# %%# Use aggregations# min / max / mode# mean / median / quantile# std / IQR
# %%# Use horizontal aggregations# - sum_horizontal# - mean_horizontal
# %%# Group by some variable# - group_by / agg
# %%# Join/merge data# - join
# %%# Work with strings# - format
# %%# Try to chain many different operations together
# %%# Import packages polars as pl and col from polarsimport polars as plfrom polars import col
# %%# Create a simple Series from a listpl.Series([1, 2, 3, 4, 5])
# %%# Create a simple DataFrame from a dictionarypl.DataFrame( { "x": [1, 2, 3, 4, 5], "y": [6, 7, 8, 9, 10], })
# %%# Import the hosp/patients.csv.gz data using read_csv (change path as applicable)patients = pl.read_csv("../../../data/hosp/patients.csv.gz")
# %%# Inspect the data# - head / tail# - describe# - value_countspatients.head()patients.tail(10)patients.describe()patients["gender"].value_counts()
# %%# Manipulate rows# - filter# - sort# - uniquepatients.filter(col("gender").eq("M"))patients.sort("anchor_age")patients.unique("gender", keep="first")
# %%# Manipulate columns# - select# - with_columns# - renamepatients.select("gender", "dod")patients["gender", "dod"]
patients.select(is_adult=col("anchor_age").gt(18))
patients.select(col("^anchor_.*$"))
patients.with_columns(is_adult=col("anchor_age").gt(18))
patients.rename({"gender": "sex"})
# %%# Use different operations# add / sub# mul / truedivpatients.select( "anchor_age", "anchor_year", age_plus_5=col("anchor_age").add(5), age_minus_5=col("anchor_age").sub(5), year_minus_age=col("anchor_age").sub("anchor_year"), # Which is the same as the more verbose, but more flexible: # year_minus_age=col("anchor_age").sub(col("anchor_year")))
patients.select( "anchor_age", age_mul_5=col("anchor_age").mul(5), age_div_5=col("anchor_age").truediv(5),)
# %%# Make comparisons# - eq / ne# - gt / ge# - lt / le# - is_between# - is_in# - is_null / is_not_null# - not_patients.select( "gender", is_male=col("gender").eq("M"), is_not_male=col("gender").ne("M"),)
patients.select( "anchor_age", age_ge_21=col("anchor_age").ge(21), age_gt_21=col("anchor_age").gt(21),)
patients.select( "anchor_age", age_le_72=col("anchor_age").le(72), age_lt_72=col("anchor_age").lt(72),)
patients.select( "anchor_age", age_21_72=col("anchor_age").is_between(21, 72, closed="both"),)
patients.select( "anchor_year", year_2125_2134_2155=col("anchor_year").is_in([2125, 2134, 2155]),)
patients.select( "dod", dead=col("dod").is_null(), not_dead=col("dod").is_not_null(),)
patients.select( "gender", is_not_male=col("gender").eq("M").not_(),)
# %%# Use aggregations# min / max / mode# mean / median / quantile# std / IQRpatients.select( n=pl.len(), age_min=col("anchor_age").min(), age_max=col("anchor_age").max(), gender_mode=col("gender").mode(),)
patients.select( age_mean=col("anchor_age").mean(), age_median=col("anchor_age").median(), age_q25=col("anchor_age").quantile(0.25), age_q50=col("anchor_age").quantile(0.50), age_q75=col("anchor_age").quantile(0.75),)
patients.select( age_std=col("anchor_age").std(), age_iqr=col("anchor_age").quantile(0.75).sub(col("anchor_age").quantile(0.25)),)
# %%# Use horizontal aggregations# - sum_horizontal# - mean_horizontalpatients.select( "anchor_age", "anchor_year", sum_horizontal=pl.sum_horizontal("anchor_age", "anchor_year"), mean_horizontal=pl.mean_horizontal("anchor_age", "anchor_year"),)
# %%# Group by some variable# - group_by / aggpatients.group_by("gender").agg( n=pl.len(), age_mean=col("anchor_age").mean(), age_std=col("anchor_age").std(), age_median=col("anchor_age").median(), age_q25=col("anchor_age").quantile(0.25), age_q75=col("anchor_age").quantile(0.75),)
# %%# Join/merge data# - joinadmissions = pl.read_csv( "../../data/hosp/admissions.csv.gz", columns=["subject_id", "insurance"])
patients.select("subject_id", "gender", "anchor_age").join( admissions, on="subject_id", how="left", validate="1:m")
# %%# Work with strings# - formatpatients.select( str_column=pl.format( "Subject {} is {} and {} years old.", col("subject_id"), col("gender").replace({"F": "female", "M": "male"}), col("anchor_age"), )).write_clipboard()
# %%# Try to chain many different operations together( patients.with_columns( male=col("gender").eq("M"), is_working_age=col("anchor_age").is_between(18, 65), ) .drop("gender") .filter(col("anchor_year").gt(2140), "is_working_age") .group_by("male") .agg(pl.len(), col("dod").is_null().sum()) .with_columns(proportion=col("dod").truediv("len")))
Patient Admissions
Section titled “Patient Admissions”# %%# Patient Admissions# Tables: hosp/patients, hosp/admissions
# %%# 1. Import packages
# %%# 2. Load the data
# %%# 3. Examine data types
# %%# 4. Join tables on subject_id to create patient-admission dataset
# %%# 5. Check for missing data
# %%# 6. Cross-tabulate gender and marital status using groupby and pivot display it nicely
# %%# 7. Get age at admission and calculate the mean and standard deviation by sex/gender# Example: Patient with anchor_year=2150, anchor_age=60, and admission_year 2153,# then: age_at_admission = 60 + (2153 - 2150) = 63
# %%# 8. Get the length of stay (dischtime - admittime) in days and calculate the mean and# standard deviation by insurance type
# %%# Patient Admissions# Tables: hosp/patients, hosp/admissions
# %%# 1. Import packagesimport polars as plfrom polars import col
# %%# 2. Load the datapatients = pl.read_csv("../../../data/hosp/patients.csv.gz")admissions = pl.read_csv("../../../data/hosp/admissions.csv.gz")
# %%# 3. Examine data typespatients.head()admissions.head()
# %%# 4. Join tables on subject_id to create patient-admission datasetpat_adm = patients.join(admissions, on="subject_id", how="left", validate="1:m")
# %%# 5. Check for missing datapat_adm.select(pl.all().is_null().sum())
# %%# 6. Cross-tabulate gender and marital status using groupby and pivot display it nicelypat_adm.group_by("gender", "marital_status").len().pivot( "gender", index="marital_status")
# %%# 7. Get age at admission and calculate the mean and standard deviation by sex/gender# Example: Patient with anchor_year=2150, anchor_age=60, and admission_year 2153,# then: age_at_admission = 60 + (2153 - 2150) = 63( pat_adm.with_columns(admit_year=col("admittime").str.slice(0, 4).cast(int)) .with_columns( admit_age=col("anchor_age").add(col("admit_year").sub(col("anchor_year"))), ) .group_by("gender") .agg( pl.format( "{} ({})", col("admit_age").mean().round(1), col("admit_age").std().round(1), ) ))
# %%# 8. Get the length of stay (dischtime - admittime) in days and calculate the mean and# standard deviation by insurance type( pat_adm.with_columns( col("admittime").str.to_datetime(), col("dischtime").str.to_datetime(), ) .with_columns(length_of_stay=col("dischtime").sub(col("admittime")).dt.total_days()) .group_by("insurance") .agg( pl.format( "{} ({})", col("length_of_stay").mean().round(1), col("length_of_stay").std().round(1), ) ))
ICU Stays
Section titled “ICU Stays”# %%# 1. Import packages
# %%# 2. Load the data
# %%# 1. Merge ICU stays with hospital admissions
# %%# 2. Calculate ICU admission proportion per admission type
# ICU Stays# Tables: hosp/admissions, icu/icustays
# %%# 1. Import packagesimport polars as plfrom polars import col
# %%# 2. Load the dataadmissions = pl.read_csv("../../../data/hosp/admissions.csv.gz")icu_stays = pl.read_csv("../../../data/icu/icustays.csv.gz")
# %%# 1. Merge ICU stays with hospital admissionsadm_icu = admissions.join(icu_stays, on="hadm_id", how="left", validate="1:m")
# %%# 2. Calculate ICU admission proportion per admission type( adm_icu.group_by("admission_type") .agg(n_icu=col("stay_id").is_not_null().sum(), n_total=pl.len()) .with_columns(prop=col("n_icu").truediv(col("n_total")).mul(100).round(1)) .sort("prop", descending=True))
Laboratory Data
Section titled “Laboratory Data”# %%# Laboratory Data# Tables: hosp/labevents, hosp/d_labitems
# %%# 1. Import packages
# %%# 2. Load the data
# %%# 3. Join with d_labitems to get human-readable lab names
# %%# 4. Check what the most common tests were
# %%# 5. Check the proportion of abnormal lab values (flag variable) per category
# %%# 6. Calculate the mean and standard deviation per year for all Glucose values
# %%# Laboratory Data# Tables: hosp/labevents, hosp/d_labitems
# %%# 1. Import packagesimport polars as plfrom polars import col
# %%# 2. Load the datalab_events = pl.read_csv("../../../data/hosp/labevents.csv.gz")lab_items = pl.read_csv("../../../data/hosp/d_labitems.csv.gz")
# %%# 3. Join with d_labitems to get human-readable lab nameslab = lab_events.join(lab_items, on="itemid", how="left", validate="m:m")
# %%# 4. Check what the most common tests werelab["label"].value_counts(sort=True)
# %%# 5. Check the proportion of abnormal lab values (flag variable) per categorylab.group_by("category").agg( abnormal=pl.format( "{} ({}%)", col("flag").is_not_null().sum(), col("flag").is_not_null().mean().mul(100).round(1), ))
# %%# 6. Calculate the mean and standard deviation per year for all Glucose values( lab.filter(col("label").eq("Glucose")) .group_by(year=col("charttime").str.to_datetime().dt.year()) .agg( pl.format( "{} ({})", col("valuenum").mean().round(1), col("valuenum").std().round(1), ) ) .sort("year"))
Medication Data
Section titled “Medication Data”# %%# Medication Data# Tables: hosp/prescriptions, hosp/emar, hosp/emar_detail# 1. Standardize medication names (handle variations in naming)# 2. Calculate medication duration from prescriptions# 3. Join prescriptions with actual administration records (emar)# 4. Identify compliance: prescribed vs administered medications
# %%# Medication Data# Tables: hosp/prescriptions, hosp/emar, hosp/emar_detail# 1. Standardize medication names (handle variations in naming)# 2. Calculate medication duration from prescriptions# 3. Join prescriptions with actual administration records (emar)# 4. Identify compliance: prescribed vs administered medications
Study Population
Section titled “Study Population”# %%# Study Population# hosp/diagnoses_icd, hosp/d_icd_diagnoses, hosp/labevents, icu.chartevents# 1. Identify sepsis patients using ICD codes (ICD-10: A40-A41)# - Patient meets criteria if ANY diagnosis matches# 2. Add laboratory criteria (elevated WBC, lactate)# - WBC > 12,000 OR < 4,000 cells/μL# - Lactate >= 2.0 mmol/L# - Calculate using worst values within 24h of admission# - Patient meets criteria if >=1 lab is abnormal# 3. Include vital sign criteria (temperature, heart rate)# - Temperature (itemid: 223761): > 38°C OR < 36°C# - Heart Rate (itemid: 220045): > 90 bpm# - Respiratory Rate (itemid: 220210): > 20 breaths/min# - Calculate using worst values within 48h of admission# - Patient meets criteria if >=2 vital signs are abnormal# 4. Export final cohort with inclusion/exclusion documentation# - Include: Patients meeting ICD codes AND (>=1 abnormal labs OR >=2 vital signs)# - Exclude: age < 18, length_of_stay < 24 hours# - Summarize: total screened, excluded by each criterion, and included at each step
# %%# Study Population# hosp/diagnoses_icd, hosp/d_icd_diagnoses, hosp/labevents, icu.chartevents# 1. Identify sepsis patients using ICD codes (ICD-10: A40-A41)# - Patient meets criteria if ANY diagnosis matches# 2. Add laboratory criteria (elevated WBC, lactate)# - WBC > 12,000 OR < 4,000 cells/μL# - Lactate >= 2.0 mmol/L# - Calculate using worst values within 24h of admission# - Patient meets criteria if >=1 lab is abnormal# 3. Include vital sign criteria (temperature, heart rate)# - Temperature (itemid: 223761): > 38°C OR < 36°C# - Heart Rate (itemid: 220045): > 90 bpm# - Respiratory Rate (itemid: 220210): > 20 breaths/min# - Calculate using worst values within 48h of admission# - Patient meets criteria if >=2 vital signs are abnormal# 4. Export final cohort with inclusion/exclusion documentation# - Include: Patients meeting ICD codes AND (>=1 abnormal labs OR >=2 vital signs)# - Exclude: age < 18, length_of_stay < 24 hours# - Summarize: total screened, excluded by each criterion, and included at each step
Exploration
Section titled “Exploration”See what else you can find in the data. 🔎