library("dplyr")
library("readr")
library("tidyr")
This is my solution to the sample data manipulation task published by PreDoc.org.
You can find the full task here. This is exercise 2 from data task 1.
The task is to load some health insurance subscription data, manipulate the data to develop a summary of certain variables by county (rather than by insurance plan), and then output that data in a specified format.
Load libraries.
Read Data
<- read_csv("./task/scp-1205.csv", col_names = FALSE)
medicare
# `contract` is missing from the list of variables given in the instructions
# it is included with the name `contract` in the sample row
names(medicare) <- c(
"countyname",
"state",
"contract",
"healthplanname",
"typeofplan",
"countyssa",
"eligibles",
"enrollees",
"penetration",
"ABrate"
)
Fill Missing Values
<- medicare |>
medicare_filled_missing mutate(
across(
c(eligibles, enrollees, penetration),
~ replace_na(.x, 0)
) )
Complete Objective
<- medicare_filled_missing |>
medicare_by_county # Filter territories and District of Columbia
filter(!state %in% c("PR", "GU", "DC", "VI", "AS")) |>
# Filter unspecified counties
filter(!(is.na(state) | state == 99)) |>
group_by(countyname) |>
summarize(
# first() is used where all the counties will have the same value
state = first(state),
numberofplans1 = sum(enrollees > 10),
numberofplans2 = sum(penetration > .5),
countyssa = first(countyssa),
eligibles = first(eligibles),
totalenrollees = sum(enrollees),
totalpenetration = 100 * totalenrollees / eligibles
|>
) arrange(state, countyname)
Write to CSV
# Commented out for the web writeup.
# write_csv(medicare_by_county, "medicare_output.csv")
# Instead print some of the data.
head(medicare_by_county, 100)