I have a data frame (df) in wide format: each participant is a row and each symptom is a binary (0/1) column.
# Reproducible toy example: three participants, five binary symptom columns
set.seed(123)
participants <- paste0("Participant", 1:3)
symptoms <- paste0("Symptom", 1:5)

# One row per participant, one 0/1 column per symptom (wide format).
# Each symptom column is drawn independently, three values at a time.
main_cl_db <- data.frame(
  participant_id = participants,
  vapply(
    symptoms,
    function(sym) sample(c(0, 1), 3, replace = TRUE),
    numeric(3)
  )
)

# Show the example data
print("Original Data (wide format):")
print(main_cl_db)
I then create
main_cl_db_no_id <- select(main_cl_db, -participant_id)
– this removes the ID column so that only the binary symptom data remain
main_cl_db_t <- t(as.matrix(main_cl_db_no_id))
– this transposes the data so that the symptoms (rather than the participants) are the rows being clustered
I now want to get back to a data frame in the same format as my original one, called new_main_cl_db, that also contains the cluster number derived from the analysis. Essentially, I want the same data frame as main_cl_db,
but with a new column called cluster.
All the code for my analysis is below
# data prep ---------------------------------------------------------------
# Build the clustering input from the survey data:
#   1. keep the participant ID and every `symptoms_new*` column;
#   2. recode missing symptom answers (NA) as 0;
#   3. convert every double column to a factor.
# NOTE(review): step 3 selects columns by type, so it would also convert
# `participant_id` if that column is a double, and may skip a symptom column
# that is still integer after step 2 -- confirm this is intended.
main_cl_db <- main_survey %>%
  select(participant_id, starts_with("symptoms_new")) %>%
  mutate(across(starts_with("symptoms_new"), ~ ifelse(is.na(.), 0, .))) %>%
  # across(where()) replaces the superseded mutate_if() and selects the same
  # columns
  mutate(across(where(is.double), as.factor))

# Remove participant_id column for clustering
main_cl_db_no_id <- main_cl_db %>% select(-participant_id)

# Transpose the data so that each symptom becomes a row (dist() compares rows)
# NOTE(review): the columns are factors at this point, so t() presumably
# yields a character matrix that dist() has to coerce back to numbers --
# verify the distances are what you expect.
main_cl_db_t <- t(main_cl_db_no_id)

# Calculate the dissimilarity matrix between symptoms; dist()'s "binary"
# method is the Jaccard distance
jaccard_dist_symptoms <- dist(main_cl_db_t, method = "binary")
#' Hierarchically cluster the rows of a binary matrix
#'
#' Computes the pairwise "binary" (Jaccard) distance between the rows of
#' `data`, resets the random seed to 1, and runs complete-linkage
#' hierarchical clustering on that distance matrix.
#'
#' NOTE(review): hclust() does not appear to draw random numbers, so the seed
#' most likely does not influence the result; it is kept because it resets the
#' global RNG state, which later code may rely on.
#'
#' @param data A matrix (or object accepted by `dist()`) whose rows are the
#'   items to cluster.
#' @return The `hclust` object produced by `hclust()`.
setseed_fct <- function(data) {
  # The local name is kept as-is: hclust() records its call in the result
  jaccard_dist <- dist(data, method = "binary")
  set.seed(1)
  hclust(d = jaccard_dist, method = "complete")
}
# Cluster the symptoms (the rows of the transposed matrix)
hc_symptoms <- setseed_fct(data = main_cl_db_t)
# Cut the dendrogram into six groups and count the symptoms in each group
grp_symp <- cutree(tree = hc_symptoms, k = 6)
table(grp_symp)