I have a folder of csv files which I need to load into the R environment, gather statistics by multiple groups and then update a list, with the end result being accumulated values per group. Here is some dummy data:-
df1<-structure(list(Date = c("01/01/2021", "02/01/2021", "03/01/2021",
"04/01/2021", "05/01/2021", "06/01/2021", "07/01/2021", "08/01/2021",
"09/01/2021", "10/01/2021", "11/01/2021", "12/01/2021", "13/01/2021",
"14/01/2021", "15/01/2021", "16/01/2021", "17/01/2021", "18/01/2021",
"19/01/2021", "20/01/2021", "21/01/2021", "22/01/2021", "23/01/2021",
"01/01/2021", "02/01/2021", "03/01/2021", "04/01/2021", "05/01/2021",
"06/01/2021", "07/01/2021", "08/01/2021", "09/01/2021", "10/01/2021",
"11/01/2021", "12/01/2021", "13/01/2021", "14/01/2021", "01/12/2021",
"02/12/2021", "03/12/2021", "04/12/2021", "05/12/2021", "06/12/2021",
"07/12/2021", "08/12/2021", "09/12/2021", "10/12/2021", "11/12/2021",
"12/12/2021", "13/12/2021", "14/12/2021", "15/12/2021"), ID = c("ABC",
"ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC",
"ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC",
"ABC", "ABC", "ABC", "ABC", "DEF", "DEF", "DEF", "DEF", "DEF",
"DEF", "DEF", "DEF", "DEF", "DEF", "DEF", "DEF", "DEF", "DEF",
"GHI", "GHI", "GHI", "GHI", "GHI", "GHI", "GHI", "GHI", "GHI",
"GHI", "GHI", "GHI", "GHI", "GHI", "GHI"), Type = c("Type1",
"Type2", "Type1", "Type1", "Type1", "Type1", "Type1", "Type2",
"Type1", "Type2", "Type1", "Type2", "Type1", "Type2", "Type1",
"Type2", "Type1", "Type1", "Type1", "Type1", "Type2", "Type1",
"Type2", "Type1", "Type2", "Type1", "Type1", "Type1", "Type1",
"Type1", "Type2", "Type1", "Type2", "Type1", "Type2", "Type1",
"Type2", "Type1", "Type1", "Type1", "Type2", "Type1", "Type2",
"Type1", "Type2", "Type1", "Type1", "Type1", "Type1", "Type1",
"Type1", "Type2")), class = "data.frame", row.names = c(NA, -52L
))
df2<-structure(list(Date = c("01/04/2022", "02/04/2022", "03/04/2022",
"04/04/2022", "05/04/2022", "06/04/2022", "01/07/2022", "02/07/2022",
"03/07/2022", "04/07/2022", "05/07/2022", "01/12/2022", "02/12/2022",
"03/12/2022", "04/12/2022", "05/12/2022", "06/12/2022", "07/12/2022",
"08/12/2022", "09/12/2022", "10/12/2022", "01/02/2022", "02/02/2022",
"03/02/2022", "04/02/2022", "05/02/2022", "06/02/2022", "07/02/2022",
"08/02/2022", "09/02/2022", "10/02/2022", "01/04/2022", "02/04/2022",
"03/04/2022", "04/04/2022", "05/04/2022", "06/04/2022", "07/04/2022",
"08/04/2022", "09/04/2022", "10/04/2022", "11/04/2022", "12/04/2022",
"13/04/2022", "14/04/2022", "15/04/2022", "16/04/2022", "17/04/2022",
"18/04/2022", "19/04/2022", "20/04/2022", "21/04/2022", "22/04/2022",
"23/04/2022", "24/04/2022", "25/04/2022", "26/04/2022", "27/04/2022",
"28/04/2022", "29/04/2022", "30/04/2022", "01/05/2022", "02/05/2022",
"03/05/2022", "04/05/2022", "05/05/2022", "06/05/2022", "07/05/2022",
"08/05/2022", "09/05/2022", "10/05/2022", "11/05/2022", "12/05/2022",
"13/05/2022", "14/05/2022", "15/05/2022", "16/05/2022", "17/05/2022",
"18/05/2022", "19/05/2022", "20/05/2022", "21/05/2022", "22/05/2022",
"23/05/2022", "24/05/2022"), ID = c("ABC", "ABC", "ABC", "ABC",
"ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC",
"ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "HIJ",
"HIJ", "HIJ", "HIJ", "HIJ", "HIJ", "HIJ", "HIJ", "HIJ", "HIJ",
"LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN",
"LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN",
"LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN",
"LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN",
"LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN",
"LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN"
), Type = c("Type2", "Type2", "Type1", "Type2", "Type1", "Type2",
"Type2", "Type2", "Type2", "Type2", "Type1", "Type2", "Type1",
"Type2", "Type1", "Type2", "Type2", "Type1", "Type2", "Type2",
"Type1", "Type2", "Type1", "Type2", "Type2", "Type2", "Type1",
"Type2", "Type1", "Type2", "Type1", "Type2", "Type1", "Type2",
"Type1", "Type2", "Type1", "Type2", "Type1", "Type2", "Type1",
"Type2", "Type2", "Type1", "Type2", "Type1", "Type2", "Type1",
"Type2", "Type1", "Type2", "Type1", "Type2", "Type1", "Type2",
"Type1", "Type2", "Type1", "Type2", "Type1", "Type2", "Type1",
"Type2", "Type1", "Type2", "Type1", "Type2", "Type1", "Type2",
"Type1", "Type2", "Type2", "Type1", "Type2", "Type2", "Type1",
"Type2", "Type1", "Type2", "Type1", "Type2", "Type2", "Type1",
"Type2", "Type1")), class = "data.frame", row.names = c(NA, -85L
))
df3<-structure(list(Date = c("01/01/2021", "02/01/2021", "03/01/2021",
"04/01/2021", "05/01/2021", "06/01/2021", "07/01/2021", "08/01/2021",
"09/01/2021", "01/07/2023", "02/07/2023", "03/07/2023", "04/07/2023",
"05/07/2023", "06/07/2023", "07/07/2023", "08/07/2023", "09/07/2023",
"10/07/2023", "11/07/2023", "12/07/2023", "13/07/2023"), ID = c("ABC",
"ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "HIJ",
"HIJ", "HIJ", "HIJ", "HIJ", "HIJ", "HIJ", "HIJ", "HIJ", "HIJ",
"HIJ", "HIJ", "HIJ"), Type = c("Type1", "Type2", "Type1", "Type1",
"Type2", "Type1", "Type2", "Type1", "Type2", "Type1", "Type1",
"Type1", "Type1", "Type1", "Type1", "Type1", "Type1", "Type2",
"Type2", "Type2", "Type2", "Type2")), class = "data.frame", row.names = c(NA,
-22L))
write.csv(df1, file="df1.csv")
write.csv(df2, file="df2.csv")
write.csv(df3, file="df3.csv")
I have a list created where I want to store some outputs of a group-by analysis of each csv file that I load in:-
#creating sequence of month and years
start_date <- as.Date("2021-01-01")
end_date <- as.Date("2023-12-31")
date_sequence <- seq.Date(from = start_date, to = end_date, by = "1 months")
formatted_dates <- format(date_sequence, "%Y-%m")
formatted_dates
#creating a unique identifier
unique_id<-c("ABC","DEF","GHI","HIJ","LMN")
#creating a list to store results in
month_year_list<-list()
for (i in unique_id) {
counts_month_year<-data.frame(Month_Yr = c(formatted_dates),
Type1 = c(NA),
Type2 = c(NA))
month_year_list[[i]]<-counts_month_year
}
Lastly, here is the code I am working with:-
#saving the file names to an object in order to load in
all_files = list.files(pattern = ".csv")
for (i in all_files) {
df<-read_csv(i)#loading in csv
#getting grouped counts
grouped_analysis<-df%>%
mutate(new_date=dmy(Date), Month_Yr=as.factor(substr(new_date,1,7)))%>%
group_by(ID,Month_Yr,Type)%>%
count(Month_Yr,Type, .drop=FALSE)%>%
spread(Type,n)
#removing NA column and replacing NA's with zeroes
grouped_analysis$`<NA>`<-NULL
grouped_analysis$Type1[is.na(grouped_analysis$Type1)]<-0
grouped_analysis$Type2[is.na(grouped_analysis$Type2)]<-0
ind <- match(grouped_analysis$ID, unique_id)#matching ID
#inner for loop
for (j in ind) {
df_id<-month_year_list[[j]]
df_update <- grouped_analysis[grouped_analysis$ID == unique_id[j],]
month_year_list[[j]] <- df_id%>%
left_join(df_update, by = "Month_Yr")%>%
mutate(Type1 = coalesce(Type1.x + Type1.y, Type1.x, Type1.y),
Type2 = coalesce(Type2.x + Type2.y, Type2.x, Type2.y))%>%
select(Month_Yr, Type1, Type2)
}#gathering grouped counts and updating the list
}
I want to be able to add the values from each iteration of the grouped analysis onto the values which pre-exist in the list. However, whenever I do this with the current code, the values artificially inflate in some instances.
For instance, ID “ABC” appears in all three csv files and has the following counts:-
# table(df1$ID,df1$Type)
#
# Type1 Type2
# ABC 15 8
# DEF 9 5
# GHI 11 4
# > table(df2$ID,df2$Type)
#
# Type1 Type2
# ABC 7 14
# HIJ 4 6
# LMN 25 29
# > table(df3$ID,df3$Type)
#
# Type1 Type2
# ABC 5 4
# HIJ 8 5
So in total, ABC should have 27 for Type1 and 26 for Type2 (spread over the Month_Yr variable).
But when I assign the values to the list, the values are artificially inflated.
How may I fix this so that the accumulated values by ID and Month_Yr are correct? Is there an accurate and more efficient way to achieve what I am after? Thanks!