Thiết kế website giá rẻ

Question

I have a folder of csv files which I need to load into the R environment, gather statistics by multiple groups and then update a list, with the end result being accumulated values per group. Here is some dummy data:-

df1<-structure(list(Date = c("01/01/2021", "02/01/2021", "03/01/2021", 
                       "04/01/2021", "05/01/2021", "06/01/2021", "07/01/2021", "08/01/2021", 
                       "09/01/2021", "10/01/2021", "11/01/2021", "12/01/2021", "13/01/2021", 
                       "14/01/2021", "15/01/2021", "16/01/2021", "17/01/2021", "18/01/2021", 
                       "19/01/2021", "20/01/2021", "21/01/2021", "22/01/2021", "23/01/2021", 
                       "01/01/2021", "02/01/2021", "03/01/2021", "04/01/2021", "05/01/2021", 
                       "06/01/2021", "07/01/2021", "08/01/2021", "09/01/2021", "10/01/2021", 
                       "11/01/2021", "12/01/2021", "13/01/2021", "14/01/2021", "01/12/2021", 
                       "02/12/2021", "03/12/2021", "04/12/2021", "05/12/2021", "06/12/2021", 
                       "07/12/2021", "08/12/2021", "09/12/2021", "10/12/2021", "11/12/2021", 
                       "12/12/2021", "13/12/2021", "14/12/2021", "15/12/2021"), ID = c("ABC", 
                                                                                       "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", 
                                                                                       "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", 
                                                                                       "ABC", "ABC", "ABC", "ABC", "DEF", "DEF", "DEF", "DEF", "DEF", 
                                                                                       "DEF", "DEF", "DEF", "DEF", "DEF", "DEF", "DEF", "DEF", "DEF", 
                                                                                       "GHI", "GHI", "GHI", "GHI", "GHI", "GHI", "GHI", "GHI", "GHI", 
                                                                                       "GHI", "GHI", "GHI", "GHI", "GHI", "GHI"), Type = c("Type1", 
                                                                                                                                           "Type2", "Type1", "Type1", "Type1", "Type1", "Type1", "Type2", 
                                                                                                                                           "Type1", "Type2", "Type1", "Type2", "Type1", "Type2", "Type1", 
                                                                                                                                           "Type2", "Type1", "Type1", "Type1", "Type1", "Type2", "Type1", 
                                                                                                                                           "Type2", "Type1", "Type2", "Type1", "Type1", "Type1", "Type1", 
                                                                                                                                           "Type1", "Type2", "Type1", "Type2", "Type1", "Type2", "Type1", 
                                                                                                                                           "Type2", "Type1", "Type1", "Type1", "Type2", "Type1", "Type2", 
                                                                                                                                           "Type1", "Type2", "Type1", "Type1", "Type1", "Type1", "Type1", 
                                                                                                                                           "Type1", "Type2")), class = "data.frame", row.names = c(NA, -52L
                                                                                                                                           ))

df2<-structure(list(Date = c("01/04/2022", "02/04/2022", "03/04/2022", 
                        "04/04/2022", "05/04/2022", "06/04/2022", "01/07/2022", "02/07/2022", 
                        "03/07/2022", "04/07/2022", "05/07/2022", "01/12/2022", "02/12/2022", 
                        "03/12/2022", "04/12/2022", "05/12/2022", "06/12/2022", "07/12/2022", 
                        "08/12/2022", "09/12/2022", "10/12/2022", "01/02/2022", "02/02/2022", 
                        "03/02/2022", "04/02/2022", "05/02/2022", "06/02/2022", "07/02/2022", 
                        "08/02/2022", "09/02/2022", "10/02/2022", "01/04/2022", "02/04/2022", 
                        "03/04/2022", "04/04/2022", "05/04/2022", "06/04/2022", "07/04/2022", 
                        "08/04/2022", "09/04/2022", "10/04/2022", "11/04/2022", "12/04/2022", 
                        "13/04/2022", "14/04/2022", "15/04/2022", "16/04/2022", "17/04/2022", 
                        "18/04/2022", "19/04/2022", "20/04/2022", "21/04/2022", "22/04/2022", 
                        "23/04/2022", "24/04/2022", "25/04/2022", "26/04/2022", "27/04/2022", 
                        "28/04/2022", "29/04/2022", "30/04/2022", "01/05/2022", "02/05/2022", 
                        "03/05/2022", "04/05/2022", "05/05/2022", "06/05/2022", "07/05/2022", 
                        "08/05/2022", "09/05/2022", "10/05/2022", "11/05/2022", "12/05/2022", 
                        "13/05/2022", "14/05/2022", "15/05/2022", "16/05/2022", "17/05/2022", 
                        "18/05/2022", "19/05/2022", "20/05/2022", "21/05/2022", "22/05/2022", 
                        "23/05/2022", "24/05/2022"), ID = c("ABC", "ABC", "ABC", "ABC", 
                                                            "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", 
                                                            "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "HIJ", 
                                                            "HIJ", "HIJ", "HIJ", "HIJ", "HIJ", "HIJ", "HIJ", "HIJ", "HIJ", 
                                                            "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", 
                                                            "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", 
                                                            "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", 
                                                            "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", 
                                                            "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", 
                                                            "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN", "LMN"
                        ), Type = c("Type2", "Type2", "Type1", "Type2", "Type1", "Type2", 
                                    "Type2", "Type2", "Type2", "Type2", "Type1", "Type2", "Type1", 
                                    "Type2", "Type1", "Type2", "Type2", "Type1", "Type2", "Type2", 
                                    "Type1", "Type2", "Type1", "Type2", "Type2", "Type2", "Type1", 
                                    "Type2", "Type1", "Type2", "Type1", "Type2", "Type1", "Type2", 
                                    "Type1", "Type2", "Type1", "Type2", "Type1", "Type2", "Type1", 
                                    "Type2", "Type2", "Type1", "Type2", "Type1", "Type2", "Type1", 
                                    "Type2", "Type1", "Type2", "Type1", "Type2", "Type1", "Type2", 
                                    "Type1", "Type2", "Type1", "Type2", "Type1", "Type2", "Type1", 
                                    "Type2", "Type1", "Type2", "Type1", "Type2", "Type1", "Type2", 
                                    "Type1", "Type2", "Type2", "Type1", "Type2", "Type2", "Type1", 
                                    "Type2", "Type1", "Type2", "Type1", "Type2", "Type2", "Type1", 
                                    "Type2", "Type1")), class = "data.frame", row.names = c(NA, -85L
                                    ))

df3<-structure(list(Date = c("01/01/2021", "02/01/2021", "03/01/2021", 
                        "04/01/2021", "05/01/2021", "06/01/2021", "07/01/2021", "08/01/2021", 
                        "09/01/2021", "01/07/2023", "02/07/2023", "03/07/2023", "04/07/2023", 
                        "05/07/2023", "06/07/2023", "07/07/2023", "08/07/2023", "09/07/2023", 
                        "10/07/2023", "11/07/2023", "12/07/2023", "13/07/2023"), ID = c("ABC", 
                                                                                        "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "HIJ", 
                                                                                        "HIJ", "HIJ", "HIJ", "HIJ", "HIJ", "HIJ", "HIJ", "HIJ", "HIJ", 
                                                                                        "HIJ", "HIJ", "HIJ"), Type = c("Type1", "Type2", "Type1", "Type1", 
                                                                                                                       "Type2", "Type1", "Type2", "Type1", "Type2", "Type1", "Type1", 
                                                                                                                       "Type1", "Type1", "Type1", "Type1", "Type1", "Type1", "Type2", 
                                                                                                                       "Type2", "Type2", "Type2", "Type2")), class = "data.frame", row.names = c(NA, 
                                                                                                                                                                                                 -22L))




write.csv(df1, file="df1.csv")
write.csv(df2, file="df2.csv")
write.csv(df3, file="df3.csv")

I have a list created where I want to store some outputs of a group-by analysis of each csv file that I load in:-



#creating sequence of month and years
start_date <- as.Date("2021-01-01")
end_date <- as.Date("2023-12-31")
date_sequence <- seq.Date(from = start_date, to = end_date, by = "1 months")
formatted_dates <- format(date_sequence, "%Y-%m")
formatted_dates


#creating a unique identifier
unique_id<-c("ABC","DEF","GHI","HIJ","LMN")




#creating a list to store results in

month_year_list<-list()


for (i in unique_id) {
  
  counts_month_year<-data.frame(Month_Yr = c(formatted_dates),
                                Type1 = c(NA),
                                Type2 = c(NA))
  
  month_year_list[[i]]<-counts_month_year
}

Lastly, here is the code I am working with:-

#saving the file names to an object in order to load in
all_files = list.files(pattern = ".csv")



for (i in all_files) {
  
  df<-read_csv(i)#loading in csv
  #getting grouped counts
  grouped_analysis<-df%>%
    mutate(new_date=dmy(Date), Month_Yr=as.factor(substr(new_date,1,7)))%>%
    group_by(ID,Month_Yr,Type)%>%
    count(Month_Yr,Type, .drop=FALSE)%>%
    spread(Type,n)
  
  #removing NA column and replacing NA's with zeroes
  grouped_analysis$`<NA>`<-NULL
  grouped_analysis$Type1[is.na(grouped_analysis$Type1)]<-0
  grouped_analysis$Type2[is.na(grouped_analysis$Type2)]<-0
  
  
  ind <- match(grouped_analysis$ID, unique_id)#matching ID
  
  
  #inner for loop
  
  for (j in ind) {
    
    df_id<-month_year_list[[j]]
    
    df_update <- grouped_analysis[grouped_analysis$ID == unique_id[j],]
    
    month_year_list[[j]] <- df_id%>%
      left_join(df_update, by = "Month_Yr")%>%
      mutate(Type1 = coalesce(Type1.x + Type1.y, Type1.x, Type1.y),
             Type2 = coalesce(Type2.x + Type2.y, Type2.x, Type2.y))%>%
      select(Month_Yr, Type1, Type2)
    
  }#gathering grouped counts and updating the list
}

I want to be able to add the values from each iteration of the grouped analysis onto the values which pre-exist in the list. However, whenever I do this with the current code, the values artificially inflate in some instances.

For instance, ID “ABC” appears in all three csv files and has the following counts:-

# table(df1$ID,df1$Type)
# 
# Type1 Type2
# ABC    15     8
# DEF     9     5
# GHI    11     4
# > table(df2$ID,df2$Type)
# 
# Type1 Type2
# ABC     7    14
# HIJ     4     6
# LMN    25    29
# > table(df3$ID,df3$Type)
# 
# Type1 Type2
# ABC     5     4
# HIJ     8     5

So in total, ABC should have 27 for Type1 and 26 for Type2 (spread over the Month_Yr variable).

But when I assign the values to the list, the values are artificially inflated.

How may I fix this so that the accumulated values by ID and Month_Yr are correct? Is there an accurate and more efficient way to achieve what I am after? Thanks!

Thiết kế website giá rẻ

Danh mục

Updating list in R; accumulating values by multiple groups