As input, I’ve got a large data frame in R with lists of strings of different lengths, referring to certain codes – like this:
glt.code glt.phylogeny
1 adha1238 adha1238
2 adiv1239 adiv1239
3 adiw1235 adiw1235
4 aerr1238 jikr1238;jikr1238;aerr1238
I would like to replace the codes in the column glt.phylogeny
based on their name in this look-up table:
code name level
1 adha1238 Adhari language
2 adiv1239 Kotia-Adivasi Oriya-Desiya language
3 adiw1235 Adiwasi Garasia language
4 aerr1238 Aer language
5 jikr1238 Jikrio Aer dialect
My desired output looks like this:
glt.code glt.phylogeny.names
1 adha1238 Adhari
2 adiv1239 Kotia-Adivasi Oriya-Desiya
3 adiw1235 Adiwasi Garasia
4 aerr1238 Jikrio Aer;Jikrio Aer;Aer
I’d like to find a pipelined (dplyr) solution replacing all substrings in a column of a data frame based on a look-up table. I’ve experimented using str_replace_all
and stri_replace_all_fixed
, based on other questions on Stack Overflow but without a working result.
The actual data frame and look-up table is much larger than that so a scalable solution would be appreciated.
I think this will get you what you want using str_replace_all
by extracting a named vector from your lookup table:
library(tidyverse)
# data
df <- tibble(
glt.code = c("adha1238", "adiv1239", "adiw1235", "aerr1238"),
glt.phylogeny = c("adha1238", "adiv1239", "adiw1235", "jikr1238;jikr1238;aerr1238")
)
df_look_up <- tibble(
code = c("adha1238", "adiv1239", "adiw1235", "aerr1238", "jikr1238"),
name = c("Adhari", "Kotia-Adivasi Oriya-Desiya", "Adivasi Garasia", "Aer", "Jikrio Aer"),
level = c("language",
"language", "language", "language", "dialect")
)
# create named vector
named_look_up_vector <- df_look_up %>%
pull(name, code)
# use str_replace_all
df %>%
mutate(glt.phylogeny = str_replace_all(glt.phylogeny, named_look_up_vector))
# # A tibble: 4 × 2
# glt.code glt.phylogeny
# <chr> <chr>
# 1 adha1238 Adhari
# 2 adiv1239 Kotia-Adivasi Oriya-Desiya
# 3 adiw1235 Adivasi Garasia
# 4 aerr1238 Jikrio Aer;Jikrio Aer;Aer
1
Split the data on semi-colon, join it with lookup table and collapse the names into semi-colon separated string.
Assuming the dataframe are called df1
and df2
you can do :
library(dplyr)
library(tidyr)
df1 %>%
separate_longer_delim(glt.phylogeny, ";") %>%
left_join(df2, join_by(glt.phylogeny == code)) %>%
summarise(name = paste0(name, collapse = ";"), .by = glt.code)
# glt.code name
#1 adha1238 Adhari
#2 adiv1239 Kotia-Adivasi Oriya-Desiya
#3 adiw1235 Adiwasi Garasia
#4 aerr1238 Jikrio Aer;Jikrio Aer;Aer
data
df1 <- structure(list(glt.code = c("adha1238", "adiv1239", "adiw1235",
"aerr1238"), glt.phylogeny = c("adha1238", "adiv1239", "adiw1235",
"jikr1238;jikr1238;aerr1238")), class = "data.frame", row.names = c(NA, -4L))
df2 <- structure(list(code = c("adha1238", "adiv1239", "adiw1235", "aerr1238",
"jikr1238"), name = c("Adhari", "Kotia-Adivasi Oriya-Desiya",
"Adiwasi Garasia", "Aer", "Jikrio Aer"), level = c("language",
"language", "language", "language", "dialect")),
class = "data.frame", row.names = c(NA, -5L))
1
Is this what you are looking for?.
library(dplyr)
library(stringr)
df1 <- tibble(
glt.code = c("adha1238", "adiv1239", "adiw1235", "aerr1238"),
glt.phylogeny = c("adha1238", "adiv1239", "adiw1235", "jikr1238;jikr1238;aerr1238")
)
df2 <- tibble(
code = c("adha1238", "adiv1239", "adiw1235", "aerr1238", "jikr1238"),
name = c("Adhari", "Kotia-Adivasi Oriya-Desiya", "Adivasi Garasia", "Aer", "Jikrio Aer")
)
# Function to replace codes with names
replace_codes_with_names <- function(codes, lookup_table) {
codes_list <- str_split(codes, ";")[[1]]
names_list <- lookup_table$name[match(codes_list, lookup_table$code)]
paste(names_list, collapse = ";")
}
# desired output
desired_output <- df1 %>%
rowwise() %>%
mutate(glt.phylogeny.names = replace_codes_with_names(glt.phylogeny, df2)) %>%
ungroup()
Using purrr::map
on the strsplit
codes, then match
the name with code
library(dplyr)
df %>%
summarize(glt.phylogeny.names = purrr::map(strsplit(glt.phylogeny, ";"), ~
paste0(loup$name[match(.x, loup$code)], collapse=";")), .by = glt.code)
output
glt.code glt.phylogeny.names
1 adha1238 Adhari
2 adiv1239 Kotia-Adivasi Oriya-Desiya
3 adiw1235 Adiwasi Garasia
4 aerr1238 Jikrio Aer;Jikrio Aer;Aer
If performance is an issue, try data.table
. An equivalent approach can be
library(data.table)
setDT(df)
df[, .(glt.phylogeny.names = sapply(strsplit(glt.phylogeny, ";"), (x)
paste0(loup$name[match(x, loup$code)], collapse=";"))), by = glt.code]
glt.code glt.phylogeny.names
<char> <char>
1: adha1238 Adhari
2: adiv1239 Kotia-Adivasi Oriya-Desiya
3: adiw1235 Adiwasi Garasia
4: aerr1238 Jikrio Aer;Jikrio Aer;Aer
Data
df <- structure(list(glt.code = c("adha1238", "adiv1239", "adiw1235",
"aerr1238"), glt.phylogeny = c("adha1238", "adiv1239", "adiw1235",
"jikr1238;jikr1238;aerr1238")), class = "data.frame", row.names = c("1",
"2", "3", "4"))
loup <- structure(list(code = c("adha1238", "adiv1239", "adiw1235",
"aerr1238", "jikr1238"), name = c("Adhari", "Kotia-Adivasi Oriya-Desiya",
"Adiwasi Garasia", "Aer", "Jikrio Aer"), level = c("language",
"language", "language", "language", "dialect")), class =
"data.frame", row.names = c("1", "2", "3", "4", "5"))