I have a protein alignment (CLUSTAL format) that looks as in this link
(also image attached) and would like to input into R and create a table that has
- position in SMARCE1
- residue in SMARCE1
- position in swsn-3_protein
- residue in swns-3_protein and
- the corresponding
.
,*
or:
Looks like:
Row 1 1 M _ _ N/A
Row36 36 Y 1 M N/A
Row37 37 N 2 S .
Row38 38 N 3 S .
Row39 39 Y 4 F :
I can read in this alignment file as per the read.alignment function from seqinrhttps://www.rdocumentation.org/packages/seqinr/versions/4.2-36/topics/read.alignment.
fasta.res <- read.alignment(file = "sample.fasta", format="fasta", whole.header =TRUE)
clustal.res <- read.alignment(file ="sample.aln", format="clustal")
I get for clustal.res
:
structure(list(nb = 2, nam = c("SMARCE1", "swsn-3"), seq = list(
"mskrpsyappptpapatqmpstpgfvgynpyshlaynnyrlg-gnpgtnsrvtassgitit59pkppkppdkplmpymrysrkvwdqvkasnpdlklweigkiiggmwrdltdeekqeylneyt119eaekieynesmkayhnspaylayinaksraeaaleeesrqrqsrmekgepymsiqpaedpt179ddyddgfsmkhtatarfqrnhrliseilsesvvpdvrsvvttarmqvlkrqvqslmvhqrt239kleaellqieerhqekkrkflestdsfnnelkrlcglkvevdmekiaaeiaqaeeqarkrt299qeereke-------aaeqaersqssivpeeeqaankgeekkddenipmeteethleettet352sqqngeegtstpedkesgqegvdsmaeegtsdsntgsesnsatveepptdpipedekkett411",
"-----------------------------------mssfrhpqatparn------fqdmst19srgpkvperplqpymrysrkmwpkvraenpeaqlwdigkmigkywldlpdgekshyqheyt79elekadyekqmkhfqgngisnfmink-grak----nnekmsrsrmdaggv--viqpv-det131ddggnelstrrlagvrfernnrlisdlfspsivtdtrtvvphhrmemlkrqatslgthqst191kleeeltklerahdnrkraiekgsddfqeqlkkavsekpvvdeekfeetvkeweeklatat251yeeykkkqettatqnansapll-rnlvmeetpktpeqekeqnvkekpaestetp------t304------------------aagtepstddgaiettttt--tttenkekpeekmee-----tt338"),
com = NA), class = "alignment")
And for fasta.res:structure(list(nb = 2, nam = c("SMARCE1", "swsn-3"), seq = list(
"mskrpsyappptpapatqmpstpgfvgynpyshlaynnyrlg-gnpgtnsrvtassgitipkppkppdkplmpymrysrkvwdqvkasnpdlklweigkiiggmwrdltdeekqeylneyeaekieynesmkayhnspaylayinaksraeaaleeesrqrqsrmekgepymsiqpaedpddyddgfsmkhtatarfqrnhrliseilsesvvpdvrsvvttarmqvlkrqvqslmvhqrkleaellqieerhqekkrkflestdsfnnelkrlcglkvevdmekiaaeiaqaeeqarkrqeereke-------aaeqaersqssivpeeeqaankgeekkddenipmeteethleettesqqngeegtstpedkesgqegvdsmaeegtsdsntgsesnsatveepptdpipedekke",
"-----------------------------------mssfrhpqatparn------fqdmssrgpkvperplqpymrysrkmwpkvraenpeaqlwdigkmigkywldlpdgekshyqheyelekadyekqmkhfqgngisnfmink-grak----nnekmsrsrmdaggv--viqpv-deddggnelstrrlagvrfernnrlisdlfspsivtdtrtvvphhrmemlkrqatslgthqskleeeltklerahdnrkraiekgsddfqeqlkkavsekpvvdeekfeetvkeweeklatayeeykkkqettatqnansapll-rnlvmeetpktpeqekeqnvkekpaestetp------------------------aagtepstddgaiettttt--tttenkekpeekmee-----"),
com = NA), class = "alignment")
It has the sequence names and the residues. Not sure if the alignment is read in and trying to figure out how to get to the table I need from here.
1
Here is my approach to create the dataframe from CLUSTAL output.
library(readr)
library(dplyr)
library(stringr)
# Read each line as string then remove empty lines, skip header
clustal_raw <- read_lines("sample.aln", skip = 1)
clustal_raw <- clustal_raw[clustal_raw != ""]
# The trailing positions are separted from the sequences with tab in the example.
# The delim could also be space. Remove those
clustal_raw <- str_remove(clustal_raw, "(t| )[:digit:]+")
# The remaining part is a fix width structure
# Add id to the symbol lines
str_sub(clustal_raw[str_starts(clustal_raw, " ")], 1, 3) <- "symbol"
# Identify the position of the last space
last_space <- str_locate(clustal_raw, " +")[1, "end"]
# Read fixed width lines into dataframe
clustal_df <- read_fwf(I(clustal_raw),
fwf_cols(id = c(1, last_space), seq = c(last_space + 1, nchar(clustal_raw[1]))),
trim_ws = FALSE) %>%
mutate(id = str_remove_all(id, " ")) %>%
# Collapse sequences for each id
group_by(id) %>%
summarize(seq = paste0(seq, collapse = ""),
.groups = "drop")
# Get the squence names
seq_names <- clustal_df$id[1:(nrow(clustal_df) - 1)]
# Convert to vector
clustal_vec <- clustal_df$seq
names(clustal_vec) <- clustal_df$id
# Split vector by charachters, and convert to dataframe
output <- str_split(clustal_vec, pattern = "", simplify = TRUE) %>%
t() %>%
as.data.frame()
names(output) <- names(clustal_vec)
# This function takes a broken down sequence and return the position for
# each residue. Gaps return NAs.
GetPosition <- function(vec) {
# mark non-gap positions
vec_bool <- vec != "-"
# residue position +1 if for non-gap
vec_out <- cumsum(vec_bool)
vec_out[vec_bool == FALSE] <- NA
return(vec_out)
}
output <- output %>%
# Get the residue positions for each sequence
mutate(across(all_of(seq_names), GetPosition, .names = "{.col}_POS")) %>%
# Rearrange the column orders
relocate(ends_with("POS")) %>%
relocate(starts_with(seq_names)) %>%
# Replace space as NA in the symbol column.
mutate(symbol = ifelse(symbol == " ",
NA,
symbol))
slice(output, c(1, 36:39))
The output look like this:
SMARCE1_POS SMARCE1 swsn-3_protein_POS swsn-3_protein symbol
1 1 M NA - <NA>
2 36 Y 1 M <NA>
3 37 N 2 S .
4 38 N 3 S .
5 39 Y 4 F :
Chen is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.