Convert a protein alignment to a table in R

I have a protein alignment (CLUSTAL format) that looks as in this link
(also image attached) and would like to input into R and create a table that has

position in SMARCE1
residue in SMARCE1
position in swsn-3_protein
residue in swns-3_protein and
the corresponding ., * or :

Looks like:

Row 1   1  M _ _ N/A

Row36   36 Y 1 M N/A

Row37   37 N 2 S  .

Row38   38 N 3 S  .

Row39   39 Y 4 F  :

I can read in this alignment file as per the read.alignment function from seqinrhttps://www.rdocumentation.org/packages/seqinr/versions/4.2-36/topics/read.alignment.

fasta.res <- read.alignment(file = "sample.fasta", format="fasta", whole.header =TRUE)
clustal.res <- read.alignment(file ="sample.aln", format="clustal")

I get for clustal.res:

structure(list(nb = 2, nam = c("SMARCE1", "swsn-3"), seq = list(
    "mskrpsyappptpapatqmpstpgfvgynpyshlaynnyrlg-gnpgtnsrvtassgitit59pkppkppdkplmpymrysrkvwdqvkasnpdlklweigkiiggmwrdltdeekqeylneyt119eaekieynesmkayhnspaylayinaksraeaaleeesrqrqsrmekgepymsiqpaedpt179ddyddgfsmkhtatarfqrnhrliseilsesvvpdvrsvvttarmqvlkrqvqslmvhqrt239kleaellqieerhqekkrkflestdsfnnelkrlcglkvevdmekiaaeiaqaeeqarkrt299qeereke-------aaeqaersqssivpeeeqaankgeekkddenipmeteethleettet352sqqngeegtstpedkesgqegvdsmaeegtsdsntgsesnsatveepptdpipedekkett411", 
    "-----------------------------------mssfrhpqatparn------fqdmst19srgpkvperplqpymrysrkmwpkvraenpeaqlwdigkmigkywldlpdgekshyqheyt79elekadyekqmkhfqgngisnfmink-grak----nnekmsrsrmdaggv--viqpv-det131ddggnelstrrlagvrfernnrlisdlfspsivtdtrtvvphhrmemlkrqatslgthqst191kleeeltklerahdnrkraiekgsddfqeqlkkavsekpvvdeekfeetvkeweeklatat251yeeykkkqettatqnansapll-rnlvmeetpktpeqekeqnvkekpaestetp------t304------------------aagtepstddgaiettttt--tttenkekpeekmee-----tt338"), 
    com = NA), class = "alignment")
And for fasta.res:structure(list(nb = 2, nam = c("SMARCE1", "swsn-3"), seq = list(
    "mskrpsyappptpapatqmpstpgfvgynpyshlaynnyrlg-gnpgtnsrvtassgitipkppkppdkplmpymrysrkvwdqvkasnpdlklweigkiiggmwrdltdeekqeylneyeaekieynesmkayhnspaylayinaksraeaaleeesrqrqsrmekgepymsiqpaedpddyddgfsmkhtatarfqrnhrliseilsesvvpdvrsvvttarmqvlkrqvqslmvhqrkleaellqieerhqekkrkflestdsfnnelkrlcglkvevdmekiaaeiaqaeeqarkrqeereke-------aaeqaersqssivpeeeqaankgeekkddenipmeteethleettesqqngeegtstpedkesgqegvdsmaeegtsdsntgsesnsatveepptdpipedekke", 
    "-----------------------------------mssfrhpqatparn------fqdmssrgpkvperplqpymrysrkmwpkvraenpeaqlwdigkmigkywldlpdgekshyqheyelekadyekqmkhfqgngisnfmink-grak----nnekmsrsrmdaggv--viqpv-deddggnelstrrlagvrfernnrlisdlfspsivtdtrtvvphhrmemlkrqatslgthqskleeeltklerahdnrkraiekgsddfqeqlkkavsekpvvdeekfeetvkeweeklatayeeykkkqettatqnansapll-rnlvmeetpktpeqekeqnvkekpaestetp------------------------aagtepstddgaiettttt--tttenkekpeekmee-----"), 
    com = NA), class = "alignment")

It has the sequence names and the residues. Not sure if the alignment is read in and trying to figure out how to get to the table I need from here.

Recognized by R Language Collective

Here is my approach to create the dataframe from CLUSTAL output.

library(readr)
library(dplyr)
library(stringr)


# Read each line as string then remove empty lines, skip header
clustal_raw <- read_lines("sample.aln", skip = 1)
clustal_raw <- clustal_raw[clustal_raw != ""]

# The trailing positions are separted from the sequences with tab in the example.
# The delim could also be space. Remove those
clustal_raw <- str_remove(clustal_raw, "(t| )[:digit:]+")

# The remaining part is a fix width structure
# Add id to the symbol lines
str_sub(clustal_raw[str_starts(clustal_raw, "      ")], 1, 3) <- "symbol"

# Identify the position of the last space
last_space <- str_locate(clustal_raw, " +")[1, "end"]

# Read fixed width lines into dataframe
clustal_df <- read_fwf(I(clustal_raw), 
                       fwf_cols(id = c(1, last_space), seq = c(last_space + 1, nchar(clustal_raw[1]))),
                       trim_ws = FALSE) %>%
  mutate(id = str_remove_all(id, " ")) %>%
  # Collapse sequences for each id
  group_by(id) %>%
  summarize(seq = paste0(seq, collapse = ""),
            .groups = "drop")

# Get the squence names
seq_names <- clustal_df$id[1:(nrow(clustal_df) - 1)]

# Convert to vector
clustal_vec <- clustal_df$seq
names(clustal_vec) <- clustal_df$id

# Split vector by charachters, and convert to dataframe
output <- str_split(clustal_vec, pattern = "", simplify = TRUE) %>%
  t() %>%
  as.data.frame() 
names(output) <- names(clustal_vec)

# This function takes a broken down sequence and return the position for 
# each residue. Gaps return NAs.
GetPosition <- function(vec) {
  # mark non-gap positions
  vec_bool <- vec != "-"
  # residue position +1 if for non-gap
  vec_out <- cumsum(vec_bool)
  vec_out[vec_bool == FALSE] <- NA
  return(vec_out)
}

output <- output %>%
  # Get the residue positions for each sequence
  mutate(across(all_of(seq_names), GetPosition, .names = "{.col}_POS")) %>%
  # Rearrange the column orders
  relocate(ends_with("POS")) %>%
  relocate(starts_with(seq_names)) %>%
  # Replace space as NA in the symbol column.
  mutate(symbol = ifelse(symbol == " ",
                         NA,
                         symbol))

slice(output, c(1, 36:39))

The output look like this:

  SMARCE1_POS SMARCE1 swsn-3_protein_POS swsn-3_protein symbol
1           1       M                 NA              -   <NA>
2          36       Y                  1              M   <NA>
3          37       N                  2              S      .
4          38       N                  3              S      .
5          39       Y                  4              F      :

New contributor

Trang chủ Giới thiệu Sinh nhật bé trai Sinh nhật bé gái Tổ chức sự kiện Biểu diễn giải trí Dịch vụ khác Trang trí tiệc cưới Tổ chức khai trương Tư vấn dịch vụ Thư viện ảnh Tin tức - sự kiện Liên hệ Chú hề sinh nhật Trang trí YEAR END PARTY công ty Trang trí tất niên cuối năm Trang trí tất niên xu hướng mới nhất Trang trí sinh nhật bé trai Hải Đăng Trang trí sinh nhật bé Khánh Vân Trang trí sinh nhật Bích Ngân Trang trí sinh nhật bé Thanh Trang Thuê ông già Noel phát quà Biểu diễn xiếc khỉ Xiếc quay đĩa Dịch vụ tổ chức sự kiện 5 sao Thông tin về chúng tôi Dịch vụ sinh nhật bé trai Dịch vụ sinh nhật bé gái Sự kiện trọn gói Các tiết mục giải trí Dịch vụ bổ trợ Tiệc cưới sang trọng Dịch vụ khai trương Tư vấn tổ chức sự kiện Hình ảnh sự kiện Cập nhật tin tức Liên hệ ngay Thuê chú hề chuyên nghiệp Tiệc tất niên cho công ty Trang trí tiệc cuối năm Tiệc tất niên độc đáo Sinh nhật bé Hải Đăng Sinh nhật đáng yêu bé Khánh Vân Sinh nhật sang trọng Bích Ngân Tiệc sinh nhật bé Thanh Trang Dịch vụ ông già Noel Xiếc thú vui nhộn Biểu diễn xiếc quay đĩa Dịch vụ tổ chức tiệc uy tín Khám phá dịch vụ của chúng tôi Tiệc sinh nhật cho bé trai Trang trí tiệc cho bé gái Gói sự kiện chuyên nghiệp Chương trình giải trí hấp dẫn Dịch vụ hỗ trợ sự kiện Trang trí tiệc cưới đẹp Khởi đầu thành công với khai trương Chuyên gia tư vấn sự kiện Xem ảnh các sự kiện đẹp Tin mới về sự kiện Kết nối với đội ngũ chuyên gia Chú hề vui nhộn cho tiệc sinh nhật Ý tưởng tiệc cuối năm Tất niên độc đáo Trang trí tiệc hiện đại Tổ chức sinh nhật cho Hải Đăng Sinh nhật độc quyền Khánh Vân Phong cách tiệc Bích Ngân Trang trí tiệc bé Thanh Trang Thuê dịch vụ ông già Noel chuyên nghiệp Xem xiếc khỉ đặc sắc Xiếc quay đĩa thú vị

Filed under: Kiến thức lập trình - @ 14:51

Thẻ: rsequence-alignment

Thiết kế website giá rẻ

Danh mục

Convert a protein alignment to a table in R