I have a simple line plot where I show a DNA sequence on the x-axis, done in the following way with ggplot2:
# DNA sequence to display; each base becomes one x-axis tick label.
myseq <- "AGAATATTATACATTCATCT"
seqsplit <- strsplit(myseq, "")[[1]]

# Reproducible noisy signal observed at time points 1..100.
set.seed(123)
mydata <- data.frame(time = 1:100, value = rnorm(100, mean = 10, sd = 2))

# Twenty evenly spaced tick positions, one per base of the sequence.
indices <- seq(5, 100, length.out = 20)
ind_df <- data.frame(call = seqsplit, time = indices)
final_df <- dplyr::left_join(mydata, ind_df, by = "time")

# One label colour per base: A green, C blue, G black, anything else red.
base_colors <- c(A = "green", C = "blue", G = "black")
xcolors <- unname(base_colors[seqsplit])
xcolors[is.na(xcolors)] <- "red"

# Bold x-axis tick labels, coloured base by base.
x_axis_text <- ggtext::element_markdown(face = "bold", color = xcolors)

P <- ggplot2::ggplot(final_df, ggplot2::aes(x = time, y = value)) +
  ggplot2::geom_line(linewidth = 0.5) +
  ggplot2::scale_x_continuous(breaks = indices, labels = seqsplit) +
  ggplot2::scale_y_continuous(limits = c(5, 17)) +
  ggplot2::theme_light() +
  ggplot2::theme(
    axis.title.x = ggplot2::element_blank(),
    axis.text.x = x_axis_text
  )

# Render the plot into a wide, short PDF page.
grDevices::pdf(file = "test.pdf", height = 3, width = 10)
print(P)
grDevices::dev.off()
which produces:
Now I have an associated amino acid sequence of length 4, for which I know the start and end positions in the DNA sequence. Each letter of the amino acid sequence corresponds to 3 letters in the DNA sequence.
# Amino acid sequence to draw alongside the DNA sequence
# (each letter corresponds to 3 letters of the DNA sequence).
aaseq <- "WXYZ"
# Positions in the DNA sequence where the amino acid sequence starts and ends.
start <- 5
end <- 17
Here the amino acid sequence WXYZ starts on T-5 and ends on C-17 of the DNA sequence above, and I want to plot them together.
This would be my ultimate goal (it could be just squares instead of “arrows”):
Is there an easy way to accomplish this in ggplot2?
An easy option without the arrows would be to use geom_segment:
# Layer that draws one thick horizontal bar per row of df_arrows,
# running from x to xend at a fixed height of y = 16.
geom_segment(
data = df_arrows,
# I(color) asks ggplot2 to use the values of the color column as-is
# instead of mapping them through a colour scale.
aes(x = x, xend = xend, y = 16, yend = 16, color = I(color)),
# A large linewidth makes each segment read as a filled block.
linewidth = 8
)
But if you want the arrows, then I would suggest using geom_polygon, which, however, requires some effort to create a data frame with the coordinates for the polygon:
library(ggplot2)
library(dplyr)

aaseq <- "WXYZ"
start <- 5
end <- 17

# Tick numbers at which each amino-acid block begins and ends (3 bases apart).
block_from <- seq(start, end - 3, 3)
block_to <- seq(start + 3, end, 3)

# One row per amino acid: horizontal extent, height, fill colour and letter.
df_arrows <- data.frame(
  x = indices[block_from],
  xend = indices[block_to],
  y = 16, yend = 16,
  color = c("blue", "green", "orange", "purple"),
  label = unlist(strsplit(aaseq, ""))
)

# Five vertices of a right-pointing arrow for a single amino acid:
# top-left, top-right, tip (4 x-units past xend), bottom-right, bottom-left.
arrow_outline <- function(x, xend, y, color, label) {
  data.frame(
    x = c(x, xend, xend + 4, xend, x),
    y = c(y + .5, y + .5, y, y - .5, y - .5),
    color = color,
    label = label
  )
}

# Expand every amino acid into its polygon vertices. The factor levels are
# reversed so that earlier letters get the higher group ids.
df_polygon <- df_arrows |>
  dplyr::mutate(label = factor(label, levels = rev(unique(label)))) |>
  dplyr::reframe(arrow_outline(x, xend, y, color, label), .by = label)

# Grey band behind the stretch of DNA covered by the amino acid sequence.
region_shade <- annotate(
  "rect",
  xmin = indices[start], xmax = indices[end],
  ymin = -Inf, ymax = Inf,
  fill = "grey", alpha = .4
)

# Coloured arrows, filled with the literal colours stored in df_polygon.
aa_blocks <- geom_polygon(
  data = df_polygon,
  aes(x = x, y = y, fill = I(color), group = label)
)

# White amino-acid letter placed near the middle of each arrow.
aa_text <- geom_text(
  data = df_arrows,
  aes(x = (x + xend) / 2 + 1, y = y, label = label),
  color = "white", fontface = "bold"
)

# Layers are added in drawing order: band, arrows, letters, then the signal.
ggplot(final_df, aes(x = time, y = value)) +
  region_shade +
  aa_blocks +
  aa_text +
  geom_line(linewidth = 0.5) +
  scale_x_continuous(breaks = indices, labels = seqsplit) +
  scale_y_continuous(limits = c(5, 17)) +
  theme_light() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = ggtext::element_markdown(face = "bold", color = xcolors)
  )
4
Based on @stefan's answer, I figured I still wanted the arrow direction, but the polygon got so complicated with my real data…
This would go into a Shiny app that takes in different data as input, and the y values can vary a lot, so defining the polygon could become a mess.
I played around with the arrow argument of geom_segment, but I did not get what I wanted. However, simply plotting geom_point at xend with the diamond shape and a size equal to the linewidth in geom_segment does the perfect trick in a very simple way.
# Example data: a DNA sequence shown as x-axis tick labels under a noisy signal.
myseq <- "AGAATATTATACATTCATCT"
set.seed(123)
mydata <- data.frame(time = 1:100, value = rnorm(100, mean = 10, sd = 2))
indexes <- seq(5, 100, length.out = 20)
seqsplit <- unlist(strsplit(myseq, ""))
ind_df <- data.frame(call = seqsplit, time = indexes)
final_df <- dplyr::left_join(mydata, ind_df, by = "time")

# One tick-label colour per base: A green, C blue, G black, anything else red.
base_colors <- c(A = "green", C = "blue", G = "black")
xcolors <- unname(base_colors[seqsplit])
xcolors[is.na(xcolors)] <- "red"

# aa sequence
aaseq <- "WXYZ"
start <- 5
end <- 17
aa_labels <- strsplit(aaseq, "")[[1]]

# First base (tick number) of every 3-base block between start and end.
codon_starts <- seq(start, end - 3, by = 3)
stopifnot(
  "aaseq must have one letter per 3-base block between start and end" =
    length(aa_labels) == length(codon_starts)
)

# Half the distance between neighbouring ticks (2.5 for this example), used to
# shift block boundaries so they fall between two letters instead of on one.
half_step <- (indexes[2] - indexes[1]) / 2
# Height at which the amino-acid blocks are drawn.
arrow_y <- 16
# Fill colours, recycled when there are more amino acids than colours.
aa_palette <- c("blue", "green", "orange", "purple")

df_arrows <- data.frame(
  x = indexes[codon_starts] - half_step,
  xend = indexes[codon_starts + 3] - half_step,
  y = arrow_y, yend = arrow_y,
  color = rep_len(aa_palette, length(aa_labels)),
  label = aa_labels
)

##
P <- ggplot2::ggplot(final_df, ggplot2::aes(x = time, y = value)) +
  # Grey band behind the stretch of DNA covered by the amino acid sequence.
  ggplot2::annotate(
    "rect",
    xmin = indexes[start] - half_step, xmax = indexes[end] - half_step,
    ymin = -Inf, ymax = Inf, fill = "grey", alpha = 0.25
  ) +
  # Dashed separators at the start of every block except the first.
  ggplot2::geom_vline(
    data = df_arrows[-1, ], ggplot2::aes(xintercept = x),
    color = "grey", linetype = 2, linewidth = 0.5
  ) +
  ggplot2::geom_line(linewidth = 0.5) +
  ggplot2::scale_x_continuous(breaks = indexes, labels = seqsplit) +
  ggplot2::scale_y_continuous(limits = c(5, 17)) +
  # Thick segment = body of the block; a diamond of the same size at xend
  # gives it a pointed end, indicating direction without building polygons.
  ggplot2::geom_segment(
    data = df_arrows,
    ggplot2::aes(x = x, xend = xend, y = y, yend = yend, color = I(color)),
    linewidth = 8
  ) +
  ggplot2::geom_point(
    data = df_arrows,
    ggplot2::aes(x = xend, y = y, color = I(color)),
    shape = 18, size = 8
  ) +
  ggplot2::geom_text(
    data = df_arrows,
    ggplot2::aes(x = (x + xend) / 2 + 1, y = y, label = label),
    color = "white", fontface = "bold"
  ) +
  ggplot2::theme_light() +
  ggplot2::theme(
    axis.title.x = ggplot2::element_blank(),
    axis.text.x = ggtext::element_markdown(face = "bold", color = xcolors)
  )

# Render the plot into a wide, short PDF page.
grDevices::pdf(file = "test.pdf", height = 3, width = 10)
print(P)
grDevices::dev.off()
which produces: