This figure contains data from many different experiments that were used to optimize the protocol for ORBIT.
Setup packages and plotting for the notebook:
# Check packages
source("../tools/package_setup.R")
# Load packages
library(tidyverse)
library(cowplot)
library(kableExtra)
library(ggrastr)
# Code display options
knitr::opts_chunk$set(tidy.opts=list(width.cutoff=60),tidy=FALSE, echo = TRUE, message=FALSE, warning=FALSE, fig.align="center", fig.retina = 2)
# Load plotting tools
source("../tools/plotting_tools.R")
#Modify the plot theme
theme_set(theme_figure())
df_read_nearest_down <- read_csv('../../data/seq_data/left_side_TO_libs/df_read_nearest_left.csv')
df_read_nearest_up <- read_csv('../../data/seq_data/right_side_TO_libs/df_read_nearest_right.csv')
df_read_nearest_up_down <- bind_rows(df_read_nearest_up %>% mutate(up_down = 'up'),df_read_nearest_down %>% mutate(up_down = 'down')) %>%
mutate(off_target = ifelse(min_val >500, T, F)) %>%
mutate(perfect = ifelse(min_val == 0, T, F))
df_read_nearest_up_down
## # A tibble: 536,686 × 15
## seqnames strand cigar qwidth start end width njunc lib read_…¹ gene_id
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 U00096.3 - 79M2S 81 2978946 2.98e6 79 0 0 2979024 lysR
## 2 U00096.3 - 6S47M 53 2095079 2.10e6 47 0 0 2095125 hisA
## 3 U00096.3 + 81M 81 4277913 4.28e6 81 0 0 4277913 soxR
## 4 U00096.3 - 76M 76 3535810 3.54e6 76 0 0 3535885 ompR
## 5 U00096.3 + 2S78M 80 317133 3.17e5 78 0 0 317133 ykgA
## 6 U00096.3 - 51M 51 3059708 3.06e6 51 0 0 3059758 argP
## 7 U00096.3 - 2S30M 32 2095096 2.10e6 30 0 0 2095125 hisA
## 8 U00096.3 - 81M 81 3535805 3.54e6 81 0 0 3535885 ompR
## 9 U00096.3 + 75M 75 4099474 4.10e6 75 0 0 4099474 rhaR
## 10 U00096.3 + 34M2S 36 417811 4.18e5 34 0 0 417811 phoB
## # … with 536,676 more rows, 4 more variables: min_val <dbl>, up_down <chr>,
## # off_target <lgl>, perfect <lgl>, and abbreviated variable name
## # ¹read_start_pos
df_read_nearest_up_down_org <- df_read_nearest_up_down %>%
arrange(lib,gene_id, up_down, perfect, width ) %>%
group_by(lib,gene_id, up_down, perfect) %>%
mutate(read_num = row_number())
df_read_nearest_up_down_org
## # A tibble: 536,686 × 16
## # Groups: lib, gene_id, up_down, perfect [1,485]
## seqnames strand cigar qwidth start end width njunc lib read_s…¹ gene_id
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 U00096.3 - 22M 22 568000 568021 22 0 0 568021 acrR
## 2 U00096.3 - 22M 22 568000 568021 22 0 0 568021 acrR
## 3 U00096.3 - 22M 22 568000 568021 22 0 0 568021 acrR
## 4 U00096.3 - 22M 22 568000 568021 22 0 0 568021 acrR
## 5 U00096.3 - 22M 22 568000 568021 22 0 0 568021 acrR
## 6 U00096.3 - 22M 22 568000 568021 22 0 0 568021 acrR
## 7 U00096.3 - 22M 22 568000 568021 22 0 0 568021 acrR
## 8 U00096.3 - 22M 22 568000 568021 22 0 0 568021 acrR
## 9 U00096.3 - 22M 22 568000 568021 22 0 0 568021 acrR
## 10 U00096.3 - 23M 23 508201 508223 23 0 0 508223 acrR
## # … with 536,676 more rows, 5 more variables: min_val <dbl>, up_down <chr>,
## # off_target <lgl>, perfect <lgl>, read_num <int>, and abbreviated variable
## # name ¹read_start_pos
## # A tibble: 420 × 7
## lib gene left_oligo_pos right_oligo_pos oligo product go_term
## <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 1 alpA 2758649 2758836 CTCTCTGCAACCAAAGT… CP4-57… DNA bi…
## 2 1 argR 3384708 3385153 GCACAATAATGTTGTAT… DNA-bi… DNA bi…
## 3 1 ariR 1216374 1216615 AAACTTATACTTAATAA… putati… protei…
## 4 1 arsR 3648533 3648861 TCGAAGAGAGACACTAC… DNA-bi… transc…
## 5 1 asnC 3926565 3926998 TAAAATAGAATGAATCA… DNA-bi… protei…
## 6 1 bolA 454477 454769 GCGCCGACAGTTGCAAA… DNA-bi… protei…
## 7 1 caiF 34305 34675 ACGCCCGGTATCAACGA… DNA-bi… DNA-bi…
## 8 1 cedA 1813441 1813658 GCAAAACCGGGAGGTAT… cell d… DNA bi…
## 9 1 cspA 3720054 3720241 TCGCCGAAAGGCACACT… cold s… nuclei…
## 10 1 cspC 1907246 1907430 CACACTTCAGATCAGTG… stress… nuclei…
## # … with 410 more rows
df_genes <- read_delim("../../data/seq_data/All_instances_of_Genes_in_Escherichia_coli_K-12_substr._MG1655.txt") %>%
mutate(left_pos = `Left-End-Position`, right_pos = `Right-End-Position`) %>%
select(-`Left-End-Position`, -`Right-End-Position`) %>%
mutate(fwd = ifelse(Direction =='+', T, F))
df_genes_lib <- left_join(df_TOs %>% select(lib, 'gene_id'='gene'), df_genes, by = c('gene_id'='Genes'))
df_genes_lib
## # A tibble: 420 × 7
## lib gene_id Direction Product left_…¹ right…² fwd
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <lgl>
## 1 1 alpA + CP4-57 prophage; DNA-binding t… 2758644 2758856 TRUE
## 2 1 argR + DNA-binding transcriptional du… 3384703 3385173 TRUE
## 3 1 ariR + putative two-component system … 1216369 1216635 TRUE
## 4 1 arsR + DNA-binding transcriptional re… 3648528 3648881 TRUE
## 5 1 asnC - DNA-binding transcriptional du… 3926545 3927003 FALSE
## 6 1 bolA + DNA-binding transcriptional du… 454472 454789 TRUE
## 7 1 caiF + DNA-binding transcriptional ac… 34300 34695 TRUE
## 8 1 cedA - cell division modulator 1813421 1813663 FALSE
## 9 1 cspA + cold shock protein CspA 3720049 3720261 TRUE
## 10 1 cspC - stress protein, member of the … 1907226 1907435 FALSE
## # … with 410 more rows, and abbreviated variable names ¹left_pos, ²right_pos
df_poly <- df_genes_lib %>%
#filter(right_pos<20000) %>%
pivot_longer(c(left_pos, right_pos), values_to = 'x_pos', names_to = 'left_right') %>%
mutate( y_pos_bottom = 10,y_pos_top = 100, y_pos_center=31.623) %>%
pivot_longer(c(y_pos_bottom,y_pos_top, y_pos_center), values_to = 'y_pos', names_to = 'top_bottom') %>%
mutate(x_pos = ifelse(left_right == 'left_pos' & Direction == '-' & top_bottom !='y_pos_center', x_pos + 50, x_pos)) %>%
mutate(x_pos = ifelse(left_right == 'right_pos' & Direction == '+' & top_bottom !='y_pos_center', x_pos - 50, x_pos)) %>%
mutate(point_num = case_when(left_right == 'left_pos' & top_bottom == 'y_pos_bottom' ~ 1,
left_right == 'left_pos' & top_bottom == 'y_pos_center' ~ 2,
left_right == 'left_pos' & top_bottom == 'y_pos_top' ~ 3,
left_right == 'right_pos' & top_bottom == 'y_pos_top' ~ 4,
left_right == 'right_pos' & top_bottom == 'y_pos_center' ~ 5,
left_right == 'right_pos' & top_bottom == 'y_pos_bottom' ~ 6)) %>%
arrange(gene_id, point_num)
df_poly
## # A tibble: 2,520 × 10
## lib gene_id Direc…¹ Product fwd left_…² x_pos top_b…³ y_pos point…⁴
## <dbl> <chr> <chr> <chr> <lgl> <chr> <dbl> <chr> <dbl> <dbl>
## 1 3 3'ETS-<i>le… - small … FALSE left_p… 1.99e6 y_pos_… 10 1
## 2 3 3'ETS-<i>le… - small … FALSE left_p… 1.99e6 y_pos_… 31.6 2
## 3 3 3'ETS-<i>le… - small … FALSE left_p… 1.99e6 y_pos_… 100 3
## 4 3 3'ETS-<i>le… - small … FALSE right_… 1.99e6 y_pos_… 100 4
## 5 3 3'ETS-<i>le… - small … FALSE right_… 1.99e6 y_pos_… 31.6 5
## 6 3 3'ETS-<i>le… - small … FALSE right_… 1.99e6 y_pos_… 10 6
## 7 3 C0293 + small … TRUE left_p… 1.20e6 y_pos_… 10 1
## 8 3 C0293 + small … TRUE left_p… 1.20e6 y_pos_… 31.6 2
## 9 3 C0293 + small … TRUE left_p… 1.20e6 y_pos_… 100 3
## 10 3 C0293 + small … TRUE right_… 1.20e6 y_pos_… 100 4
## # … with 2,510 more rows, and abbreviated variable names ¹Direction,
## # ²left_right, ³top_bottom, ⁴point_num
kb_label <- function(x){
# from s to ns
lab <- paste(x / 1000)
}
plot_all <- ggplot()+
geom_rect(data = df_TOs %>% select(lib, gene_id = gene, left_oligo_pos, right_oligo_pos) %>% filter( lib == 1),
aes(xmin = left_oligo_pos, xmax = right_oligo_pos, ymin = 0, ymax = 50), fill = 'light gray')+
geom_segment(data = df_read_nearest_down %>% filter(lib==1) %>% filter(min_val <500),
aes(x = start, xend = end, y = width, yend = width ),
#position = position_jitter(height = 75),
alpha = 0.01)+
geom_segment(data = df_read_nearest_up%>% filter(lib==1) %>% filter(min_val <500),
aes(x = start, xend = end, y = width, yend = width ),
#position = position_jitter(height = 75),
alpha = 0.01)+
scale_x_continuous(labels = scales::label_comma())+
scale_y_continuous(breaks = c(0,40,80), limits = c(0,100))+
labs(x = "Genomic position (bp)", y = 'Mapped read lengths (bp)')+
guides(fill = 'none')+
facet_wrap(~gene_id, scales = 'free', ncol = 8,strip.position = 'top')
plot_all <- ggplot()+
geom_polygon(data = df_poly %>% filter(gene_id %in% c('alpA','argR','arsR')), aes(x = x_pos, y = y_pos, group = gene_id),fill = 'light gray')+
geom_segment(data = df_read_nearest_up_down_org%>% filter(gene_id %in% c('alpA','argR','arsR')) %>% filter(min_val <500),
aes(x = start, xend = end, y = read_num, yend = read_num, color = perfect), alpha = 0.25)+
labs(x = "Genomic position (Kb)", y = 'Mapped read lengths (bp)')+
scale_x_continuous(labels = kb_label)+
scale_y_log10()+
guides(fill = 'none')+
facet_wrap(~gene_id, scales = 'free_x', ncol = 8,strip.position = 'top')
plot_lib_0 <- ggplot()+
geom_polygon(data = df_poly %>% filter(lib == 0), aes(x = x_pos, y = y_pos, group = gene_id),fill = 'light gray')+
rasterise(geom_segment(data = df_read_nearest_up_down_org%>% filter(lib == 0) %>% filter(min_val <500),
aes(x = start, xend = end, y = read_num, yend = read_num, color = perfect), alpha = 0.25, linewidth = 0.5), dpi = 2000, dev = "cairo")+
labs(x = "Genomic position (Kb)", y = 'Read_count')+
scale_x_continuous(labels = kb_label,breaks = scales::extended_breaks(n=3))+
scale_y_log10(limits =c(1, NA))+
guides(fill = 'none')+
facet_wrap(~gene_id, scales = 'free', ncol = 8,strip.position = 'top') + guides(color = 'none')
plot_lib_0
plot_lib_1 <- ggplot()+
geom_polygon(data = df_poly %>% filter(lib == 1), aes(x = x_pos, y = y_pos, group = gene_id),fill = 'light gray')+
rasterise(geom_segment(data = df_read_nearest_up_down_org%>% filter(lib == 1) %>% filter(min_val <500),
aes(x = start, xend = end, y = read_num, yend = read_num, color = perfect), alpha = 0.25, linewidth = 0.5), dpi = 2000, dev = "cairo")+
labs(x = "Genomic position (Kb)", y = 'Read count')+
scale_x_continuous(labels = kb_label,breaks = scales::extended_breaks(n=3))+
scale_y_log10(limits =c(1, NA))+
guides(fill = 'none')+
facet_wrap(~gene_id, scales = 'free', ncol = 8,strip.position = 'top') + guides(color = 'none')
plot_lib_1
Note this plot is so large, that the Rmd notebook seems to have trouble displaying it, however the pdf output works fine.
plot_lib_2 <- ggplot()+
geom_polygon(data = df_poly %>% filter(lib == 2), aes(x = x_pos, y = y_pos, group = gene_id),fill = 'light gray')+
rasterise(
geom_segment(data = df_read_nearest_up_down_org%>% filter(lib == 2) %>% filter(min_val <500),
aes(x = start, xend = end, y = read_num, yend = read_num, color = perfect), alpha = 0.25, linewidth = 0.5),
dpi = 2000, dev = "cairo")+
labs(x = "Genomic position (Kb)", y = 'Read_count')+
scale_x_continuous(labels = kb_label,breaks = scales::extended_breaks(n=3))+
scale_y_log10(limits =c(1, NA))+
guides(fill = 'none')+
facet_wrap(~gene_id, scales = 'free', ncol = 8,strip.position = 'top') + guides(color = 'none')
#plot_lib_2
#save_plot("../../figures/r_pdf_figs/supp_figs/plot_lib_2.pdf",plot_lib_2, base_height = 20, base_width = 8)
plot_lib_3 <- ggplot()+
geom_polygon(data = df_poly %>% filter(lib == 3), aes(x = x_pos, y = y_pos, group = gene_id),fill = 'light gray')+
rasterise(geom_segment(data = df_read_nearest_up_down_org%>% filter(lib == 3) %>% filter(min_val <500),
aes(x = start, xend = end, y = read_num, yend = read_num, color = perfect), alpha = 0.25, linewidth = 0.5), dpi = 2000, dev = "cairo")+
labs(x = "Genomic position (Kb)", y = 'Read_count')+
scale_x_continuous(labels = kb_label,breaks = scales::extended_breaks(n=3))+
scale_y_log10(limits =c(1, NA))+
guides(fill = 'none')+
facet_wrap(~gene_id, scales = 'free', ncol = 8,strip.position = 'top') + guides(color = 'none')
plot_lib_3
#save_plot("../../figures/r_pdf_figs/supp_figs/plot_lib_3.pdf",plot_lib_3, base_height = 10, base_width = 8)
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggrastr_1.0.1 kableExtra_1.3.4 cowplot_1.1.1 viridis_0.6.2
## [5] viridisLite_0.4.1 knitr_1.41 forcats_0.5.2 stringr_1.5.0
## [9] dplyr_1.1.0 purrr_0.3.5 readr_2.1.3 tidyr_1.2.1
## [13] tibble_3.1.8 ggplot2_3.4.0 tidyverse_1.3.2
##
## loaded via a namespace (and not attached):
## [1] httr_1.4.4 sass_0.4.4 bit64_4.0.5
## [4] vroom_1.6.0 jsonlite_1.8.3 modelr_0.1.10
## [7] bslib_0.4.1 assertthat_0.2.1 highr_0.9
## [10] vipor_0.4.5 googlesheets4_1.0.1 cellranger_1.1.0
## [13] yaml_2.3.6 pillar_1.8.1 backports_1.4.1
## [16] glue_1.6.2 digest_0.6.30 rvest_1.0.3
## [19] colorspace_2.0-3 htmltools_0.5.4 pkgconfig_2.0.3
## [22] broom_1.0.1 haven_2.5.1 scales_1.2.1
## [25] webshot_0.5.4 svglite_2.1.0 tzdb_0.3.0
## [28] timechange_0.1.1 googledrive_2.0.0 farver_2.1.1
## [31] generics_0.1.3 ellipsis_0.3.2 cachem_1.0.6
## [34] withr_2.5.0 cli_3.4.1 magrittr_2.0.3
## [37] crayon_1.5.2 readxl_1.4.1 evaluate_0.18
## [40] fs_1.5.2 fansi_1.0.3 xml2_1.3.3
## [43] beeswarm_0.4.0 Cairo_1.6-0 tools_4.2.0
## [46] hms_1.1.2 gargle_1.2.1 lifecycle_1.0.3
## [49] munsell_0.5.0 reprex_2.0.2 compiler_4.2.0
## [52] jquerylib_0.1.4 systemfonts_1.0.4 rlang_1.1.1
## [55] grid_4.2.0 rstudioapi_0.14 labeling_0.4.2
## [58] rmarkdown_2.18 gtable_0.3.1 DBI_1.1.3
## [61] R6_2.5.1 gridExtra_2.3 lubridate_1.9.0
## [64] fastmap_1.1.0 bit_4.0.5 utf8_1.2.2
## [67] stringi_1.7.8 ggbeeswarm_0.7.1 parallel_4.2.0
## [70] vctrs_0.5.2 dbplyr_2.2.1 tidyselect_1.2.0
## [73] xfun_0.35