This figure contains data from two ORBIT experiments where mutant libraries were created with mixtures of targeting oligos. Data figures were made in R notebooks and exported as pdfs. Cosmetic improvements were made in Adobe Illustrator. Note that diagrams for Figures 7A, & 7C were made in Adobe Illustrator.
Setup packages and plotting for the notebook:
# Check packages
source("../tools/package_setup.R")
# Load packages
library(tidyverse)
library(cowplot)
library(kableExtra)
library(ggrastr)
# Code display options
knitr::opts_chunk$set(tidy.opts=list(width.cutoff=60),tidy=FALSE, echo = TRUE, message=FALSE, warning=FALSE, fig.align="center", fig.retina = 2)
# Load plotting tools
source("../tools/plotting_tools.R")
#Modify the plot theme
theme_set(theme_notebook())
Read in the data for the starcode filtered barcode counts and filter
for at least 4 counts. See
code/seq_data_processing/galK_BC_16N/galK_BC_16N_processing_analysis.Rmd
for how we got from raw sequencing data to this file.
df_starcode <- read_delim("../seq_data_processing/galK_BC_16N/galKBC16N_unique_BC_starcode_counts_d3s.txt",col_names = c('seq','n'))
df_starcode
## # A tibble: 43,419 × 2
## seq n
## <chr> <dbl>
## 1 TTTTTTTTTTTTTTTT 26953
## 2 AAAAAAAAAAAAAAAA 12219
## 3 GGGGGGGGGAGGGTGT 6872
## 4 CCCCCTTCCCCCTCCC 2939
## 5 GGGGGGGTGATTGGGT 1032
## 6 TTGTTTGTTGGTTTTT 986
## 7 GGTGAGGGGGGGGTGG 904
## 8 GGGGGGGAGGGGGTAG 801
## 9 TGGTTGTTTTGTTTGT 800
## 10 TTTGGTTCGTTTTTTT 797
## # … with 43,409 more rows
Now, let’s create the plot for all barcodes that have at least 4 counts.
## # A tibble: 29,973 × 2
## seq n
## <chr> <dbl>
## 1 TTTTTTTTTTTTTTTT 26953
## 2 AAAAAAAAAAAAAAAA 12219
## 3 GGGGGGGGGAGGGTGT 6872
## 4 CCCCCTTCCCCCTCCC 2939
## 5 GGGGGGGTGATTGGGT 1032
## 6 TTGTTTGTTGGTTTTT 986
## 7 GGTGAGGGGGGGGTGG 904
## 8 GGGGGGGAGGGGGTAG 801
## 9 TGGTTGTTTTGTTTGT 800
## 10 TTTGGTTCGTTTTTTT 797
## # … with 29,963 more rows
To create the figure for the test loci, we will read in all right and left side reads and the TO coordinates.
df_TOs <- read_csv("../../data/seq_data/left_side_TO_libs/df_TOs.csv")
df_read_nearest_down <- read_csv('../../data/seq_data/left_side_TO_libs/df_read_nearest_left.csv')
df_read_nearest_up <- read_csv('../../data/seq_data/right_side_TO_libs/df_read_nearest_right.csv')
df_read_nearest_up_down <- bind_rows(df_read_nearest_up %>% mutate(up_down = 'up'),df_read_nearest_down %>% mutate(up_down = 'down')) %>%
filter(min_val <= 500) %>% #remove all off target reads more than 500 bp away from TO (considered off target)
mutate(perfect = ifelse(min_val == 0, T, F))
df_TOs
## # A tibble: 420 × 7
## lib gene left_oligo_pos right_oligo_pos oligo product go_term
## <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 1 alpA 2758649 2758836 CTCTCTGCAACCAAAGT… CP4-57… DNA bi…
## 2 1 argR 3384708 3385153 GCACAATAATGTTGTAT… DNA-bi… DNA bi…
## 3 1 ariR 1216374 1216615 AAACTTATACTTAATAA… putati… protei…
## 4 1 arsR 3648533 3648861 TCGAAGAGAGACACTAC… DNA-bi… transc…
## 5 1 asnC 3926565 3926998 TAAAATAGAATGAATCA… DNA-bi… protei…
## 6 1 bolA 454477 454769 GCGCCGACAGTTGCAAA… DNA-bi… protei…
## 7 1 caiF 34305 34675 ACGCCCGGTATCAACGA… DNA-bi… DNA-bi…
## 8 1 cedA 1813441 1813658 GCAAAACCGGGAGGTAT… cell d… DNA bi…
## 9 1 cspA 3720054 3720241 TCGCCGAAAGGCACACT… cold s… nuclei…
## 10 1 cspC 1907246 1907430 CACACTTCAGATCAGTG… stress… nuclei…
## # … with 410 more rows
## # A tibble: 522,522 × 14
## seqnames strand cigar qwidth start end width njunc lib read_…¹ gene_id
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 U00096.3 - 79M2S 81 2978946 2.98e6 79 0 0 2979024 lysR
## 2 U00096.3 - 6S47M 53 2095079 2.10e6 47 0 0 2095125 hisA
## 3 U00096.3 + 81M 81 4277913 4.28e6 81 0 0 4277913 soxR
## 4 U00096.3 - 76M 76 3535810 3.54e6 76 0 0 3535885 ompR
## 5 U00096.3 + 2S78M 80 317133 3.17e5 78 0 0 317133 ykgA
## 6 U00096.3 - 51M 51 3059708 3.06e6 51 0 0 3059758 argP
## 7 U00096.3 - 2S30M 32 2095096 2.10e6 30 0 0 2095125 hisA
## 8 U00096.3 - 81M 81 3535805 3.54e6 81 0 0 3535885 ompR
## 9 U00096.3 + 75M 75 4099474 4.10e6 75 0 0 4099474 rhaR
## 10 U00096.3 + 34M2S 36 417811 4.18e5 34 0 0 417811 phoB
## # … with 522,512 more rows, 3 more variables: min_val <dbl>, up_down <chr>,
## # perfect <lgl>, and abbreviated variable name ¹read_start_pos
To plot gene diagrams we will load in the coordinates for all E. coli genes and add the gene coordaintes for each TO.
df_genes <- read_delim("../../../ecoli_ORBIT_paper_final/data/seq_data/All_instances_of_Genes_in_Escherichia_coli_K-12_substr._MG1655.txt") %>%
mutate(left_pos = `Left-End-Position`, right_pos = `Right-End-Position`) %>%
select(-`Left-End-Position`, -`Right-End-Position`) %>%
mutate(fwd = ifelse(Direction =='+', T, F))
df_genes_lib <- left_join(df_TOs %>% select(lib, 'gene_id'='gene'), df_genes, by = c('gene_id'='Genes'))
df_genes_lib
## # A tibble: 420 × 7
## lib gene_id Direction Product left_…¹ right…² fwd
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <lgl>
## 1 1 alpA + CP4-57 prophage; DNA-binding t… 2758644 2758856 TRUE
## 2 1 argR + DNA-binding transcriptional du… 3384703 3385173 TRUE
## 3 1 ariR + putative two-component system … 1216369 1216635 TRUE
## 4 1 arsR + DNA-binding transcriptional re… 3648528 3648881 TRUE
## 5 1 asnC - DNA-binding transcriptional du… 3926545 3927003 FALSE
## 6 1 bolA + DNA-binding transcriptional du… 454472 454789 TRUE
## 7 1 caiF + DNA-binding transcriptional ac… 34300 34695 TRUE
## 8 1 cedA - cell division modulator 1813421 1813663 FALSE
## 9 1 cspA + cold shock protein CspA 3720049 3720261 TRUE
## 10 1 cspC - stress protein, member of the … 1907226 1907435 FALSE
## # … with 410 more rows, and abbreviated variable names ¹left_pos, ²right_pos
Then we will manually define the 6 points on the gene arrow (for plotting purposes only).
df_poly <- df_genes_lib %>%
#filter(right_pos<20000) %>%
pivot_longer(c(left_pos, right_pos), values_to = 'x_pos', names_to = 'left_right') %>%
mutate( y_pos_bottom = 10,y_pos_top = 100, y_pos_center=31.623) %>%
pivot_longer(c(y_pos_bottom,y_pos_top, y_pos_center), values_to = 'y_pos', names_to = 'top_bottom') %>%
mutate(x_pos = ifelse(left_right == 'left_pos' & Direction == '-' & top_bottom !='y_pos_center', x_pos + 50, x_pos)) %>%
mutate(x_pos = ifelse(left_right == 'right_pos' & Direction == '+' & top_bottom !='y_pos_center', x_pos - 50, x_pos)) %>%
mutate(point_num = case_when(left_right == 'left_pos' & top_bottom == 'y_pos_bottom' ~ 1,
left_right == 'left_pos' & top_bottom == 'y_pos_center' ~ 2,
left_right == 'left_pos' & top_bottom == 'y_pos_top' ~ 3,
left_right == 'right_pos' & top_bottom == 'y_pos_top' ~ 4,
left_right == 'right_pos' & top_bottom == 'y_pos_center' ~ 5,
left_right == 'right_pos' & top_bottom == 'y_pos_bottom' ~ 6)) %>%
arrange(gene_id, point_num)
df_poly
## # A tibble: 2,520 × 10
## lib gene_id Direc…¹ Product fwd left_…² x_pos top_b…³ y_pos point…⁴
## <dbl> <chr> <chr> <chr> <lgl> <chr> <dbl> <chr> <dbl> <dbl>
## 1 3 3'ETS-<i>le… - small … FALSE left_p… 1.99e6 y_pos_… 10 1
## 2 3 3'ETS-<i>le… - small … FALSE left_p… 1.99e6 y_pos_… 31.6 2
## 3 3 3'ETS-<i>le… - small … FALSE left_p… 1.99e6 y_pos_… 100 3
## 4 3 3'ETS-<i>le… - small … FALSE right_… 1.99e6 y_pos_… 100 4
## 5 3 3'ETS-<i>le… - small … FALSE right_… 1.99e6 y_pos_… 31.6 5
## 6 3 3'ETS-<i>le… - small … FALSE right_… 1.99e6 y_pos_… 10 6
## 7 3 C0293 + small … TRUE left_p… 1.20e6 y_pos_… 10 1
## 8 3 C0293 + small … TRUE left_p… 1.20e6 y_pos_… 31.6 2
## 9 3 C0293 + small … TRUE left_p… 1.20e6 y_pos_… 100 3
## 10 3 C0293 + small … TRUE right_… 1.20e6 y_pos_… 100 4
## # … with 2,510 more rows, and abbreviated variable names ¹Direction,
## # ²left_right, ³top_bottom, ⁴point_num
Finally we will arrange the reads so they plot nicely.
df_read_nearest_up_down_org <- df_read_nearest_up_down %>%
arrange(lib, gene_id, up_down, perfect, width ) %>%
group_by(lib, gene_id, up_down, perfect) %>%
mutate(read_num = row_number())
df_read_nearest_up_down_org
## # A tibble: 522,522 × 15
## # Groups: lib, gene_id, up_down, perfect [1,322]
## seqnames strand cigar qwidth start end width njunc lib read_…¹ gene_id
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 U00096.3 - 7S27M 34 485734 485760 27 0 0 485760 acrR
## 2 U00096.3 - 33M34S 67 485693 485725 33 0 0 485725 acrR
## 3 U00096.3 - 27S46… 75 485719 485764 46 0 0 485764 acrR
## 4 U00096.3 - 1S49M 50 485712 485760 49 0 0 485760 acrR
## 5 U00096.3 - 58M 58 485703 485760 58 0 0 485760 acrR
## 6 U00096.3 - 5S65M 70 485696 485760 65 0 0 485760 acrR
## 7 U00096.3 - 1S67M 68 485694 485760 67 0 0 485760 acrR
## 8 U00096.3 - 68M 68 485693 485760 68 0 0 485760 acrR
## 9 U00096.3 - 70M 70 485691 485760 70 0 0 485760 acrR
## 10 U00096.3 - 23M 23 485744 485766 23 0 0 485766 acrR
## # … with 522,512 more rows, 4 more variables: min_val <dbl>, up_down <chr>,
## # perfect <lgl>, read_num <int>, and abbreviated variable name
## # ¹read_start_pos
Putting it all together for our 4 test genes, we will rasterize the read layer of the plot, so it doesn’t overwhelm adobe illustrator.
kb_label <- function(x){
# from s to ns
lab <- paste(x / 1000)
}
gene_list <- c('galK', 'hisA','metA', 'leuD')
plot_gene_examples <- ggplot()+
geom_polygon(data = df_poly %>% filter(lib==0 & gene_id %in% gene_list)%>% mutate(gene_id = factor(gene_id, levels = gene_list)), aes(x = x_pos, y = y_pos, group = gene_id),fill = 'light gray')+
rasterise(geom_segment(data = df_read_nearest_up_down_org %>% filter(lib==0 & gene_id %in% gene_list) %>% filter(min_val <500) %>% mutate(gene_id = factor(gene_id, levels = gene_list)),
aes(x = start, xend = end, y = read_num, yend = read_num, color = perfect), alpha = 0.25, linewidth = 0.5), dpi = 2000, dev = "cairo")+
labs(x = "Genomic position (Kb)", y = 'Read count')+
scale_x_continuous(labels = kb_label)+
scale_y_log10(limits =c(1, NA))+
scale_color_manual(values = c("#440154FF","#21908CFF"))+
guides(fill = 'none')+
facet_wrap(~gene_id, scales = 'free', ncol = 1,strip.position = 'top') + guides(color = 'none')
plot_gene_examples
Lastly, we can count the perfect rate for the reads in the figure (within 500bp of TO side.)
df_read_nearest_up_down_org %>%
filter(lib==0 & gene_id %in% gene_list) %>%
group_by(gene_id, up_down) %>%
summarise(n=n(), perfect = sum(perfect), max_val = max(min_val))%>%
mutate(per_perfect = perfect / n)
## # A tibble: 8 × 6
## # Groups: gene_id [4]
## gene_id up_down n perfect max_val per_perfect
## <chr> <chr> <int> <int> <dbl> <dbl>
## 1 galK down 2040 2012 6 0.986
## 2 galK up 3268 3251 10 0.995
## 3 hisA down 2476 2467 3 0.996
## 4 hisA up 3046 3038 2 0.997
## 5 leuD down 31 30 2 0.968
## 6 leuD up 74 74 0 1
## 7 metA down 298 295 3 0.990
## 8 metA up 505 503 4 0.996
theme_set(theme_figure())
fig_7_bc_opool <- plot_grid(plot_bc, plot_gene_examples, ncol = 1,
align = 'hv', axis = 'lr', scale = 1,
labels = c('B','D'), rel_heights = c(1,2))
fig_7_bc_opool
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggrastr_1.0.1 kableExtra_1.3.4 cowplot_1.1.1 viridis_0.6.2
## [5] viridisLite_0.4.1 knitr_1.41 forcats_0.5.2 stringr_1.5.0
## [9] dplyr_1.1.0 purrr_0.3.5 readr_2.1.3 tidyr_1.2.1
## [13] tibble_3.1.8 ggplot2_3.4.0 tidyverse_1.3.2
##
## loaded via a namespace (and not attached):
## [1] httr_1.4.4 sass_0.4.4 bit64_4.0.5
## [4] vroom_1.6.0 jsonlite_1.8.3 modelr_0.1.10
## [7] bslib_0.4.1 assertthat_0.2.1 highr_0.9
## [10] vipor_0.4.5 googlesheets4_1.0.1 cellranger_1.1.0
## [13] yaml_2.3.6 pillar_1.8.1 backports_1.4.1
## [16] glue_1.6.2 digest_0.6.30 rvest_1.0.3
## [19] colorspace_2.0-3 htmltools_0.5.4 pkgconfig_2.0.3
## [22] broom_1.0.1 haven_2.5.1 scales_1.2.1
## [25] webshot_0.5.4 svglite_2.1.0 tzdb_0.3.0
## [28] timechange_0.1.1 googledrive_2.0.0 generics_0.1.3
## [31] farver_2.1.1 ellipsis_0.3.2 cachem_1.0.6
## [34] withr_2.5.0 cli_3.4.1 magrittr_2.0.3
## [37] crayon_1.5.2 readxl_1.4.1 evaluate_0.18
## [40] fs_1.5.2 fansi_1.0.3 xml2_1.3.3
## [43] beeswarm_0.4.0 Cairo_1.6-0 tools_4.2.0
## [46] hms_1.1.2 gargle_1.2.1 lifecycle_1.0.3
## [49] munsell_0.5.0 reprex_2.0.2 compiler_4.2.0
## [52] jquerylib_0.1.4 systemfonts_1.0.4 rlang_1.0.6
## [55] grid_4.2.0 rstudioapi_0.14 labeling_0.4.2
## [58] rmarkdown_2.18 gtable_0.3.1 DBI_1.1.3
## [61] R6_2.5.1 gridExtra_2.3 lubridate_1.9.0
## [64] fastmap_1.1.0 bit_4.0.5 utf8_1.2.2
## [67] stringi_1.7.8 ggbeeswarm_0.7.1 parallel_4.2.0
## [70] vctrs_0.5.2 dbplyr_2.2.1 tidyselect_1.2.0
## [73] xfun_0.35