Notes

This figure contains data from two ORBIT experiments where mutant libraries were created with mixtures of targeting oligos. Data figures were made in R notebooks and exported as pdfs. Cosmetic improvements were made in Adobe Illustrator. Note that diagrams for Figures 7A, & 7C were made in Adobe Illustrator.

Setup packages and plotting for the notebook:

# Check packages
source("../tools/package_setup.R")
# Load packages
library(tidyverse)
library(cowplot)
library(kableExtra)
library(ggrastr)
# Code display options
knitr::opts_chunk$set(tidy.opts=list(width.cutoff=60),tidy=FALSE, echo = TRUE, message=FALSE, warning=FALSE, fig.align="center", fig.retina = 2)
# Load plotting tools
source("../tools/plotting_tools.R")
#Modify the plot theme
theme_set(theme_notebook())

Fig. 7B - Barcode counts

Read in the data for the starcode filtered barcode counts and filter for at least 4 counts. See code/seq_data_processing/galK_BC_16N/galK_BC_16N_processing_analysis.Rmd for how we got from raw sequencing data to this file.

df_starcode <- read_delim("../seq_data_processing/galK_BC_16N/galKBC16N_unique_BC_starcode_counts_d3s.txt",col_names = c('seq','n'))

df_starcode

## # A tibble: 43,419 × 2
##    seq                  n
##    <chr>            <dbl>
##  1 TTTTTTTTTTTTTTTT 26953
##  2 AAAAAAAAAAAAAAAA 12219
##  3 GGGGGGGGGAGGGTGT  6872
##  4 CCCCCTTCCCCCTCCC  2939
##  5 GGGGGGGTGATTGGGT  1032
##  6 TTGTTTGTTGGTTTTT   986
##  7 GGTGAGGGGGGGGTGG   904
##  8 GGGGGGGAGGGGGTAG   801
##  9 TGGTTGTTTTGTTTGT   800
## 10 TTTGGTTCGTTTTTTT   797
## # … with 43,409 more rows

Now, let’s create the plot for all barcodes that have at least 4 counts.

df_starcode %>% filter(n>=4)

## # A tibble: 29,973 × 2
##    seq                  n
##    <chr>            <dbl>
##  1 TTTTTTTTTTTTTTTT 26953
##  2 AAAAAAAAAAAAAAAA 12219
##  3 GGGGGGGGGAGGGTGT  6872
##  4 CCCCCTTCCCCCTCCC  2939
##  5 GGGGGGGTGATTGGGT  1032
##  6 TTGTTTGTTGGTTTTT   986
##  7 GGTGAGGGGGGGGTGG   904
##  8 GGGGGGGAGGGGGTAG   801
##  9 TGGTTGTTTTGTTTGT   800
## 10 TTTGGTTCGTTTTTTT   797
## # … with 29,963 more rows

plot_bc <- ggplot(df_starcode %>% filter(n>=4), aes(x = n)) + 
  geom_histogram(color = 'black', fill = 'light gray', bins = 30, linewidth = 0.25) + 
  scale_x_log10(limits = c(NA,1000)) + scale_y_log10() + 
  #theme_bw() + 
  labs(x = 'Reads', y = 'Unique Barcodes')

plot_bc

Fig. 7D - Example reads at test loci

To create the figure for the test loci, we will read in all right and left side reads and the TO coordinates.

df_TOs <- read_csv("../../data/seq_data/left_side_TO_libs/df_TOs.csv")


df_read_nearest_down <- read_csv('../../data/seq_data/left_side_TO_libs/df_read_nearest_left.csv')
df_read_nearest_up <- read_csv('../../data/seq_data/right_side_TO_libs/df_read_nearest_right.csv')
df_read_nearest_up_down <- bind_rows(df_read_nearest_up %>% mutate(up_down = 'up'),df_read_nearest_down %>%  mutate(up_down = 'down')) %>% 
  filter(min_val <= 500) %>% #remove all off target reads more than 500 bp away from TO (considered off target)
  mutate(perfect = ifelse(min_val == 0, T, F))


df_TOs

## # A tibble: 420 × 7
##      lib gene  left_oligo_pos right_oligo_pos oligo              product go_term
##    <dbl> <chr>          <dbl>           <dbl> <chr>              <chr>   <chr>  
##  1     1 alpA         2758649         2758836 CTCTCTGCAACCAAAGT… CP4-57… DNA bi…
##  2     1 argR         3384708         3385153 GCACAATAATGTTGTAT… DNA-bi… DNA bi…
##  3     1 ariR         1216374         1216615 AAACTTATACTTAATAA… putati… protei…
##  4     1 arsR         3648533         3648861 TCGAAGAGAGACACTAC… DNA-bi… transc…
##  5     1 asnC         3926565         3926998 TAAAATAGAATGAATCA… DNA-bi… protei…
##  6     1 bolA          454477          454769 GCGCCGACAGTTGCAAA… DNA-bi… protei…
##  7     1 caiF           34305           34675 ACGCCCGGTATCAACGA… DNA-bi… DNA-bi…
##  8     1 cedA         1813441         1813658 GCAAAACCGGGAGGTAT… cell d… DNA bi…
##  9     1 cspA         3720054         3720241 TCGCCGAAAGGCACACT… cold s… nuclei…
## 10     1 cspC         1907246         1907430 CACACTTCAGATCAGTG… stress… nuclei…
## # … with 410 more rows

df_read_nearest_up_down

## # A tibble: 522,522 × 14
##    seqnames strand cigar qwidth   start    end width njunc   lib read_…¹ gene_id
##    <chr>    <chr>  <chr>  <dbl>   <dbl>  <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>  
##  1 U00096.3 -      79M2S     81 2978946 2.98e6    79     0     0 2979024 lysR   
##  2 U00096.3 -      6S47M     53 2095079 2.10e6    47     0     0 2095125 hisA   
##  3 U00096.3 +      81M       81 4277913 4.28e6    81     0     0 4277913 soxR   
##  4 U00096.3 -      76M       76 3535810 3.54e6    76     0     0 3535885 ompR   
##  5 U00096.3 +      2S78M     80  317133 3.17e5    78     0     0  317133 ykgA   
##  6 U00096.3 -      51M       51 3059708 3.06e6    51     0     0 3059758 argP   
##  7 U00096.3 -      2S30M     32 2095096 2.10e6    30     0     0 2095125 hisA   
##  8 U00096.3 -      81M       81 3535805 3.54e6    81     0     0 3535885 ompR   
##  9 U00096.3 +      75M       75 4099474 4.10e6    75     0     0 4099474 rhaR   
## 10 U00096.3 +      34M2S     36  417811 4.18e5    34     0     0  417811 phoB   
## # … with 522,512 more rows, 3 more variables: min_val <dbl>, up_down <chr>,
## #   perfect <lgl>, and abbreviated variable name ¹read_start_pos

To plot gene diagrams we will load in the coordinates for all E. coli genes and add the gene coordaintes for each TO.

df_genes <- read_delim("../../../ecoli_ORBIT_paper_final/data/seq_data/All_instances_of_Genes_in_Escherichia_coli_K-12_substr._MG1655.txt") %>% 
  mutate(left_pos = `Left-End-Position`, right_pos = `Right-End-Position`) %>% 
  select(-`Left-End-Position`, -`Right-End-Position`) %>% 
  mutate(fwd = ifelse(Direction =='+', T, F))

df_genes_lib <- left_join(df_TOs %>% select(lib, 'gene_id'='gene'), df_genes, by = c('gene_id'='Genes'))

df_genes_lib

## # A tibble: 420 × 7
##      lib gene_id Direction Product                         left_…¹ right…² fwd  
##    <dbl> <chr>   <chr>     <chr>                             <dbl>   <dbl> <lgl>
##  1     1 alpA    +         CP4-57 prophage; DNA-binding t… 2758644 2758856 TRUE 
##  2     1 argR    +         DNA-binding transcriptional du… 3384703 3385173 TRUE 
##  3     1 ariR    +         putative two-component system … 1216369 1216635 TRUE 
##  4     1 arsR    +         DNA-binding transcriptional re… 3648528 3648881 TRUE 
##  5     1 asnC    -         DNA-binding transcriptional du… 3926545 3927003 FALSE
##  6     1 bolA    +         DNA-binding transcriptional du…  454472  454789 TRUE 
##  7     1 caiF    +         DNA-binding transcriptional ac…   34300   34695 TRUE 
##  8     1 cedA    -         cell division modulator         1813421 1813663 FALSE
##  9     1 cspA    +         cold shock protein CspA         3720049 3720261 TRUE 
## 10     1 cspC    -         stress protein, member of the … 1907226 1907435 FALSE
## # … with 410 more rows, and abbreviated variable names ¹left_pos, ²right_pos

Then we will manually define the 6 points on the gene arrow (for plotting purposes only).

df_poly <- df_genes_lib %>% 
 #filter(right_pos<20000) %>% 
 pivot_longer(c(left_pos, right_pos), values_to = 'x_pos', names_to = 'left_right') %>% 
 mutate( y_pos_bottom = 10,y_pos_top = 100, y_pos_center=31.623) %>% 
 pivot_longer(c(y_pos_bottom,y_pos_top, y_pos_center), values_to = 'y_pos', names_to = 'top_bottom') %>% 
 mutate(x_pos = ifelse(left_right == 'left_pos' & Direction == '-' & top_bottom !='y_pos_center', x_pos + 50, x_pos)) %>% 
 mutate(x_pos = ifelse(left_right == 'right_pos' & Direction == '+' & top_bottom !='y_pos_center', x_pos - 50, x_pos)) %>% 
 mutate(point_num = case_when(left_right == 'left_pos' & top_bottom == 'y_pos_bottom' ~ 1,
left_right == 'left_pos' & top_bottom == 'y_pos_center' ~ 2,
left_right == 'left_pos' & top_bottom == 'y_pos_top' ~ 3,
left_right == 'right_pos' & top_bottom == 'y_pos_top' ~ 4,
left_right == 'right_pos' & top_bottom == 'y_pos_center' ~ 5,
left_right == 'right_pos' & top_bottom == 'y_pos_bottom' ~ 6)) %>% 
 arrange(gene_id, point_num)

df_poly

## # A tibble: 2,520 × 10
##      lib gene_id      Direc…¹ Product fwd   left_…²  x_pos top_b…³ y_pos point…⁴
##    <dbl> <chr>        <chr>   <chr>   <lgl> <chr>    <dbl> <chr>   <dbl>   <dbl>
##  1     3 3'ETS-<i>le… -       small … FALSE left_p… 1.99e6 y_pos_…  10         1
##  2     3 3'ETS-<i>le… -       small … FALSE left_p… 1.99e6 y_pos_…  31.6       2
##  3     3 3'ETS-<i>le… -       small … FALSE left_p… 1.99e6 y_pos_… 100         3
##  4     3 3'ETS-<i>le… -       small … FALSE right_… 1.99e6 y_pos_… 100         4
##  5     3 3'ETS-<i>le… -       small … FALSE right_… 1.99e6 y_pos_…  31.6       5
##  6     3 3'ETS-<i>le… -       small … FALSE right_… 1.99e6 y_pos_…  10         6
##  7     3 C0293        +       small … TRUE  left_p… 1.20e6 y_pos_…  10         1
##  8     3 C0293        +       small … TRUE  left_p… 1.20e6 y_pos_…  31.6       2
##  9     3 C0293        +       small … TRUE  left_p… 1.20e6 y_pos_… 100         3
## 10     3 C0293        +       small … TRUE  right_… 1.20e6 y_pos_… 100         4
## # … with 2,510 more rows, and abbreviated variable names ¹Direction,
## #   ²left_right, ³top_bottom, ⁴point_num

Finally we will arrange the reads so they plot nicely.

df_read_nearest_up_down_org <- df_read_nearest_up_down %>% 
  arrange(lib, gene_id, up_down, perfect, width ) %>% 
  group_by(lib, gene_id, up_down, perfect) %>% 
  mutate(read_num = row_number())

df_read_nearest_up_down_org

## # A tibble: 522,522 × 15
## # Groups:   lib, gene_id, up_down, perfect [1,322]
##    seqnames strand cigar  qwidth  start    end width njunc   lib read_…¹ gene_id
##    <chr>    <chr>  <chr>   <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>  
##  1 U00096.3 -      7S27M      34 485734 485760    27     0     0  485760 acrR   
##  2 U00096.3 -      33M34S     67 485693 485725    33     0     0  485725 acrR   
##  3 U00096.3 -      27S46…     75 485719 485764    46     0     0  485764 acrR   
##  4 U00096.3 -      1S49M      50 485712 485760    49     0     0  485760 acrR   
##  5 U00096.3 -      58M        58 485703 485760    58     0     0  485760 acrR   
##  6 U00096.3 -      5S65M      70 485696 485760    65     0     0  485760 acrR   
##  7 U00096.3 -      1S67M      68 485694 485760    67     0     0  485760 acrR   
##  8 U00096.3 -      68M        68 485693 485760    68     0     0  485760 acrR   
##  9 U00096.3 -      70M        70 485691 485760    70     0     0  485760 acrR   
## 10 U00096.3 -      23M        23 485744 485766    23     0     0  485766 acrR   
## # … with 522,512 more rows, 4 more variables: min_val <dbl>, up_down <chr>,
## #   perfect <lgl>, read_num <int>, and abbreviated variable name
## #   ¹read_start_pos

Putting it all together for our 4 test genes, we will rasterize the read layer of the plot, so it doesn’t overwhelm adobe illustrator.

 kb_label <- function(x){
 # from s to ns
 lab <- paste(x / 1000)
 }

gene_list <- c('galK', 'hisA','metA', 'leuD')


plot_gene_examples <- ggplot()+
  geom_polygon(data = df_poly %>% filter(lib==0 & gene_id %in% gene_list)%>% mutate(gene_id = factor(gene_id, levels = gene_list)), aes(x = x_pos, y = y_pos, group = gene_id),fill = 'light gray')+
    rasterise(geom_segment(data = df_read_nearest_up_down_org %>% filter(lib==0 & gene_id %in% gene_list) %>% filter(min_val <500) %>% mutate(gene_id = factor(gene_id, levels = gene_list)),
             aes(x = start, xend = end, y = read_num, yend = read_num, color = perfect), alpha = 0.25, linewidth = 0.5), dpi = 2000, dev = "cairo")+
  labs(x = "Genomic position (Kb)", y = 'Read count')+
   scale_x_continuous(labels = kb_label)+
  scale_y_log10(limits =c(1, NA))+
  scale_color_manual(values = c("#440154FF","#21908CFF"))+
  guides(fill = 'none')+
  facet_wrap(~gene_id, scales = 'free', ncol = 1,strip.position = 'top') + guides(color = 'none')

plot_gene_examples

Lastly, we can count the perfect rate for the reads in the figure (within 500bp of TO side.)

df_read_nearest_up_down_org %>% 
  filter(lib==0 & gene_id %in% gene_list) %>%  
  group_by(gene_id, up_down) %>% 
  summarise(n=n(), perfect = sum(perfect), max_val = max(min_val))%>%
  mutate(per_perfect = perfect / n)

## # A tibble: 8 × 6
## # Groups:   gene_id [4]
##   gene_id up_down     n perfect max_val per_perfect
##   <chr>   <chr>   <int>   <int>   <dbl>       <dbl>
## 1 galK    down     2040    2012       6       0.986
## 2 galK    up       3268    3251      10       0.995
## 3 hisA    down     2476    2467       3       0.996
## 4 hisA    up       3046    3038       2       0.997
## 5 leuD    down       31      30       2       0.968
## 6 leuD    up         74      74       0       1    
## 7 metA    down      298     295       3       0.990
## 8 metA    up        505     503       4       0.996

Create figure 7

theme_set(theme_figure())


fig_7_bc_opool <- plot_grid(plot_bc, plot_gene_examples, ncol = 1,
                   align = 'hv', axis = 'lr', scale = 1,
                   labels = c('B','D'), rel_heights = c(1,2))


fig_7_bc_opool

#save_plot("../figures/fig_7_bc_opool.pdf", fig_7_bc_opool, base_width = 3, base_height = 6)

sessionInfo()

## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggrastr_1.0.1     kableExtra_1.3.4  cowplot_1.1.1     viridis_0.6.2    
##  [5] viridisLite_0.4.1 knitr_1.41        forcats_0.5.2     stringr_1.5.0    
##  [9] dplyr_1.1.0       purrr_0.3.5       readr_2.1.3       tidyr_1.2.1      
## [13] tibble_3.1.8      ggplot2_3.4.0     tidyverse_1.3.2  
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.4          sass_0.4.4          bit64_4.0.5        
##  [4] vroom_1.6.0         jsonlite_1.8.3      modelr_0.1.10      
##  [7] bslib_0.4.1         assertthat_0.2.1    highr_0.9          
## [10] vipor_0.4.5         googlesheets4_1.0.1 cellranger_1.1.0   
## [13] yaml_2.3.6          pillar_1.8.1        backports_1.4.1    
## [16] glue_1.6.2          digest_0.6.30       rvest_1.0.3        
## [19] colorspace_2.0-3    htmltools_0.5.4     pkgconfig_2.0.3    
## [22] broom_1.0.1         haven_2.5.1         scales_1.2.1       
## [25] webshot_0.5.4       svglite_2.1.0       tzdb_0.3.0         
## [28] timechange_0.1.1    googledrive_2.0.0   generics_0.1.3     
## [31] farver_2.1.1        ellipsis_0.3.2      cachem_1.0.6       
## [34] withr_2.5.0         cli_3.4.1           magrittr_2.0.3     
## [37] crayon_1.5.2        readxl_1.4.1        evaluate_0.18      
## [40] fs_1.5.2            fansi_1.0.3         xml2_1.3.3         
## [43] beeswarm_0.4.0      Cairo_1.6-0         tools_4.2.0        
## [46] hms_1.1.2           gargle_1.2.1        lifecycle_1.0.3    
## [49] munsell_0.5.0       reprex_2.0.2        compiler_4.2.0     
## [52] jquerylib_0.1.4     systemfonts_1.0.4   rlang_1.0.6        
## [55] grid_4.2.0          rstudioapi_0.14     labeling_0.4.2     
## [58] rmarkdown_2.18      gtable_0.3.1        DBI_1.1.3          
## [61] R6_2.5.1            gridExtra_2.3       lubridate_1.9.0    
## [64] fastmap_1.1.0       bit_4.0.5           utf8_1.2.2         
## [67] stringi_1.7.8       ggbeeswarm_0.7.1    parallel_4.2.0     
## [70] vctrs_0.5.2         dbplyr_2.2.1        tidyselect_1.2.0   
## [73] xfun_0.35

Figure 7: galKBC16N & IDT oPool

E. coli ORBIT 2022

Scott H. Saunders

Notes

Fig. 7B - Barcode counts

Fig. 7D - Example reads at test loci

Create figure 7