Notes
Fig. 4B - Deletion size efficiency
Fig. 4C - Deletion size accuracy
Fig. 4E - Integrating plasmid size efficiency (violacein operon)
Create Fig. 4

Notes

This figure contains several types of data from experiments where ORBIT was used for large genomic deletions and integrations. Data figures were made in R notebooks and exported as PDFs. Cosmetic improvements were made in Adobe Illustrator. Note that Figures 4A, 4D & 4F were made entirely in Adobe Illustrator.

Setup packages and plotting for the notebook:

# Run the project's package check script
source("../tools/package_setup.R")

# Attach the packages used throughout this notebook
library(tidyverse)
library(cowplot)
library(kableExtra)

# Chunk display options for the rendered notebook. Messages and warnings are
# hidden, so warnings raised while plotting will not appear in the output.
knitr::opts_chunk$set(
  tidy.opts = list(width.cutoff = 60),
  tidy = FALSE,
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  fig.align = "center",
  fig.retina = 2
)

# Load the shared plotting helpers and make the notebook theme the default
# (theme_notebook() is presumably defined in plotting_tools.R)
source("../tools/plotting_tools.R")
theme_set(theme_notebook())

Fig. 4B - Deletion size efficiency

The galK deletions were done separately from the other loci, so let’s read that data in first.

# Read the galK deletion-size colony counts, then add:
#   eff     - Kan_count / LB_count for each replicate
#   avg_eff - mean of eff within each locus / del_size group
# Grouping by locus as well as del_size mirrors how the other loci are
# summarised; the rendered table for this file lists galK only, so the means
# are unchanged.
df_gal_sizes <- read_csv('../../data/low_throughput_experiments/2022_11_09_galK_sizes_3_efficiency.csv') %>%
  mutate(eff = Kan_count / LB_count) %>%
  group_by(locus, del_size) %>%
  mutate(avg_eff = mean(eff))

# Negative control value from the galK experiment: the average efficiency of
# the rows whose del_size is the '-' placeholder. Rows where del_size is NA are
# accepted as well, so the lookup does not silently return NA if the
# placeholder was parsed as missing.
# NOTE(review): the rendered table shows an empty del_size for the control
# rows - confirm how the control is encoded in the CSV.
gal_sizes_pInt <- df_gal_sizes %>%
  filter(is.na(del_size) | del_size == '-') %>%
  pull(avg_eff) %>%
  first()

# Show the full table in a scrollable box
df_gal_sizes %>%
  kable() %>%
  kable_styling() %>%
  scroll_box(height = '250px')

locus	del_size	replicate	LB_count	Kan_count	eff	avg_eff
galK	1122	1	13900000	33000	0.0023741	0.0026285
galK	1122	2	11100000	32000	0.0028829	0.0026285
galK	4258	1	10800000	1230	0.0001139	0.0001266
galK	4258	2	9900000	1380	0.0001394	0.0001266
galK	10762	1	9400000	430	0.0000457	0.0000592
galK	10762	2	9500000	690	0.0000726	0.0000592
galK	24027	1	10600000	990	0.0000934	0.0000920
galK	24027	2	11700000	1060	0.0000906	0.0000920
galK	49068	1	11400000	160	0.0000140	0.0000164
galK	49068	2	15000000	280	0.0000187	0.0000164
galK		1	11900000	240	0.0000202	0.0000243
galK		2	8800000	250	0.0000284	0.0000243

Now we will read in the data for the other loci (hisA, metA, and leuD).

# Read the colony counts for the remaining loci, then add the efficiency of
# each replicate (Kan_count / LB_count) and the mean efficiency of each
# locus / del_size group
df_aa_sizes <- read_csv('../../data/low_throughput_experiments/2022_09_07_AA_del_sizes_eff.csv') %>%
  mutate(eff = Kan_count / LB_count) %>%
  group_by(locus, del_size) %>%
  mutate(avg_eff = mean(eff))

# Show the full table in a scrollable box
df_aa_sizes %>%
  kable() %>%
  kable_styling() %>%
  scroll_box(height = '250px')

locus	del_size	replicate	LB_count	Kan_count	eff	avg_eff
hisA	100	1	370000	1120	0.0030270	0.0024635
hisA	100	2	900000	1710	0.0019000	0.0024635
hisA	700	1	800000	5100	0.0063750	0.0061420
hisA	700	2	880000	5200	0.0059091	0.0061420
hisA	7000	1	780000	100	0.0001282	0.0001195
hisA	7000	2	830000	92	0.0001108	0.0001195
metA	100	1	780000	8400	0.0107692	0.0102826
metA	100	2	980000	9600	0.0097959	0.0102826
metA	900	1	1250000	6000	0.0048000	0.0044886
metA	900	2	790000	3300	0.0041772	0.0044886
metA	13000	1	890000	1010	0.0011348	0.0010500
metA	13000	2	860000	830	0.0009651	0.0010500
leuD	100	1	880000	1370	0.0015568	0.0014955
leuD	100	2	760000	1090	0.0014342	0.0014955
leuD	600	1	1010000	720	0.0007129	0.0005181
leuD	600	2	990000	320	0.0003232	0.0005181
leuD	6000	1	760000	430	0.0005658	0.0005980
leuD	6000	2	730000	460	0.0006301	0.0005980
galK	1000	1	700000	3600	0.0051429	0.0039413
galK	1000	2	730000	2000	0.0027397	0.0039413

# Merge the two experiments into one table:
#   - galK data without the '-' control rows, with del_size made numeric
#   - the other-loci data without its galK rows
# locus becomes a factor so the loci appear in a fixed order.
df_gal_aa_sizes <- bind_rows(
  df_gal_sizes %>%
    filter(del_size != '-') %>%
    mutate(del_size = as.numeric(del_size)),
  df_aa_sizes %>%
    filter(locus != 'galK')
) %>%
  mutate(locus = factor(locus, levels = c('galK', 'hisA', 'metA', 'leuD')))

# Efficiency vs. deletion size on log-log axes:
#   - dashed gray line: galK negative control value
#   - open jittered circles: individual replicates
#   - filled points + lines: per-group means (avg_eff is repeated on every
#     replicate row, so only the replicate 1 rows are drawn)
plot_gal_aa_sizes <- ggplot(df_gal_aa_sizes, aes(x = del_size, y = eff, color = locus)) +
  geom_hline(yintercept = gal_sizes_pInt, linetype = 2, color = 'light gray') +
  geom_jitter(shape = 21, width = 0.05, height = 0, alpha = 0.4) +
  geom_point(data = function(d) filter(d, replicate == 1), aes(y = avg_eff)) +
  geom_line(data = function(d) filter(d, replicate == 1), aes(y = avg_eff)) +
  scale_y_log10(labels = scales::label_percent(accuracy = 0.01)) +
  scale_x_log10(
    breaks = c(100, 1000, 10000, 50000),
    labels = c('100 bp', '1 kb', '10 kb', '50 kb')
  ) +
  labs(y = 'Efficiency', x = 'Deletion Size') +
  scale_color_viridis_d()

plot_gal_aa_sizes

Fig. 4C - Deletion size accuracy

Again galK was done separately from the other loci, so let’s read in that data first:

# Load the galK pin-plate colony counts
df_galK_pin <- read_csv("../../data/low_throughput_experiments/2022_11_09_galK_sizes_3_accuracy.csv")

# accuracy     - 1 - selective_colonies / permissive_colonies, per replicate
# avg_accuracy - mean accuracy within each locus / deletion_size group
df_galK_pin <- df_galK_pin %>%
  mutate(accuracy = 1 - (selective_colonies / permissive_colonies)) %>%
  group_by(locus, deletion_size) %>%
  mutate(avg_accuracy = mean(accuracy))

df_galK_pin

## # A tibble: 12 × 9
## # Groups:   locus, deletion_size [6]
##    condition locus deletion_size pInt_…¹ repli…² permi…³ selec…⁴ accur…⁵ avg_a…⁶
##    <chr>     <chr> <chr>         <lgl>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 galK_1kb  galK  1122          FALSE         1       8       0   1      1     
##  2 galK_1kb  galK  1122          FALSE         2       8       0   1      1     
##  3 galK_4kb  galK  4258          FALSE         1       8       0   1      0.938 
##  4 galK_4kb  galK  4258          FALSE         2       8       1   0.875  0.938 
##  5 galK_10kb galK  10762         FALSE         1       8       1   0.875  0.875 
##  6 galK_10kb galK  10762         FALSE         2       8       1   0.875  0.875 
##  7 galK_24kb galK  24027         FALSE         1       8       0   1      1     
##  8 galK_24kb galK  24027         FALSE         2       8       0   1      1     
##  9 galK_49kb galK  49068         FALSE         1       8       1   0.875  0.875 
## 10 galK_49kb galK  49068         FALSE         2       8       1   0.875  0.875 
## 11 pInt_only galK  -             TRUE          1       8       8   0      0.0625
## 12 pInt_only galK  -             TRUE          2       8       7   0.125  0.0625
## # … with abbreviated variable names ¹pInt_only, ²replicate,
## #   ³permissive_colonies, ⁴selective_colonies, ⁵accuracy, ⁶avg_accuracy

Then we will read in the data for hisA, metA and leuD:

# Load the pin-plate colony counts for the other loci
df_aa_pin <- read_csv("../../data/low_throughput_experiments/2022_09_07_AA_sizes_pin_plate_count.csv")

# accuracy     - 1 - selective_colonies / permissive_colonies, per replicate
# avg_accuracy - mean accuracy within each locus / deletion_size group
# locus is then turned into a factor with a fixed level order.
df_aa_pin <- df_aa_pin %>%
  mutate(accuracy = 1 - (selective_colonies / permissive_colonies)) %>%
  group_by(locus, deletion_size) %>%
  mutate(avg_accuracy = mean(accuracy)) %>%
  mutate(locus = factor(locus, levels = c('hisA', 'metA', 'leuD')))

df_aa_pin

## # A tibble: 18 × 9
## # Groups:   locus, deletion_size [9]
##    condition       locus delet…¹ pInt_…² repli…³ permi…⁴ selec…⁵ accur…⁶ avg_a…⁷
##    <chr>           <fct>   <dbl> <lgl>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 hisA 100 bp del hisA      100 FALSE         1       7       0   1       1    
##  2 hisA 100 bp del hisA      100 FALSE         2       7       0   1       1    
##  3 hisA 700 bp del hisA      700 FALSE         1       7       0   1       1    
##  4 hisA 700 bp del hisA      700 FALSE         2       7       0   1       1    
##  5 hisA 7000 bp d… hisA     7000 FALSE         1       7       0   1       0.929
##  6 hisA 7000 bp d… hisA     7000 FALSE         2       7       1   0.857   0.929
##  7 metA 100 bp del metA      100 FALSE         1       6       0   1       1    
##  8 metA 100 bp del metA      100 FALSE         2       6       0   1       1    
##  9 metA 900 bp del metA      900 FALSE         1       7       0   1       1    
## 10 metA 900 bp del metA      900 FALSE         2       6       0   1       1    
## 11 metA 13000 bp … metA    13000 FALSE         1       7       0   1       1    
## 12 metA 13000 bp … metA    13000 FALSE         2       7       0   1       1    
## 13 leuD 100 bp del leuD      100 FALSE         1       7       0   1       1    
## 14 leuD 100 bp del leuD      100 FALSE         2       6       0   1       1    
## 15 leuD 600 bp del leuD      600 FALSE         1       7       0   1       0.929
## 16 leuD 600 bp del leuD      600 FALSE         2       7       1   0.857   0.929
## 17 leuD 6000 bp d… leuD     6000 FALSE         1       7       0   1       0.929
## 18 leuD 6000 bp d… leuD     6000 FALSE         2       7       1   0.857   0.929
## # … with abbreviated variable names ¹deletion_size, ²pInt_only, ³replicate,
## #   ⁴permissive_colonies, ⁵selective_colonies, ⁶accuracy, ⁷avg_accuracy

We will combine the data and plot:

# Combine the galK data (without the pInt_only control rows, deletion_size made
# numeric) with the data for the other loci.
# Binding the character locus column of the galK table to the factor locus
# column of the other table gives a character column, which ggplot would order
# alphabetically (leuD before metA). Re-leveling it here uses the same locus
# order as the deletion size efficiency plot (Fig. 4B), so each locus gets the
# same viridis color in both panels.
df_galK_aa_pin <- bind_rows(
  df_galK_pin %>%
    filter(condition != 'pInt_only') %>%
    mutate(deletion_size = as.numeric(deletion_size)),
  df_aa_pin
) %>%
  mutate(locus = factor(locus, levels = c('galK', 'hisA', 'metA', 'leuD')))

# Background value for galK: average accuracy of the pInt_only control
# (avg_accuracy is repeated on every replicate row, so replicate 1 is enough)
galK_bg <- (df_galK_pin %>% filter(condition == 'pInt_only' & replicate == 1))$avg_accuracy

# Accuracy vs. deletion size:
#   - dashed gray line at 0: background accuracy for the AA loci
#   - dot-dash gray line: background accuracy for galK, shown separately
#   - lines + filled points: per-group means (replicate 1 rows only, since
#     avg_accuracy is identical across replicates)
#   - open jittered circles: individual replicates
plot_del_accuracy <- ggplot(df_galK_aa_pin, aes(x = deletion_size, y = accuracy, color = locus)) +
  geom_hline(yintercept = 0, linetype = 2, color = 'light gray') +
  geom_hline(yintercept = galK_bg, linetype = 4, color = 'light gray') +
  geom_line(data = function(d) filter(d, replicate == 1), aes(y = avg_accuracy)) +
  geom_jitter(shape = 21, width = 0.05, height = 0, alpha = 0.4) +
  geom_point(data = function(d) filter(d, replicate == 1), aes(y = avg_accuracy)) +
  scale_y_continuous(labels = scales::label_percent(accuracy = 1), limits = c(0, 1)) +
  scale_x_log10(
    breaks = c(100, 1000, 10000, 50000),
    labels = c('100 bp', '1 kb', '10 kb', '50 kb')
  ) +
  labs(y = 'Phenotypic accuracy', x = 'Deletion Size') +
  scale_color_viridis_d()

plot_del_accuracy

Fig. 4E - Integrating plasmid size efficiency (violacein operon)

For this experiment, different-sized fragments from the luxR-inducible violacein operon were cloned into an integrating plasmid. Efficiency was measured for each plasmid with the ∆galK TO. Let’s read in the data:

# Read the integrating-plasmid size experiment, then add the efficiency of each
# replicate (Kan_count / LB_count) and the mean efficiency within each
# condition / to / plasmid_size group
df_vio <- read_csv("../../data/low_throughput_experiments/2022_10_25_pInt_vio_sizes.csv") %>%
  mutate(eff = Kan_count / LB_count) %>%
  group_by(condition, to, plasmid_size) %>%
  mutate(avg_eff = mean(eff))

# Show the full table in a scrollable box
df_vio %>%
  kable() %>%
  kable_styling() %>%
  scroll_box(height = '250px')

condition	to	plasmid_size	replicate	LB_count	Kan_count	eff	avg_eff
p265 + pInt_kan	265	1958	1	1140000	490	0.0004298	0.0004158
p265 + pInt_kan	265	1958	2	1120000	450	0.0004018	0.0004158
p265 + pInt_vioA	265	3628	1	1080000	590	0.0005463	0.0005694
p265 + pInt_vioA	265	3628	2	1080000	640	0.0005926	0.0005694
p265 + pInt_vioAB	265	6718	1	1120000	360	0.0003214	0.0002468
p265 + pInt_vioAB	265	6718	2	1220000	210	0.0001721	0.0002468
p265 + pInt_vioABC	265	8203	1	1210000	53	0.0000438	0.0000418
p265 + pInt_vioABC	265	8203	2	1280000	51	0.0000398	0.0000418
pInt_vioABC only	control	8203	1	1040000	2	0.0000019	0.0000019
p265 + pInt_luxR_vioAE	265	10671	1	1290000	32	0.0000248	0.0000230
p265 + pInt_luxR_vioAE	265	10671	2	1130000	24	0.0000212	0.0000230
pInt_luxR_vioAE only	control	10671	1	1400000	0	0.0000000	0.0000000

Now let’s plot:

# Efficiency vs. integrating plasmid size (log y axis). The base data excludes
# the control rows:
#   - open jittered circles: individual replicates
#   - gray crosses: negative control efficiencies for pInt_vioA-C and
#     pInt_luxR_vioA-E (drawn without jitter)
#   - filled points + line: per-group means (replicate 1 rows only, since
#     avg_eff is identical across replicates)
plot_vio <- ggplot(
  df_vio %>% filter(to != 'control'),
  aes(x = plasmid_size, y = eff)
) +
  geom_jitter(shape = 21, width = 200, height = 0, color = "#440154FF", alpha = 0.4) +
  geom_jitter(
    data = df_vio %>% filter(to == 'control'),
    shape = 4, width = 0, height = 0, color = 'light gray'
  ) +
  geom_point(
    data = function(d) filter(d, replicate == 1),
    aes(y = avg_eff), color = "#440154FF"
  ) +
  geom_line(
    data = function(d) filter(d, replicate == 1),
    aes(y = avg_eff, group = to), color = "#440154FF"
  ) +
  scale_y_log10(limits = c(NA, 0.001), labels = scales::label_percent(accuracy = 0.001)) +
  scale_x_continuous(breaks = c(2000, 6000, 10000), labels = c('2 kb', '6 kb', '10 kb')) +
  labs(x = 'Integrating plasmid size', y = 'Efficiency')

plot_vio

Create Fig. 4

# Switch the default theme to the figure theme before assembling the panels
# (theme_figure() is presumably defined in plotting_tools.R)
theme_set(theme_figure())

# Lay out panels B, C and E on a 2 x 2 grid; the fourth cell is left empty
# (NULL). The color legends are removed from panels B and C.
# An earlier two-panel (B, E) version of fig_4_sizes was built here and then
# immediately overwritten; that unused assignment has been removed.
fig_4_sizes <- plot_grid(
  plot_gal_aa_sizes + guides(color = 'none'),
  plot_del_accuracy + guides(color = 'none'),
  plot_vio,
  NULL,
  nrow = 2, ncol = 2,
  align = 'hv', axis = 'lr', scale = 1,
  labels = c('B', 'C', 'E')
)

fig_4_sizes

# Export the assembled figure as a PDF
save_plot("../../figures/r_pdf_figs/main_figs/fig_4_del_int_sizes.pdf", fig_4_sizes, base_width = 5, base_height = 4)

# Print the R version, platform and package versions of this session
sessionInfo()

## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.3.4  cowplot_1.1.1     viridis_0.6.2     viridisLite_0.4.1
##  [5] knitr_1.41        forcats_0.5.2     stringr_1.5.0     dplyr_1.1.0      
##  [9] purrr_0.3.5       readr_2.1.3       tidyr_1.2.1       tibble_3.1.8     
## [13] ggplot2_3.4.0     tidyverse_1.3.2  
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.4          sass_0.4.4          bit64_4.0.5        
##  [4] vroom_1.6.0         jsonlite_1.8.3      modelr_0.1.10      
##  [7] bslib_0.4.1         assertthat_0.2.1    highr_0.9          
## [10] googlesheets4_1.0.1 cellranger_1.1.0    yaml_2.3.6         
## [13] pillar_1.8.1        backports_1.4.1     glue_1.6.2         
## [16] digest_0.6.30       rvest_1.0.3         colorspace_2.0-3   
## [19] htmltools_0.5.4     pkgconfig_2.0.3     broom_1.0.1        
## [22] haven_2.5.1         scales_1.2.1        webshot_0.5.4      
## [25] svglite_2.1.0       tzdb_0.3.0          timechange_0.1.1   
## [28] googledrive_2.0.0   generics_0.1.3      farver_2.1.1       
## [31] ellipsis_0.3.2      cachem_1.0.6        withr_2.5.0        
## [34] cli_3.4.1           magrittr_2.0.3      crayon_1.5.2       
## [37] readxl_1.4.1        evaluate_0.18       fs_1.5.2           
## [40] fansi_1.0.3         xml2_1.3.3          textshaping_0.3.6  
## [43] tools_4.2.0         hms_1.1.2           gargle_1.2.1       
## [46] lifecycle_1.0.3     munsell_0.5.0       reprex_2.0.2       
## [49] compiler_4.2.0      jquerylib_0.1.4     systemfonts_1.0.4  
## [52] rlang_1.0.6         grid_4.2.0          rstudioapi_0.14    
## [55] labeling_0.4.2      rmarkdown_2.18      gtable_0.3.1       
## [58] DBI_1.1.3           R6_2.5.1            gridExtra_2.3      
## [61] lubridate_1.9.0     fastmap_1.1.0       bit_4.0.5          
## [64] utf8_1.2.2          ragg_1.2.5          stringi_1.7.8      
## [67] parallel_4.2.0      vctrs_0.5.2         dbplyr_2.2.1       
## [70] tidyselect_1.2.0    xfun_0.35

Figure 4: Large deletions and genomic integrations

E. coli ORBIT 2023

Scott H. Saunders

Notes

Fig. 4B - Deletion size efficiency

Fig. 4C - Deletion size accuracy

Fig. 4E - Integrating plasmid size efficiency (violacein operon)

Create Fig. 4