
This figure contains several data types from experiments in which ORBIT was used for large genomic deletions and integrations. Data figures were made in R notebooks and exported as PDFs. Cosmetic improvements were made in Adobe Illustrator. Note that Figures 4A, 4D & 4F were made in Adobe Illustrator.

Setup packages and plotting for the notebook:

# Default options applied to every code chunk in this notebook: code is
# echoed, messages and warnings are hidden, and figures are centered.
knitr::opts_chunk$set(
  tidy.opts = list(width.cutoff = 60),
  tidy = FALSE,
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  fig.align = "center",
  fig.retina = 2
)
# Modify the plot theme

Fig. 4B - Deletion size efficiency

The galK deletions were done separately from the other loci, so let’s read that data in first.

# Read the galK deletion-size plate counts and compute efficiencies.
#   eff     : Kan_count / LB_count for each row (one row per replicate)
#   avg_eff : mean of `eff` over all rows sharing the same `del_size`
df_gal_sizes <- read_csv('../../data/low_throughput_experiments/2022_11_09_galK_sizes_3_efficiency.csv') %>%
  mutate(eff = Kan_count / LB_count) %>%
  group_by(del_size) %>%
  mutate(avg_eff = mean(eff)) %>%
  # Drop the grouping once the per-size averages exist, so later steps (for
  # example converting `del_size` to numeric) operate on a plain data frame.
  ungroup()

# Negative-control value from the galK experiment: the rows whose `del_size`
# is '-'. `avg_eff` is repeated on every replicate row, so take the first.
gal_sizes_pInt <- pull(filter(df_gal_sizes, del_size == '-'), avg_eff)[1]

# Show the table in a scrollable box.
df_gal_sizes %>% kable() %>% kable_styling() %>% scroll_box(height = '250px')
locus del_size replicate LB_count Kan_count eff avg_eff
galK 1122 1 13900000 33000 0.0023741 0.0026285
galK 1122 2 11100000 32000 0.0028829 0.0026285
galK 4258 1 10800000 1230 0.0001139 0.0001266
galK 4258 2 9900000 1380 0.0001394 0.0001266
galK 10762 1 9400000 430 0.0000457 0.0000592
galK 10762 2 9500000 690 0.0000726 0.0000592
galK 24027 1 10600000 990 0.0000934 0.0000920
galK 24027 2 11700000 1060 0.0000906 0.0000920
galK 49068 1 11400000 160 0.0000140 0.0000164
galK 49068 2 15000000 280 0.0000187 0.0000164
1 11900000 240 0.0000202 0.0000243
2 8800000 250 0.0000284 0.0000243

Now we will read in the data for the other loci (hisA, metA, and leuD).

# Read the deletion-size plate counts for the remaining loci and compute
# efficiencies.
#   eff     : Kan_count / LB_count for each row (one row per replicate)
#   avg_eff : mean of `eff` over all rows sharing the same locus and del_size
df_aa_sizes <- read_csv('../../data/low_throughput_experiments/2022_09_07_AA_del_sizes_eff.csv') %>%
  mutate(eff = Kan_count / LB_count) %>%
  group_by(locus, del_size) %>%
  mutate(avg_eff = mean(eff)) %>%
  # Drop the grouping so it does not leak into later chunks.
  ungroup()

# Show the table in a scrollable box.
df_aa_sizes %>% kable() %>% kable_styling() %>% scroll_box(height = '250px')
locus del_size replicate LB_count Kan_count eff avg_eff
hisA 100 1 370000 1120 0.0030270 0.0024635
hisA 100 2 900000 1710 0.0019000 0.0024635
hisA 700 1 800000 5100 0.0063750 0.0061420
hisA 700 2 880000 5200 0.0059091 0.0061420
hisA 7000 1 780000 100 0.0001282 0.0001195
hisA 7000 2 830000 92 0.0001108 0.0001195
metA 100 1 780000 8400 0.0107692 0.0102826
metA 100 2 980000 9600 0.0097959 0.0102826
metA 900 1 1250000 6000 0.0048000 0.0044886
metA 900 2 790000 3300 0.0041772 0.0044886
metA 13000 1 890000 1010 0.0011348 0.0010500
metA 13000 2 860000 830 0.0009651 0.0010500
leuD 100 1 880000 1370 0.0015568 0.0014955
leuD 100 2 760000 1090 0.0014342 0.0014955
leuD 600 1 1010000 720 0.0007129 0.0005181
leuD 600 2 990000 320 0.0003232 0.0005181
leuD 6000 1 760000 430 0.0005658 0.0005980
leuD 6000 2 730000 460 0.0006301 0.0005980
galK 1000 1 700000 3600 0.0051429 0.0039413
galK 1000 2 730000 2000 0.0027397 0.0039413
# Combine datasets.
# galK rows come from df_gal_sizes (its '-' control rows removed and
# `del_size` converted to numeric); all other loci come from df_aa_sizes,
# whose own galK rows are dropped.
df_gal_aa_sizes <- bind_rows(
  df_gal_sizes %>%
    filter(del_size != '-') %>%
    mutate(del_size = as.numeric(del_size)),
  filter(df_aa_sizes, locus != 'galK')
) %>%
  # Fix the locus order used for the color mapping.
  mutate(locus = factor(locus, levels = c('galK', 'hisA', 'metA', 'leuD')))

# Plot with individual datapoints, and mean points connected by lines.
# Both axes are log10-scaled; efficiency is labelled as a percentage.
plot_gal_aa_sizes <- ggplot(df_gal_aa_sizes, aes(x = del_size, y = eff, color = locus)) +
  # dashed reference line at the galK negative-control efficiency
  geom_hline(yintercept = gal_sizes_pInt, linetype = 2, color = 'light gray') +
  # individual replicates, jittered horizontally only
  geom_jitter(shape = 21, width = 0.05, height = 0, alpha = 0.4) +
  # avg_eff is repeated on every replicate row, so keep replicate 1 only to
  # draw each mean once
  geom_point(data = . %>% filter(replicate == 1), aes(y = avg_eff)) +
  geom_line(data = . %>% filter(replicate == 1), aes(y = avg_eff)) +
  scale_y_log10(labels = scales::label_percent(accuracy = 0.01)) +
  scale_x_log10(breaks = c(100, 1000, 10000, 50000), labels = c('100 bp', '1 kb', '10 kb', '50 kb')) +
  labs(y = 'Efficiency', x = 'Deletion Size')


Fig. 4C - Deletion size accuracy

Again, galK was done separately from the other loci, so let’s read in that data first:

# Read the galK accuracy counts.
#   accuracy     : 1 - selective_colonies / permissive_colonies, per row
#   avg_accuracy : mean of `accuracy` within each locus / deletion_size group
# The result stays grouped by locus and deletion_size.
df_galK_pin <- read_csv("../../data/low_throughput_experiments/2022_11_09_galK_sizes_3_accuracy.csv") %>%
  group_by(locus, deletion_size) %>%
  mutate(
    accuracy = 1 - (selective_colonies / permissive_colonies),
    avg_accuracy = mean(accuracy)
  )

## # A tibble: 12 × 9
## # Groups:   locus, deletion_size [6]
##    condition locus deletion_size pInt_…¹ repli…² permi…³ selec…⁴ accur…⁵ avg_a…⁶
##    <chr>     <chr> <chr>         <lgl>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 galK_1kb  galK  1122          FALSE         1       8       0   1      1     
##  2 galK_1kb  galK  1122          FALSE         2       8       0   1      1     
##  3 galK_4kb  galK  4258          FALSE         1       8       0   1      0.938 
##  4 galK_4kb  galK  4258          FALSE         2       8       1   0.875  0.938 
##  5 galK_10kb galK  10762         FALSE         1       8       1   0.875  0.875 
##  6 galK_10kb galK  10762         FALSE         2       8       1   0.875  0.875 
##  7 galK_24kb galK  24027         FALSE         1       8       0   1      1     
##  8 galK_24kb galK  24027         FALSE         2       8       0   1      1     
##  9 galK_49kb galK  49068         FALSE         1       8       1   0.875  0.875 
## 10 galK_49kb galK  49068         FALSE         2       8       1   0.875  0.875 
## 11 pInt_only galK  -             TRUE          1       8       8   0      0.0625
## 12 pInt_only galK  -             TRUE          2       8       7   0.125  0.0625
## # … with abbreviated variable names ¹​pInt_only, ²​replicate,
## #   ³​permissive_colonies, ⁴​selective_colonies, ⁵​accuracy, ⁶​avg_accuracy

Then we will read in the data for hisA, metA and leuD:

# Read the accuracy counts for hisA, metA and leuD.
#   accuracy     : 1 - selective_colonies / permissive_colonies, per row
#   avg_accuracy : mean of `accuracy` within each locus / deletion_size group
# `locus` is then turned into a factor with a fixed level order. The result
# stays grouped by locus and deletion_size.
df_aa_pin <- read_csv("../../data/low_throughput_experiments/2022_09_07_AA_sizes_pin_plate_count.csv") %>%
  group_by(locus, deletion_size) %>%
  mutate(
    accuracy = 1 - (selective_colonies / permissive_colonies),
    avg_accuracy = mean(accuracy)
  ) %>%
  mutate(locus = factor(locus, levels = c('hisA', 'metA', 'leuD')))

## # A tibble: 18 × 9
## # Groups:   locus, deletion_size [9]
##    condition       locus delet…¹ pInt_…² repli…³ permi…⁴ selec…⁵ accur…⁶ avg_a…⁷
##    <chr>           <fct>   <dbl> <lgl>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 hisA 100 bp del hisA      100 FALSE         1       7       0   1       1    
##  2 hisA 100 bp del hisA      100 FALSE         2       7       0   1       1    
##  3 hisA 700 bp del hisA      700 FALSE         1       7       0   1       1    
##  4 hisA 700 bp del hisA      700 FALSE         2       7       0   1       1    
##  5 hisA 7000 bp d… hisA     7000 FALSE         1       7       0   1       0.929
##  6 hisA 7000 bp d… hisA     7000 FALSE         2       7       1   0.857   0.929
##  7 metA 100 bp del metA      100 FALSE         1       6       0   1       1    
##  8 metA 100 bp del metA      100 FALSE         2       6       0   1       1    
##  9 metA 900 bp del metA      900 FALSE         1       7       0   1       1    
## 10 metA 900 bp del metA      900 FALSE         2       6       0   1       1    
## 11 metA 13000 bp … metA    13000 FALSE         1       7       0   1       1    
## 12 metA 13000 bp … metA    13000 FALSE         2       7       0   1       1    
## 13 leuD 100 bp del leuD      100 FALSE         1       7       0   1       1    
## 14 leuD 100 bp del leuD      100 FALSE         2       6       0   1       1    
## 15 leuD 600 bp del leuD      600 FALSE         1       7       0   1       0.929
## 16 leuD 600 bp del leuD      600 FALSE         2       7       1   0.857   0.929
## 17 leuD 6000 bp d… leuD     6000 FALSE         1       7       0   1       0.929
## 18 leuD 6000 bp d… leuD     6000 FALSE         2       7       1   0.857   0.929
## # … with abbreviated variable names ¹​deletion_size, ²​pInt_only, ³​replicate,
## #   ⁴​permissive_colonies, ⁵​selective_colonies, ⁶​accuracy, ⁷​avg_accuracy

We will combine the data and plot:

# Combine data: galK rows (without the pInt_only control, deletion_size
# converted to numeric) followed by the hisA / metA / leuD rows.
df_galK_aa_pin <- bind_rows(
  df_galK_pin %>%
    filter(condition != 'pInt_only') %>%
    mutate(deletion_size = as.numeric(deletion_size)),
  df_aa_pin
) %>%
  ungroup() %>%
  # Binding a character `locus` to a factor `locus` yields a character column,
  # which ggplot would order alphabetically. Set the levels explicitly so the
  # locus-to-color mapping matches the one used for df_gal_aa_sizes.
  mutate(locus = factor(locus, levels = c('galK', 'hisA', 'metA', 'leuD')))

# Get background values: average accuracy of the galK pInt_only control
# (avg_accuracy is repeated per replicate, so replicate 1 is enough).
galK_bg <- (df_galK_pin %>% filter(condition == 'pInt_only' & replicate == 1))$avg_accuracy

# Plot with individual datapoints, and mean points connected by lines.
# x is log10-scaled; accuracy is shown as a percentage between 0 and 1.
plot_del_accuracy <- ggplot(df_galK_aa_pin, aes(x = deletion_size, y = accuracy, color = locus)) +
  geom_hline(yintercept = 0, linetype = 2, color = 'light gray') + # background accuracy for AA loci was 0.
  geom_hline(yintercept = galK_bg, linetype = 4, color = 'light gray') + # background accuracy for galK is shown separately here.
  # avg_accuracy is repeated on every replicate row, so keep replicate 1 only
  # to draw each mean once
  geom_line(data = . %>% filter(replicate == 1), aes(y = avg_accuracy)) +
  # individual replicates, jittered horizontally only
  geom_jitter(shape = 21, width = 0.05, height = 0, alpha = 0.4) +
  geom_point(data = . %>% filter(replicate == 1), aes(y = avg_accuracy)) +
  scale_y_continuous(labels = scales::label_percent(accuracy = 1), limits = c(0, 1)) +
  scale_x_log10(breaks = c(100, 1000, 10000, 50000), labels = c('100 bp', '1 kb', '10 kb', '50 kb')) +
  labs(y = 'Phenotypic accuracy', x = 'Deletion Size')


Fig. 4E - Integrating plasmid size efficiency (violacein operon)

For this experiment, different-sized fragments from the luxR-inducible violacein operon were cloned into an integrating plasmid. Efficiency was measured for each plasmid with the ∆galK TO. Let’s read in the data:

# Read the plate counts for the integrating plasmids of different sizes and
# compute efficiencies.
#   eff     : Kan_count / LB_count for each row (one row per replicate)
#   avg_eff : mean of `eff` over rows sharing condition, to and plasmid_size
df_vio <- read_csv("../../data/low_throughput_experiments/2022_10_25_pInt_vio_sizes.csv") %>%
  mutate(eff = Kan_count / LB_count) %>%
  group_by(condition, to, plasmid_size) %>%
  mutate(avg_eff = mean(eff)) %>%
  # Drop the grouping so it does not leak into later chunks.
  ungroup()

# Show the table in a scrollable box.
df_vio %>% kable() %>% kable_styling() %>% scroll_box(height = '250px')
condition to plasmid_size replicate LB_count Kan_count eff avg_eff
p265 + pInt_kan 265 1958 1 1140000 490 0.0004298 0.0004158
p265 + pInt_kan 265 1958 2 1120000 450 0.0004018 0.0004158
p265 + pInt_vioA 265 3628 1 1080000 590 0.0005463 0.0005694
p265 + pInt_vioA 265 3628 2 1080000 640 0.0005926 0.0005694
p265 + pInt_vioAB 265 6718 1 1120000 360 0.0003214 0.0002468
p265 + pInt_vioAB 265 6718 2 1220000 210 0.0001721 0.0002468
p265 + pInt_vioABC 265 8203 1 1210000 53 0.0000438 0.0000418
p265 + pInt_vioABC 265 8203 2 1280000 51 0.0000398 0.0000418
pInt_vioABC only control 8203 1 1040000 2 0.0000019 0.0000019
p265 + pInt_luxR_vioAE 265 10671 1 1290000 32 0.0000248 0.0000230
p265 + pInt_luxR_vioAE 265 10671 2 1130000 24 0.0000212 0.0000230
pInt_luxR_vioAE only control 10671 1 1400000 0 0.0000000 0.0000000

Now let’s plot:

# Efficiency versus integrating-plasmid size. Rows with to == 'control' are
# kept out of the base data and drawn separately as gray crosses.
plot_vio <- ggplot(filter(df_vio, to != 'control'), aes(x = plasmid_size, y = eff)) +
  # individual replicates, jittered horizontally only
  geom_jitter(shape = 21, width = 200, height = 0, color = "#440154FF", alpha = 0.4) +
  # negative control efficiencies for pInt_vioA-C and pInt_luxR_vioA-E are shown here
  geom_jitter(
    data = filter(df_vio, to == 'control'),
    shape = 4, width = 0, height = 0, color = 'light gray'
  ) +
  # avg_eff is repeated on every replicate row, so keep replicate 1 only to
  # draw each mean once
  geom_point(
    data = function(d) filter(d, replicate == 1),
    aes(y = avg_eff), color = "#440154FF"
  ) +
  geom_line(
    data = function(d) filter(d, replicate == 1),
    aes(y = avg_eff, group = to), color = "#440154FF"
  ) +
  scale_y_log10(limits = c(NA, 0.001), labels = scales::label_percent(accuracy = 0.001)) +
  scale_x_continuous(breaks = c(2000, 6000, 10000), labels = c('2 kb', '6 kb', '10 kb')) +
  labs(x = 'Integrating plasmid size', y = 'Efficiency')


Create Fig. 4


# Assemble the data panels of Fig. 4 on a 2 x 2 grid:
#   B - deletion size efficiency
#   C - deletion size accuracy
#   E - integrating plasmid size efficiency
# The color legends of panels B and C are hidden. plot_vio is passed as the
# third plot so that the 'E' label has a panel to attach to.
fig_4_sizes <- plot_grid(
  plot_gal_aa_sizes + guides(color = 'none'),
  plot_del_accuracy + guides(color = 'none'),
  plot_vio,
  nrow = 2, ncol = 2,
  align = 'hv', axis = 'lr', scale = 1,
  labels = c('B', 'C', 'E')
)

# Export the assembled figure as a PDF.
save_plot("../../figures/r_pdf_figs/main_figs/fig_4_del_int_sizes.pdf", fig_4_sizes, base_width = 5, base_height = 4)
