library(tidyverse)
#library(GenomicAlignments)
library(kableExtra)
library(ggridges)
library(ggbeeswarm)
knitr::opts_chunk$set(tidy.opts=list(width.cutoff=60),tidy=FALSE, echo = TRUE, message=FALSE, warning=FALSE, fig.align="center", fig.retina = 2)

source("../tools/plotting_tools.R")
theme_set(theme_notebook())

Notes

This notebook performs the combined junction 1 and junction 2 analysis on the mapped reads for the four ORBIT libraries that were constructed. As a reminder, these different deletion libraries were as follows:

Library #0: galK, hisA, metA, leuD (gold standards) + 23 TF genes (27 targets total)

Library #1: All TF genes < 575 bp (76 targets total)

Library #2: All TF genes > 575 bp (226 targets total)

Library #3: All small RNA genes (90 targets total)

Recall that for each ORBIT mutant a deletion was made and the integrating plasmid was incorporated at that genomic position. Therefore the sequenced junctions bridge the integrating plasmid to the adjacent genome for each mutant.

The junction 1 libraries were processed and analyzed in the notebooks code/seq_data_processing/left_side_TO_libs_processing.Rmd and code/seq_data_processing/left_side_TO_libs_analysis.Rmd. The junction 2 libraries were processed and analyzed in the notebooks code/seq_data_processing/left_side_TO_libs_processing.Rmd and code/seq_data_processing/left_side_TO_libs_analysis.Rmd.

First we will import the junction 1 and 2 reads that were assigned to our target mutants (or off target):

df_read_nearest_down <- read_csv('../../data/seq_data/left_side_TO_libs/df_read_nearest_left.csv')
df_read_nearest_up <- read_csv('../../data/seq_data/right_side_TO_libs/df_read_nearest_right.csv')

df_read_nearest_down

## # A tibble: 204,833 × 12
##    seqnames strand cigar  qwidth  start    end width njunc   lib read_…¹ gene_id
##    <chr>    <chr>  <chr>   <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>  
##  1 U00096.3 +      55M6S      61 2.98e6 2.98e6    55     0     0 2979936 lysR   
##  2 U00096.3 +      30M        30 2.98e6 2.98e6    30     0     0 2979936 lysR   
##  3 U00096.3 +      48M        48 2.98e6 2.98e6    48     0     0 2979936 lysR   
##  4 U00096.3 +      52M5S      57 2.98e6 2.98e6    52     0     0 2979936 lysR   
##  5 U00096.3 +      39M        39 2.98e6 2.98e6    39     0     0 2979936 lysR   
##  6 U00096.3 +      32M1D…     74 2.98e6 2.98e6    75     0     0 2977610 galR   
##  7 U00096.3 +      57M        57 2.10e6 2.10e6    57     0     0 2095844 hisA   
##  8 U00096.3 -      75M        75 4.10e6 4.10e6    75     0     0 4098651 rhaR   
##  9 U00096.3 -      45M        45 4.17e5 4.17e5    45     0     0  417147 phoB   
## 10 U00096.3 -      61M        61 4.17e5 4.17e5    61     0     0  417147 phoB   
## # … with 204,823 more rows, 1 more variable: min_val <dbl>, and abbreviated
## #   variable name ¹read_start_pos

df_read_nearest_up

## # A tibble: 331,853 × 12
##    seqnames strand cigar qwidth   start    end width njunc   lib read_…¹ gene_id
##    <chr>    <chr>  <chr>  <dbl>   <dbl>  <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>  
##  1 U00096.3 -      79M2S     81 2978946 2.98e6    79     0     0 2979024 lysR   
##  2 U00096.3 -      6S47M     53 2095079 2.10e6    47     0     0 2095125 hisA   
##  3 U00096.3 +      81M       81 4277913 4.28e6    81     0     0 4277913 soxR   
##  4 U00096.3 -      76M       76 3535810 3.54e6    76     0     0 3535885 ompR   
##  5 U00096.3 +      2S78M     80  317133 3.17e5    78     0     0  317133 ykgA   
##  6 U00096.3 -      51M       51 3059708 3.06e6    51     0     0 3059758 argP   
##  7 U00096.3 -      2S30M     32 2095096 2.10e6    30     0     0 2095125 hisA   
##  8 U00096.3 -      81M       81 3535805 3.54e6    81     0     0 3535885 ompR   
##  9 U00096.3 +      75M       75 4099474 4.10e6    75     0     0 4099474 rhaR   
## 10 U00096.3 +      34M2S     36  417811 4.18e5    34     0     0  417811 phoB   
## # … with 331,843 more rows, 1 more variable: min_val <dbl>, and abbreviated
## #   variable name ¹read_start_pos

We will also go ahead and import our list of designed target mutations (and targeting oligos).

df_TOs <- read_csv("../../data/seq_data/right_side_TO_libs/df_TOs.csv")

df_TOs

## # A tibble: 420 × 7
##      lib gene  left_oligo_pos right_oligo_pos oligo              product go_term
##    <dbl> <chr>          <dbl>           <dbl> <chr>              <chr>   <chr>  
##  1     1 alpA         2758649         2758836 CTCTCTGCAACCAAAGT… CP4-57… DNA bi…
##  2     1 argR         3384708         3385153 GCACAATAATGTTGTAT… DNA-bi… DNA bi…
##  3     1 ariR         1216374         1216615 AAACTTATACTTAATAA… putati… protei…
##  4     1 arsR         3648533         3648861 TCGAAGAGAGACACTAC… DNA-bi… transc…
##  5     1 asnC         3926565         3926998 TAAAATAGAATGAATCA… DNA-bi… protei…
##  6     1 bolA          454477          454769 GCGCCGACAGTTGCAAA… DNA-bi… protei…
##  7     1 caiF           34305           34675 ACGCCCGGTATCAACGA… DNA-bi… DNA-bi…
##  8     1 cedA         1813441         1813658 GCAAAACCGGGAGGTAT… cell d… DNA bi…
##  9     1 cspA         3720054         3720241 TCGCCGAAAGGCACACT… cold s… nuclei…
## 10     1 cspC         1907246         1907430 CACACTTCAGATCAGTG… stress… nuclei…
## # … with 410 more rows

Read centric analysis

We previously examined the junction 1 and junction 2 libraries individual in their respective analysis notebooks. Here we will combine junction 1 and 2 data to see how many of the reads correspond to designed mutations.

First we will join the datasets and then we can plot the combined read metrics for both libraries:

df_read_nearest_up_down <- bind_rows(df_read_nearest_up %>% mutate(up_down = 'up'),df_read_nearest_down %>%  mutate(up_down = 'down')) %>% 
  mutate(off_target = ifelse(min_val >500, T, F)) %>% 
  mutate(perfect = ifelse(min_val == 0, T, F))

df_read_nearest_up_down %>% 
  mutate(status = paste0('off_',off_target, '_p_',perfect)) %>% 
  mutate(status = fct_relevel(status, 'off_TRUE_p_FALSE','off_FALSE_p_FALSE','off_FALSE_p_TRUE')) %>% 
  ggplot(aes(x = lib, fill = status)) + 
    geom_bar(color = 'black',position = 'fill') +
    scale_y_continuous(labels = scales::label_percent())+
    scale_fill_viridis_d(labels = c('Off target / Not Perfect','On Target / Not Perfect','On Target / Perfect'), option = 'A')  + 
    labs(y = 'Reads', x = 'Library', fill = 'Category')

We can also examine the read metrics side by side for the junction 1 / 2 libraries (aka down and up).

df_read_nearest_up_down %>% 
  mutate(status = paste0('off_',off_target, '_p_',perfect)) %>% 
  mutate(status = fct_relevel(status, 'off_TRUE_p_FALSE','off_FALSE_p_FALSE','off_FALSE_p_TRUE')) %>% 
  ggplot(aes(x = up_down, fill = status)) + 
    geom_bar(color = 'black',position = 'fill') +
  facet_wrap(~lib, ncol = 4)+
    scale_y_continuous(labels = scales::label_percent())+
    scale_fill_viridis_d(labels = c('Off target / Not Perfect','On Target / Not Perfect','On Target / Perfect'), option = 'A')  + 
    labs(y = 'Reads', x = 'Library', fill = 'Category')+
  theme(legend.position = 'bottom')

TO centric analysis

Instead of taking a “read-centric” view, we can instead take a “target-centric” view and ask how many of the TOs we designed actually yielded mutants in our libraries. First we will count up all the reads for each gene_id.

We will find which TOs were “found,” meaning that they have at least 1 perfect read in both the junction 1 and junction 2 libraries.

#Must have at least 1 perfect read
df_read_nearest_all_summary_down <- df_read_nearest_down %>% 
  filter(min_val == 0) %>% 
  group_by(lib, gene_id) %>% 
  summarise( n=n(), min_val = min(min_val, na.rm = T))

df_read_nearest_all_summary_up <- df_read_nearest_up %>% 
  filter(min_val == 0) %>% 
  group_by(lib, gene_id) %>% 
  summarise( n=n(), min_val = min(min_val, na.rm = T))

df_TOs_found_down <- left_join(df_TOs, df_read_nearest_all_summary_down ,by = c('gene'='gene_id', 'lib')) %>% 
  mutate(found_down = ifelse(is.na(n) & is.na(min_val), F, T)) 

df_TOs_found_up_down <- left_join(df_TOs_found_down, df_read_nearest_all_summary_up, suffix = c('_down','_up') ,by = c('gene'='gene_id', 'lib'))%>% 
  mutate(found_up = ifelse(is.na(n_up) & is.na(min_val_up), F, T)) %>% 
  mutate(found_score = found_down + found_up)

#write_csv(df_TOs_found_up_down, "df_TOs_found_up_down.csv")

df_TOs_found_up_down %>% select(-left_oligo_pos, -right_oligo_pos, -oligo,-product,-go_term)

## # A tibble: 420 × 9
##      lib gene  n_down min_val_down found_down  n_up min_val_up found_up found_…¹
##    <dbl> <chr>  <int>        <dbl> <lgl>      <int>      <dbl> <lgl>       <int>
##  1     1 alpA    1622            0 TRUE        1906          0 TRUE            2
##  2     1 argR     170            0 TRUE         283          0 TRUE            2
##  3     1 ariR     115            0 TRUE          34          0 TRUE            2
##  4     1 arsR     131            0 TRUE         422          0 TRUE            2
##  5     1 asnC     550            0 TRUE         429          0 TRUE            2
##  6     1 bolA     969            0 TRUE        1745          0 TRUE            2
##  7     1 caiF     789            0 TRUE        3308          0 TRUE            2
##  8     1 cedA     195            0 TRUE         407          0 TRUE            2
##  9     1 cspA      80            0 TRUE         187          0 TRUE            2
## 10     1 cspC     339            0 TRUE        1137          0 TRUE            2
## # … with 410 more rows, and abbreviated variable name ¹found_score

From this dataset, we can calculate how many of each designed TOs were found represented in the sequenced mutant libraries, with at least a perfect read in both junction 1 and junction 2 (found score = 2).

df_TOs_found_up_down %>% 
  group_by(lib, found_score) %>% 
  summarise(n=n()) %>%  
  pivot_wider(names_from = found_score, names_prefix = 'found_', values_from = n,values_fill = 0) %>% 
  mutate(frac_found_2 = found_2 / (found_2 + found_1 + found_0))

## # A tibble: 4 × 5
## # Groups:   lib [4]
##     lib found_2 found_0 found_1 frac_found_2
##   <dbl>   <int>   <int>   <int>        <dbl>
## 1     0      27       0       0        1    
## 2     1      69       4       1        0.932
## 3     2     205      18       3        0.907
## 4     3      83       5       5        0.892

Overall >89% of TOs were found for each library.

We can also express this in plot form:

df_TOs_found_up_down %>% 
  ggplot(aes(x = lib, fill = factor(found_score))) + geom_bar(position = 'fill', color = 'black')+
  scale_fill_viridis_d( option = 'A') + 
  scale_y_continuous(labels = scales::label_percent())

The plot in Figure 8 expresses this same data as absolute numbers of TOs in each category (not percentage):

df_TOs_found_up_down %>% 
  ggplot(aes(x = factor(lib), fill = factor(found_score))) + geom_bar(position = 'stack', color = 'black')+
  scale_fill_viridis_d( option = 'A', labels = c('Absent','Present in Up or Dn','Present in both')) + 
  scale_x_discrete(breaks =c('0','1','2','3'),labels = c('#0 oPool','#1 short TFs','#2 long TFs','#3 small RNAs'))+
  labs(fill = 'Mutations found', x = "Mutant library", y = '# Designed targeting oligos')

Classifying genes as essential

Although most designed mutations were observed, some were not. One reason why that might be is a mutation cannot be made, because the gene is essential. To test this, we can first download the essential genes lists from Ecocyc. There are three different datasets for LB, the medium used in our experiments.

df_lb_1 <- read_delim("../../data/seq_data/318-single-gene-knockouts-exhibit-no-growth-on-LB-Lennox.txt",delim = '\t') %>% 
  mutate(media = 'lennox')

df_lb_2 <- read_delim("../../data/seq_data/358-single-gene-knockouts-exhibit-no-growth-on-Luria-Bertani-broth.txt",delim = '\t') %>% 
  mutate(media = 'luria')

df_lb_3 <- read_delim("../../data/seq_data/614-single-gene-knockouts-exhibit-no-growth-on-LB-enriched.txt",delim = '\t') %>% 
  mutate(media = 'enriched')

df_lb_essential <- bind_rows(df_lb_1, df_lb_2,df_lb_3) %>% 
  mutate(gene=`All-Genes`) %>% 
  select(-`All-Genes`) %>% 
  mutate(essential = T) %>% 
  pivot_wider(names_from = media, names_prefix = "essential_",values_from = essential, values_fill = FALSE) 

df_lb_essential

## # A tibble: 759 × 4
##    gene  essential_lennox essential_luria essential_enriched
##    <chr> <lgl>            <lgl>           <lgl>             
##  1 alaS  TRUE             TRUE            TRUE              
##  2 btuB  TRUE             FALSE           FALSE             
##  3 coaA  TRUE             TRUE            TRUE              
##  4 coaE  TRUE             TRUE            TRUE              
##  5 djlB  TRUE             FALSE           FALSE             
##  6 dnaG  TRUE             TRUE            TRUE              
##  7 folP  TRUE             TRUE            FALSE             
##  8 glmM  TRUE             TRUE            FALSE             
##  9 glyS  TRUE             TRUE            TRUE              
## 10 groL  TRUE             TRUE            TRUE              
## # … with 749 more rows

Above, we combined these datasets, and now we can merge this information with our information about which TOs were found.

df_TOs_found_essential <- left_join( df_TOs_found_up_down, df_lb_essential, by = 'gene')%>% 
  mutate(essential = ifelse(is.na(essential_lennox),F,T)) %>% 
  mutate(essential_score = essential_lennox + essential_luria + essential_enriched) %>% 
  mutate(essential_score = ifelse(is.na(essential_score), 0, essential_score))

df_TOs_found_essential%>% select(-left_oligo_pos, -right_oligo_pos, -oligo,-product,-go_term, -n_down, -min_val_down, -found_down, -n_up, -min_val_up)

## # A tibble: 420 × 9
##      lib gene  found_up found_score essential_…¹ essen…² essen…³ essen…⁴ essen…⁵
##    <dbl> <chr> <lgl>          <int> <lgl>        <lgl>   <lgl>   <lgl>     <dbl>
##  1     1 alpA  TRUE               2 NA           NA      NA      FALSE         0
##  2     1 argR  TRUE               2 FALSE        FALSE   TRUE    TRUE          1
##  3     1 ariR  TRUE               2 NA           NA      NA      FALSE         0
##  4     1 arsR  TRUE               2 NA           NA      NA      FALSE         0
##  5     1 asnC  TRUE               2 NA           NA      NA      FALSE         0
##  6     1 bolA  TRUE               2 NA           NA      NA      FALSE         0
##  7     1 caiF  TRUE               2 NA           NA      NA      FALSE         0
##  8     1 cedA  TRUE               2 NA           NA      NA      FALSE         0
##  9     1 cspA  TRUE               2 NA           NA      NA      FALSE         0
## 10     1 cspC  TRUE               2 NA           NA      NA      FALSE         0
## # … with 410 more rows, and abbreviated variable names ¹essential_lennox,
## #   ²essential_luria, ³essential_enriched, ⁴essential, ⁵essential_score

We can assume that the more times a gene shows up as essential in the three datasets, the more likely it truly is to be essential. So we can use the 0-3 essential score to assess our confidence in the essentiality. Let’s look at this data graphically.

ggplot(df_TOs_found_essential %>% mutate(essential_score = factor(essential_score, levels = c('3','2','1','0'))), 
       aes(x =factor(found_score),fill = essential_score)) + 
  geom_bar(color = 'black',position = 'fill') + 
  scale_fill_viridis_d(option = 'A', labels = c('Essential (3x)','Possibly essential (2x)','Possibly essential (1x)','Not essential'))+
  scale_x_discrete(labels = c('Absent','Present in Up or Dn','Present in both'))+
  scale_y_continuous(labels = scales::label_percent())+
  labs(x = 'Mutation category', y = '# Designed targeting oligos')

So, this is expected that TOs that were absent (not found), are more likely to be essential genes. Interestingly, some of the TOs not found are not predicted to be essential. Let’s examine them manually to see if there is any pattern:

df_TOs_found_essential %>% filter(found_score == 0 & essential_score == 0) %>% select(lib, gene, found_score, essential_score,product)

## # A tibble: 13 × 5
##      lib gene  found_score essential_score product                              
##    <dbl> <chr>       <int>           <dbl> <chr>                                
##  1     2 arcA            0               0 ArcA-P<sup>asp54</sup>-phosphate // …
##  2     2 dcuR            0               0 DcuR // DNA-binding transcriptional …
##  3     2 fnr             0               0 DNA-binding transcriptional dual reg…
##  4     2 gntR            0               0 DNA-binding transcriptional represso…
##  5     2 metR            0               0 DNA-binding transcriptional dual reg…
##  6     2 sgrR            0               0 DNA-binding transcriptional dual reg…
##  7     2 uvrY            0               0 Phosphorylated DNA-binding transcrip…
##  8     2 yheO            0               0 DNA-binding transcriptional regulato…
##  9     3 ffs             0               0 signal recognition particle 4.5S RNA 
## 10     3 ohsC            0               0 small regulatory RNA OhsC            
## 11     3 orzP            0               0 small RNA OrzP                       
## 12     3 sroE            0               0 small RNA SroE                       
## 13     3 tff             0               0 putative small RNA T44

There are 13 genes / TOs that fall into this category. Here are some manual observations from looking on Ecocyc:

gene	lib	Essential or perturbing?	comments
arcA	2	no	no essential genes nearby. Regulates important metabolic genes, could cause growth defect, but not essential.
dcuR	2	no	no essential genes nearby. Regulates C4 usage, shouldn’t be essential in LB.
fnr	2	no	no essential genes nearby. regulates many genes in anaerobic metabolism. Not likely to be essential. Fnr mutant successfully made in IDT library. (n=50)
gntR	2	no	no essential genes nearby. Regulates gluconate.
metR	2	no	metR is essential on minimal media because it controls methionine biosynthesis. No LB essential genes nearby.
sgrR	2	no	Involved in sugar transport regulation and adjacent to thiamine transporter. Should not be essential.
uvrY	2	no	upstream of uvrC then pgsA (which is completely essential 3/3). Seems possible that uvrY ORBIT disrupts regulation of pgsA, but will not classify as perturbing, since it is not the adjacent gene.
yheO	2	Perturbing	yheO is part of the tusBCD operon, which makes an important modification to tRNAs. tus mutants have severe phenotypes and have essential score of 1/3. The next gene downstream is rpsL (ribosomal protein), which is totally essential (3/3). Seems possible this mutant disrupts these downstream genes.
ffs	3	Essential	signal recognition particle 4.5s RNA - definitely essential.
ohsC	3	Essential	negatively regulates peptide toxin, shoB nearby in genome.
orzP	3	no	In other E. coli strains it negatively regulates toxic peptide zorP. ZorP doesn’t seem to be present in K12.
sroA	3	no	immediately adjacent to sgrR, which also failed in ORBIT and thi genes (thiamine) - overlaps with riboswitch.
sroE	3	Perturbing	Directly in between two clearly essential genes. Not clear that this sRNA plays any important role, but seems likely that ORBIT truly disrupted essential locus.
tff	3	Perturbing	Immediately upstream of essential ribosomal subunit. Unclear if sRNA actually plays any role.

So, with this information, we can manually update the four genes that are likely perturbing or essential. We can also tweak our classification scheme to be 3 levels - not essential, porbably essential (essential score = 0) / perturbing (manual or essential score = 1-2), or essential (manual or essential score = 3).

df_TOs_found_essential_updated <- df_TOs_found_essential %>% 
  mutate(essential = ifelse(essential_score == 3, 'essential',essential)) %>% 
  mutate(essential = ifelse(essential_score %in% c(1,2), 'probable essential / perturbing',essential)) %>% 
  mutate(essential = ifelse(essential_score == 0, 'not essential',essential)) %>% 
  mutate(essential  = ifelse(gene %in% c('yheO','ffs','ohsC','sroE','tff'), 'probable essential / perturbing', essential))

#write_csv(df_TOs_found_essential_updated, "../../data/high_throughput_experiments/df_TOs_found_essential_updated.csv")


df_TOs_found_essential_updated %>% select(lib, gene, found_score, essential, essential_score,product)

## # A tibble: 420 × 6
##      lib gene  found_score essential                       essential_s…¹ product
##    <dbl> <chr>       <int> <chr>                                   <dbl> <chr>  
##  1     1 alpA            2 not essential                               0 CP4-57…
##  2     1 argR            2 probable essential / perturbing             1 DNA-bi…
##  3     1 ariR            2 not essential                               0 putati…
##  4     1 arsR            2 not essential                               0 DNA-bi…
##  5     1 asnC            2 not essential                               0 DNA-bi…
##  6     1 bolA            2 not essential                               0 DNA-bi…
##  7     1 caiF            2 not essential                               0 DNA-bi…
##  8     1 cedA            2 not essential                               0 cell d…
##  9     1 cspA            2 not essential                               0 cold s…
## 10     1 cspC            2 not essential                               0 stress…
## # … with 410 more rows, and abbreviated variable name ¹essential_score

Finally, we can express this graphically as a percentage or absolute number:

ggplot(df_TOs_found_essential_updated %>% mutate(essential = factor(essential, levels = c('essential','probable essential / perturbing','not essential'))), 
       aes(x =factor(found_score),fill = essential)) + 
  geom_bar(color = 'black',position = 'fill') + 
  scale_fill_viridis_d(option = 'A')+ #labels = c('Essential (3x)','Possibly','Not essential')
  scale_x_discrete(labels = c('Absent','Present in Up or Dn','Present in both'))+
  scale_y_continuous(labels = scales::label_percent())+
  labs(x = 'Mutation category', y = '% Designed targeting oligos') + theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(df_TOs_found_essential_updated %>% mutate(essential = factor(essential, levels = c('essential','probable essential / perturbing','not essential'))), 
       aes(x =factor(found_score),fill = essential)) + 
  geom_bar(color = 'black',position = 'stack') + 
  scale_fill_viridis_d(option = 'A')+ #labels = c('Essential (3x)','Possibly','Not essential')
  scale_x_discrete(labels = c('Absent','Present in Up or Dn','Present in both'))+
  labs(x = 'Mutation category', y = '# Designed targeting oligos') + theme(axis.text.x = element_text(angle = 45, hjust = 1))

The final plot in Fig. 8 is simplified further to only look at completely absent TOs and just show their essentiality classification:

ggplot(df_TOs_found_essential_updated %>% filter(found_score == 0) %>% mutate(essential = factor(essential, levels = c('essential','probable essential / perturbing','not essential'))), 
       aes(x = essential)) + 
  geom_bar(fill = 'light gray')  #labels = c('Essential (3x)','Possibly','Not essential')

  #scale_x_discrete(labels = c('Absent','Present in Up or Dn','Present in both'))+
  #scale_y_continuous(labels = scales::label_percent())+
  #labs(x = 'Mutation category', y = '% Designed targeting oligos') + theme(axis.text.x = element_text(angle = 45, hjust = 1))

One follow up question we could ask is, do read counts (relative mutant abundance) follow a trend with the essentiality metric:

df_TOs_found_essential_updated %>% pivot_longer(c(n_down, n_up), names_to = 'up_down',values_to = 'up_down_count') %>% 
  mutate(essential = factor(essential, levels = c('essential','probable essential / perturbing','not essential'))) %>% 
  mutate(up_down_count = ifelse(is.na(up_down_count), 0, up_down_count)) %>% 
  ggplot(aes(x = factor(essential), y = up_down_count))+ 
  geom_boxplot(position = position_nudge(x = 0.2),width = 0.1)+
  geom_jitter(shape = 21, width = 0.1, alpha = 0.5,) + 
  facet_wrap(~up_down, ncol = 1)+
  scale_y_log10(oob = scales::squish_infinite, limits = c(0.1, NA))+
  labs(x = 'Mutation category', y = 'Down reads') + theme(axis.text.x = element_text(angle = 45, hjust = 1))

#df_TOs_found_essential_updated %>% filter(essential_score == 3)

Indeed, it does seem that essential genes show very few reads, with only a single target showing a couple of reads.

Next steps

Some of these plot show up in main Figure 8 (code/main_figs/fig_9=8_to_twist_libs.Rmd).

The read count data was also used to model efficiency in several supplemental figures (code/supp_figs/supp_twist_lib.Rmd).

sessionInfo()

## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] viridis_0.6.2     viridisLite_0.4.1 ggbeeswarm_0.7.1  ggridges_0.5.4   
##  [5] kableExtra_1.3.4  forcats_0.5.2     stringr_1.5.0     dplyr_1.1.0      
##  [9] purrr_0.3.5       readr_2.1.3       tidyr_1.2.1       tibble_3.1.8     
## [13] ggplot2_3.4.0     tidyverse_1.3.2  
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.4          sass_0.4.4          bit64_4.0.5        
##  [4] vroom_1.6.0         jsonlite_1.8.3      modelr_0.1.10      
##  [7] bslib_0.4.1         assertthat_0.2.1    highr_0.9          
## [10] vipor_0.4.5         googlesheets4_1.0.1 cellranger_1.1.0   
## [13] yaml_2.3.6          pillar_1.8.1        backports_1.4.1    
## [16] glue_1.6.2          digest_0.6.30       rvest_1.0.3        
## [19] colorspace_2.0-3    htmltools_0.5.4     pkgconfig_2.0.3    
## [22] broom_1.0.1         haven_2.5.1         scales_1.2.1       
## [25] webshot_0.5.4       svglite_2.1.0       tzdb_0.3.0         
## [28] timechange_0.1.1    googledrive_2.0.0   generics_0.1.3     
## [31] farver_2.1.1        ellipsis_0.3.2      cachem_1.0.6       
## [34] withr_2.5.0         cli_3.4.1           magrittr_2.0.3     
## [37] crayon_1.5.2        readxl_1.4.1        evaluate_0.18      
## [40] fs_1.5.2            fansi_1.0.3         xml2_1.3.3         
## [43] beeswarm_0.4.0      tools_4.2.0         hms_1.1.2          
## [46] gargle_1.2.1        lifecycle_1.0.3     munsell_0.5.0      
## [49] reprex_2.0.2        compiler_4.2.0      jquerylib_0.1.4    
## [52] systemfonts_1.0.4   rlang_1.0.6         grid_4.2.0         
## [55] rstudioapi_0.14     labeling_0.4.2      rmarkdown_2.18     
## [58] gtable_0.3.1        DBI_1.1.3           R6_2.5.1           
## [61] gridExtra_2.3       lubridate_1.9.0     knitr_1.41         
## [64] fastmap_1.1.0       bit_4.0.5           utf8_1.2.2         
## [67] stringi_1.7.8       parallel_4.2.0      vctrs_0.5.2        
## [70] dbplyr_2.2.1        tidyselect_1.2.0    xfun_0.35

Seq data analysis: Upstream and downstream TO libs

E. coli ORBIT 2022

Scott H. Saunders

Notes

Read centric analysis

TO centric analysis

Classifying genes as essential

Next steps