library(tidyverse)
library(tidysq)
library(Biostrings)

# Code display options
knitr::opts_chunk$set(tidy.opts=list(width.cutoff=60),tidy=FALSE, echo = TRUE, message=FALSE, warning=FALSE, fig.align="center", fig.retina = 2)
# Load plotting tools
source("../../tools/plotting_tools.R")
#Modify the plot theme
theme_set(theme_notebook())

Notes

Recall that for the twist deletion libraries there were three different subpools, Library #1 = short TF gene deletions, Library #2 = long TFs, and Library #3 = small RNAs. Each subpool had different primer sites attached, so that they could be amplified and processed independently before ORBIT transformations. For these sequencing libraries, each subpool was amplifified and had illumina overhangs attached for direct sequencing of the targeting oligos (before ORBIT transformation). Therefore in this notebook we will analyze those reads to figure out if they are the expected length and match to the designed constructs. We will count how many reads match perfectly to each TO construct (see python notebooks in twist_oligo_design) and then use that data to see if it explains the read abundance in the resulting ORBIT mutant libraries (Fig. 8).

Read in data

Let’s read in the fasta file of the quality filtered reads for the TO abundance experiments.

#df_fasta_1 <- read_fasta(file_name = "fastp_processed/lib_1_TOs.fasta")
df_fasta_1 <- readDNAStringSet("../../../data/seq_data/twist_TO_abundance/fastp_processed/lib_1_TOs.fasta")
df_fasta_1
## DNAStringSet object of length 329928:
##          width seq                                          names               
##      [1]   179 CTTGCGTGATTGGATAAATGG...GATCGGAATACGACCCTCTT M02894:46:0000000...
##      [2]   179 TTCTCGTGATTGGATAAATGG...GATCGGAATACGACCCTCTT M02894:46:0000000...
##      [3]   179 CCTGCATGATTGGATAAATGG...GATCGGAATACGACCCTCTT M02894:46:0000000...
##      [4]   179 TCTGTGGGATTGGATAAATGG...GATCGGAATACGACCCTCTT M02894:46:0000000...
##      [5]   179 GTGTCTCGATTGGATAAATGG...GATCGGAATACGACCCTCTT M02894:46:0000000...
##      ...   ... ...
## [329924]   179 AGGTGCGGATTGGATAAATGG...GATCGGAATACGACCCTCTT M02894:46:0000000...
## [329925]   179 GTTTCGGGATTGGATAAATGG...GATCGGAATACGACCCTCTT M02894:46:0000000...
## [329926]   179 GACCGCCGATTGGATAAATGG...GATCGGAATACGACCCTCTT M02894:46:0000000...
## [329927]   179 GATCGCGGATTGGATAAATGG...GATCGGAATACTACCCTCTT M02894:46:0000000...
## [329928]   178 TGTTTTCGTTTGGATAAATGG...GATCGGAATACGACCCTCTT M02894:46:0000000...
df_fasta_2 <- readDNAStringSet("../../../data/seq_data/twist_TO_abundance/fastp_processed/lib_2_TOs.fasta")
df_fasta_2
## DNAStringSet object of length 202476:
##          width seq                                          names               
##      [1]   179 CTGAGGTAATCTAAGACTCCG...GATCCATCGTCCAAAAGCTT M02894:46:0000000...
##      [2]   179 CAGAAATAATCTAAGACTCCG...GATCCATCGTCCAAAAGCTG M02894:46:0000000...
##      [3]   179 TTTGAGCAATCTAAGACTCCG...GATCCATCGTCCAAAAGCTG M02894:46:0000000...
##      [4]   179 TTGACAAAATCTAAGACTCCG...GATCCATCGTCCAAAAGCTG M02894:46:0000000...
##      [5]   179 CGCTTCGAATCTAAGACTCCG...GATCCATCTTCCAAAAGCTT M02894:46:0000000...
##      ...   ... ...
## [202472]   178 AGTTCTAATCTAAGACTCCGT...GATCCATCGTCCAAAAGCTG M02894:46:0000000...
## [202473]   179 GCCGACAAATCTAAGACTCCG...GATCCATCGTCCAAAAGCTG M02894:46:0000000...
## [202474]   179 TACTGATAATCTAAGACTCCG...GATCCATCGTCCAAAAGCTG M02894:46:0000000...
## [202475]   179 AAAGAACAATCTAAGACTCCG...GATCCATCGTCCAAAAGCTG M02894:46:0000000...
## [202476]   179 GGGGTCTAATCTAAGACTCCG...GATCCATCGTCCAAAAGCTG M02894:46:0000000...
df_fasta_3 <- readDNAStringSet("../../../data/seq_data/twist_TO_abundance/fastp_processed/lib_3_TOs.fasta")
df_fasta_3
## DNAStringSet object of length 304789:
##          width seq                                          names               
##      [1]   179 TTCTCTTGGACCTAGCATCAA...GATCAATTCCGTGGGGCTTT M02894:46:0000000...
##      [2]   179 CTCTCCTGGACCTAGCATCAA...GATCAATTCCGTGGGGCTTT M02894:46:0000000...
##      [3]   179 CGGGGTAGGACCTAGCATCAA...GATCAATTCCGTGGGGCTTT M02894:46:0000000...
##      [4]   179 CCTCGCAGGACCTAGCATCAA...GATCAATTCCGTGGGGCTTT M02894:46:0000000...
##      [5]   178 TTCAGTTGGACCTAGCATCAA...GATCAATTCCGTGGGGCTTT M02894:46:0000000...
##      ...   ... ...
## [304785]   179 CACCTCTGGACCTAGCATCAA...GATCAATTCCGTGGGGCTTT M02894:46:0000000...
## [304786]   179 CGCTGCGGGACCTAGCATCAA...GATCAATTCCGTGGGGCTTT M02894:46:0000000...
## [304787]   179 GCTCACAGGACCTAGCATCAA...GATCAATTCCGTGGGGCTTT M02894:46:0000000...
## [304788]   179 CCCCTTCGGACCTAGCATCAA...GATCAATTCCGTGGGGCTTT M02894:46:0000000...
## [304789]   179 CTCTCGGGGACCTAGCATCAA...GATCAATTCCGTGGGGCTTT M02894:46:0000000...

Evaluate read length

We expect that the full amplicon length reads should be 179 bp. For each TO library (#1-3) let’s look at the length distribution and the fraction of reads that are of the perfect length:

df_lens_1 <- tibble(lens = width(df_fasta_1)) %>% count(lens)

#df_lens_1

ggplot(df_lens_1, aes(x = lens, y = n)) + geom_col()+ scale_y_log10()

(df_lens_1 %>% filter(lens == 179))$n / sum(df_lens_1$n)
## [1] 0.9097773
df_lens_2 <- tibble(lens = width(df_fasta_2)) %>% count(lens)

#df_lens_1

ggplot(df_lens_2, aes(x = lens, y = n)) +geom_col() + scale_y_log10()

(df_lens_2 %>% filter(lens == 179))$n / sum(df_lens_2$n)
## [1] 0.9122612
df_lens_3 <- tibble(lens = width(df_fasta_3)) %>% count(lens)

#df_lens_1

ggplot(df_lens_3, aes(x = lens, y = n)) +geom_col()+ scale_y_log10()

(df_lens_3 %>% filter(lens == 179))$n / sum(df_lens_3$n)
## [1] 0.9258667

So the dominant peak is at the correct length.

Count perfect TO matches

Let’s now read in our TO sequences that we designed.

df_ao_short <- read_csv("../../twist_oligo_design/twist_orbit_tf_del_AO_short.csv")

df_ao_long <- read_csv("../../twist_oligo_design/twist_orbit_tf_del_AO_long.csv")

df_sRNA <- read_csv("../../twist_oligo_design/twist_orbit_small_RNA.csv")

df_twist <- bind_rows(df_ao_short %>% mutate(lib = 1) %>%
                      mutate(left_oligo_pos = left_avd_ovlp, right_oligo_pos = right_avd_ovlp),
                    df_ao_long %>% mutate(lib = 2)%>%
                      mutate(left_oligo_pos = left_avd_ovlp, right_oligo_pos = right_avd_ovlp),
                    df_sRNA %>% mutate(lib = 3) %>% mutate(gene=`Gene Name`))

df_twist %>% select(lib, gene, oligo)
## # A tibble: 393 Ă— 3
##      lib gene  oligo                                                            
##    <dbl> <chr> <chr>                                                            
##  1     1 alpA  CTCTCTGCAACCAAAGTGAACCAATGAGAGGCAACAAGAATGAACggcttgtcgacgacggcgg…
##  2     1 argR  GCACAATAATGTTGTATCAACCACCATATCGGGTGACTTATGCGAggcttgtcgacgacggcgg…
##  3     1 ariR  AAACTTATACTTAATAATTAGAAGTTACATATCATCAGCTGTGTAggcttgtcgacgacggcgg…
##  4     1 arsR  TCGAAGAGAGACACTACCTGCAACAATCAGGAGCGCAATATGTCAggcttgtcgacgacggcgg…
##  5     1 asnC  TAAAATAGAATGAATCATCAATCCGCATAAGAAAATCCTATGGAAggcttgtcgacgacggcgg…
##  6     1 bolA  GCGCCGACAGTTGCAAATGCGTTTTTACGCGATGCTTCCTGCTCCggcttgtcgacgacggcgg…
##  7     1 caiF  ACGCCCGGTATCAACGAGATAAAATTAACGACGCATACTCTTTGAggcttgtcgacgacggcgg…
##  8     1 cedA  GCAAAACCGGGAGGTATGTAATCCTTACTCAGTCACTTCCCCTTCggcttgtcgacgacggcgg…
##  9     1 cspA  TCGCCGAAAGGCACACTTAATTATTAAAGGTAATACACTATGTCCggcttgtcgacgacggcgg…
## 10     1 cspC  CACACTTCAGATCAGTGGATTCGATCAGATAGCTGTTACGTTAACggcttgtcgacgacggcgg…
## # … with 383 more rows

Now, for each library we can match reads to our designed oligos and count how many times they occur. We will do this with the function vcountPDict() from the biostrings package. We will only consider perfect matches, but this code can be easily adapted for mismatches and indels and it did not change the results.

Lib 1

lib_1_TOs <- DNAStringSet((df_twist %>% filter(lib==1))$oligo)

pdict <- PDict(lib_1_TOs)

lib_1_counts <- vcountPDict(pdict, df_fasta_1,collapse = 1)

lib_1_counts_genes <- bind_cols(df_twist %>% filter(lib==1), 'TO_reads'=lib_1_counts)

#sum(lib_1_counts)

sum(lib_1_counts) / length(df_fasta_1) # 90% of reads perfectly match TOs. 
## [1] 0.9010724
ggplot(lib_1_counts_genes, aes(x = gene, y =TO_reads )) + geom_point() + ylim(0,NA)

ggplot(lib_1_counts_genes, aes(y=TO_reads )) + geom_boxplot() + ylim(0,NA)

Lib 2

lib_2_TOs <- DNAStringSet((df_twist %>% filter(lib==2))$oligo)

pdict <- PDict(lib_2_TOs)


lib_2_counts <- vcountPDict(pdict, df_fasta_2,collapse = 1)

lib_2_counts_genes <- bind_cols(df_twist %>% filter(lib==2), 'TO_reads'=lib_2_counts)

#sum(lib_1_counts)

sum(lib_2_counts) / length(df_fasta_2) # 90% of reads perfectly match TOs. 
## [1] 0.8918045
ggplot(lib_2_counts_genes, aes(x = gene, y =TO_reads )) + geom_point() + ylim(0,NA)

ggplot(lib_2_counts_genes, aes(y=TO_reads )) + geom_boxplot() + ylim(0,NA)

Lib 3

lib_3_TOs <- DNAStringSet((df_twist %>% filter(lib==3))$oligo)

pdict <- PDict(lib_3_TOs)


lib_3_counts <- vcountPDict(pdict, df_fasta_3,collapse = 1)

lib_3_counts_genes <- bind_cols(df_twist %>% filter(lib==3), 'TO_reads'=lib_3_counts)

#sum(lib_1_counts)

sum(lib_3_counts) / length(df_fasta_3) # 90% of reads perfectly match TOs. 
## [1] 0.9177136
ggplot(lib_3_counts_genes, aes(x = gene, y =TO_reads )) + geom_point() + ylim(0,NA)

ggplot(lib_3_counts_genes, aes(y=TO_reads )) + geom_boxplot() + ylim(0,NA)

*note for Library #3 ncRNA 2-5 are extremely similar. In fact ncRNA2 and ncRNA5 are identical. ncRNA3 and ncRNA 4 only differ from 2/5 and each other by 1 nt.

Conclusion: For each library, approximately 90% of the reads match perfectly to a designed TO construct. The percentage increases slightly to ~95% when mismatches and indels are included, but using perfect matches seems like a reasonable set to proceed with.

Combined TO abundance dataset

Finally, let’s save these perfect match counts as a csv for use in Fig. 8 and supplementary figures.

df_TO_abundances <- bind_rows(lib_1_counts_genes,lib_2_counts_genes,lib_3_counts_genes)

write_csv(df_TO_abundances, "df_TO_abundances.csv")

sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] viridis_0.6.2       viridisLite_0.4.1   Biostrings_2.66.0  
##  [4] GenomeInfoDb_1.34.4 XVector_0.38.0      IRanges_2.32.0     
##  [7] S4Vectors_0.36.0    BiocGenerics_0.44.0 tidysq_1.1.3       
## [10] forcats_0.5.2       stringr_1.5.0       dplyr_1.1.0        
## [13] purrr_0.3.5         readr_2.1.3         tidyr_1.2.1        
## [16] tibble_3.1.8        ggplot2_3.4.0       tidyverse_1.3.2    
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-7           fs_1.5.2               lubridate_1.9.0       
##  [4] bit64_4.0.5            httr_1.4.4             tools_4.2.0           
##  [7] backports_1.4.1        bslib_0.4.1            utf8_1.2.2            
## [10] R6_2.5.1               DBI_1.1.3              colorspace_2.0-3      
## [13] withr_2.5.0            tidyselect_1.2.0       gridExtra_2.3         
## [16] bit_4.0.5              compiler_4.2.0         cli_3.4.1             
## [19] rvest_1.0.3            xml2_1.3.3             labeling_0.4.2        
## [22] sass_0.4.4             scales_1.2.1           checkmate_2.1.0       
## [25] digest_0.6.30          rmarkdown_2.18         pkgconfig_2.0.3       
## [28] htmltools_0.5.4        dbplyr_2.2.1           fastmap_1.1.0         
## [31] highr_0.9              rlang_1.0.6            readxl_1.4.1          
## [34] rstudioapi_0.14        jquerylib_0.1.4        generics_0.1.3        
## [37] farver_2.1.1           jsonlite_1.8.3         vroom_1.6.0           
## [40] googlesheets4_1.0.1    RCurl_1.98-1.9         magrittr_2.0.3        
## [43] GenomeInfoDbData_1.2.9 Rcpp_1.0.9             munsell_0.5.0         
## [46] fansi_1.0.3            lifecycle_1.0.3        stringi_1.7.8         
## [49] yaml_2.3.6             zlibbioc_1.44.0        grid_4.2.0            
## [52] parallel_4.2.0         crayon_1.5.2           haven_2.5.1           
## [55] hms_1.1.2              knitr_1.41             pillar_1.8.1          
## [58] reprex_2.0.2           glue_1.6.2             evaluate_0.18         
## [61] modelr_0.1.10          vctrs_0.5.2            tzdb_0.3.0            
## [64] cellranger_1.1.0       gtable_0.3.1           assertthat_0.2.1      
## [67] cachem_1.0.6           xfun_0.35              broom_1.0.1           
## [70] googledrive_2.0.0      gargle_1.2.1           timechange_0.1.1      
## [73] ellipsis_0.3.2