library(tidyverse)
library(tidysq)
library(Biostrings)
# Code display options
::opts_chunk$set(tidy.opts=list(width.cutoff=60),tidy=FALSE, echo = TRUE, message=FALSE, warning=FALSE, fig.align="center", fig.retina = 2)
knitr# Load plotting tools
source("../../tools/plotting_tools.R")
#Modify the plot theme
theme_set(theme_notebook())
Recall that for the twist deletion libraries there were three
different subpools, Library #1 = short TF gene deletions, Library #2 =
long TFs, and Library #3 = small RNAs. Each subpool had different primer
sites attached, so that they could be amplified and processed
independently before ORBIT transformations. For these sequencing
libraries, each subpool was amplifified and had illumina overhangs
attached for direct sequencing of the targeting oligos (before ORBIT
transformation). Therefore in this notebook we will analyze those reads
to figure out if they are the expected length and match to the designed
constructs. We will count how many reads match perfectly to each TO
construct (see python notebooks in twist_oligo_design
) and
then use that data to see if it explains the read abundance in the
resulting ORBIT mutant libraries (Fig. 8).
Let’s read in the fasta file of the quality filtered reads for the TO abundance experiments.
#df_fasta_1 <- read_fasta(file_name = "fastp_processed/lib_1_TOs.fasta")
<- readDNAStringSet("../../../data/seq_data/twist_TO_abundance/fastp_processed/lib_1_TOs.fasta")
df_fasta_1 df_fasta_1
## DNAStringSet object of length 329928:
## width seq names
## [1] 179 CTTGCGTGATTGGATAAATGG...GATCGGAATACGACCCTCTT M02894:46:0000000...
## [2] 179 TTCTCGTGATTGGATAAATGG...GATCGGAATACGACCCTCTT M02894:46:0000000...
## [3] 179 CCTGCATGATTGGATAAATGG...GATCGGAATACGACCCTCTT M02894:46:0000000...
## [4] 179 TCTGTGGGATTGGATAAATGG...GATCGGAATACGACCCTCTT M02894:46:0000000...
## [5] 179 GTGTCTCGATTGGATAAATGG...GATCGGAATACGACCCTCTT M02894:46:0000000...
## ... ... ...
## [329924] 179 AGGTGCGGATTGGATAAATGG...GATCGGAATACGACCCTCTT M02894:46:0000000...
## [329925] 179 GTTTCGGGATTGGATAAATGG...GATCGGAATACGACCCTCTT M02894:46:0000000...
## [329926] 179 GACCGCCGATTGGATAAATGG...GATCGGAATACGACCCTCTT M02894:46:0000000...
## [329927] 179 GATCGCGGATTGGATAAATGG...GATCGGAATACTACCCTCTT M02894:46:0000000...
## [329928] 178 TGTTTTCGTTTGGATAAATGG...GATCGGAATACGACCCTCTT M02894:46:0000000...
<- readDNAStringSet("../../../data/seq_data/twist_TO_abundance/fastp_processed/lib_2_TOs.fasta")
df_fasta_2 df_fasta_2
## DNAStringSet object of length 202476:
## width seq names
## [1] 179 CTGAGGTAATCTAAGACTCCG...GATCCATCGTCCAAAAGCTT M02894:46:0000000...
## [2] 179 CAGAAATAATCTAAGACTCCG...GATCCATCGTCCAAAAGCTG M02894:46:0000000...
## [3] 179 TTTGAGCAATCTAAGACTCCG...GATCCATCGTCCAAAAGCTG M02894:46:0000000...
## [4] 179 TTGACAAAATCTAAGACTCCG...GATCCATCGTCCAAAAGCTG M02894:46:0000000...
## [5] 179 CGCTTCGAATCTAAGACTCCG...GATCCATCTTCCAAAAGCTT M02894:46:0000000...
## ... ... ...
## [202472] 178 AGTTCTAATCTAAGACTCCGT...GATCCATCGTCCAAAAGCTG M02894:46:0000000...
## [202473] 179 GCCGACAAATCTAAGACTCCG...GATCCATCGTCCAAAAGCTG M02894:46:0000000...
## [202474] 179 TACTGATAATCTAAGACTCCG...GATCCATCGTCCAAAAGCTG M02894:46:0000000...
## [202475] 179 AAAGAACAATCTAAGACTCCG...GATCCATCGTCCAAAAGCTG M02894:46:0000000...
## [202476] 179 GGGGTCTAATCTAAGACTCCG...GATCCATCGTCCAAAAGCTG M02894:46:0000000...
<- readDNAStringSet("../../../data/seq_data/twist_TO_abundance/fastp_processed/lib_3_TOs.fasta")
df_fasta_3 df_fasta_3
## DNAStringSet object of length 304789:
## width seq names
## [1] 179 TTCTCTTGGACCTAGCATCAA...GATCAATTCCGTGGGGCTTT M02894:46:0000000...
## [2] 179 CTCTCCTGGACCTAGCATCAA...GATCAATTCCGTGGGGCTTT M02894:46:0000000...
## [3] 179 CGGGGTAGGACCTAGCATCAA...GATCAATTCCGTGGGGCTTT M02894:46:0000000...
## [4] 179 CCTCGCAGGACCTAGCATCAA...GATCAATTCCGTGGGGCTTT M02894:46:0000000...
## [5] 178 TTCAGTTGGACCTAGCATCAA...GATCAATTCCGTGGGGCTTT M02894:46:0000000...
## ... ... ...
## [304785] 179 CACCTCTGGACCTAGCATCAA...GATCAATTCCGTGGGGCTTT M02894:46:0000000...
## [304786] 179 CGCTGCGGGACCTAGCATCAA...GATCAATTCCGTGGGGCTTT M02894:46:0000000...
## [304787] 179 GCTCACAGGACCTAGCATCAA...GATCAATTCCGTGGGGCTTT M02894:46:0000000...
## [304788] 179 CCCCTTCGGACCTAGCATCAA...GATCAATTCCGTGGGGCTTT M02894:46:0000000...
## [304789] 179 CTCTCGGGGACCTAGCATCAA...GATCAATTCCGTGGGGCTTT M02894:46:0000000...
We expect that the full amplicon length reads should be 179 bp. For each TO library (#1-3) let’s look at the length distribution and the fraction of reads that are of the perfect length:
<- tibble(lens = width(df_fasta_1)) %>% count(lens)
df_lens_1
#df_lens_1
ggplot(df_lens_1, aes(x = lens, y = n)) + geom_col()+ scale_y_log10()
%>% filter(lens == 179))$n / sum(df_lens_1$n) (df_lens_1
## [1] 0.9097773
<- tibble(lens = width(df_fasta_2)) %>% count(lens)
df_lens_2
#df_lens_1
ggplot(df_lens_2, aes(x = lens, y = n)) +geom_col() + scale_y_log10()
%>% filter(lens == 179))$n / sum(df_lens_2$n) (df_lens_2
## [1] 0.9122612
<- tibble(lens = width(df_fasta_3)) %>% count(lens)
df_lens_3
#df_lens_1
ggplot(df_lens_3, aes(x = lens, y = n)) +geom_col()+ scale_y_log10()
%>% filter(lens == 179))$n / sum(df_lens_3$n) (df_lens_3
## [1] 0.9258667
So the dominant peak is at the correct length.
Let’s now read in our TO sequences that we designed.
<- read_csv("../../twist_oligo_design/twist_orbit_tf_del_AO_short.csv")
df_ao_short
<- read_csv("../../twist_oligo_design/twist_orbit_tf_del_AO_long.csv")
df_ao_long
<- read_csv("../../twist_oligo_design/twist_orbit_small_RNA.csv")
df_sRNA
<- bind_rows(df_ao_short %>% mutate(lib = 1) %>%
df_twist mutate(left_oligo_pos = left_avd_ovlp, right_oligo_pos = right_avd_ovlp),
%>% mutate(lib = 2)%>%
df_ao_long mutate(left_oligo_pos = left_avd_ovlp, right_oligo_pos = right_avd_ovlp),
%>% mutate(lib = 3) %>% mutate(gene=`Gene Name`))
df_sRNA
%>% select(lib, gene, oligo) df_twist
## # A tibble: 393 Ă— 3
## lib gene oligo
## <dbl> <chr> <chr>
## 1 1 alpA CTCTCTGCAACCAAAGTGAACCAATGAGAGGCAACAAGAATGAACggcttgtcgacgacggcgg…
## 2 1 argR GCACAATAATGTTGTATCAACCACCATATCGGGTGACTTATGCGAggcttgtcgacgacggcgg…
## 3 1 ariR AAACTTATACTTAATAATTAGAAGTTACATATCATCAGCTGTGTAggcttgtcgacgacggcgg…
## 4 1 arsR TCGAAGAGAGACACTACCTGCAACAATCAGGAGCGCAATATGTCAggcttgtcgacgacggcgg…
## 5 1 asnC TAAAATAGAATGAATCATCAATCCGCATAAGAAAATCCTATGGAAggcttgtcgacgacggcgg…
## 6 1 bolA GCGCCGACAGTTGCAAATGCGTTTTTACGCGATGCTTCCTGCTCCggcttgtcgacgacggcgg…
## 7 1 caiF ACGCCCGGTATCAACGAGATAAAATTAACGACGCATACTCTTTGAggcttgtcgacgacggcgg…
## 8 1 cedA GCAAAACCGGGAGGTATGTAATCCTTACTCAGTCACTTCCCCTTCggcttgtcgacgacggcgg…
## 9 1 cspA TCGCCGAAAGGCACACTTAATTATTAAAGGTAATACACTATGTCCggcttgtcgacgacggcgg…
## 10 1 cspC CACACTTCAGATCAGTGGATTCGATCAGATAGCTGTTACGTTAACggcttgtcgacgacggcgg…
## # … with 383 more rows
Now, for each library we can match reads to our designed oligos and
count how many times they occur. We will do this with the function
vcountPDict()
from the biostrings package. We will only
consider perfect matches, but this code can be easily adapted for
mismatches and indels and it did not change the results.
<- DNAStringSet((df_twist %>% filter(lib==1))$oligo)
lib_1_TOs
<- PDict(lib_1_TOs)
pdict
<- vcountPDict(pdict, df_fasta_1,collapse = 1)
lib_1_counts
<- bind_cols(df_twist %>% filter(lib==1), 'TO_reads'=lib_1_counts)
lib_1_counts_genes
#sum(lib_1_counts)
sum(lib_1_counts) / length(df_fasta_1) # 90% of reads perfectly match TOs.
## [1] 0.9010724
ggplot(lib_1_counts_genes, aes(x = gene, y =TO_reads )) + geom_point() + ylim(0,NA)
ggplot(lib_1_counts_genes, aes(y=TO_reads )) + geom_boxplot() + ylim(0,NA)
<- DNAStringSet((df_twist %>% filter(lib==2))$oligo)
lib_2_TOs
<- PDict(lib_2_TOs)
pdict
<- vcountPDict(pdict, df_fasta_2,collapse = 1)
lib_2_counts
<- bind_cols(df_twist %>% filter(lib==2), 'TO_reads'=lib_2_counts)
lib_2_counts_genes
#sum(lib_1_counts)
sum(lib_2_counts) / length(df_fasta_2) # 90% of reads perfectly match TOs.
## [1] 0.8918045
ggplot(lib_2_counts_genes, aes(x = gene, y =TO_reads )) + geom_point() + ylim(0,NA)
ggplot(lib_2_counts_genes, aes(y=TO_reads )) + geom_boxplot() + ylim(0,NA)
<- DNAStringSet((df_twist %>% filter(lib==3))$oligo)
lib_3_TOs
<- PDict(lib_3_TOs)
pdict
<- vcountPDict(pdict, df_fasta_3,collapse = 1)
lib_3_counts
<- bind_cols(df_twist %>% filter(lib==3), 'TO_reads'=lib_3_counts)
lib_3_counts_genes
#sum(lib_1_counts)
sum(lib_3_counts) / length(df_fasta_3) # 90% of reads perfectly match TOs.
## [1] 0.9177136
ggplot(lib_3_counts_genes, aes(x = gene, y =TO_reads )) + geom_point() + ylim(0,NA)
ggplot(lib_3_counts_genes, aes(y=TO_reads )) + geom_boxplot() + ylim(0,NA)
*note for Library #3 ncRNA 2-5 are extremely similar. In fact ncRNA2 and ncRNA5 are identical. ncRNA3 and ncRNA 4 only differ from 2/5 and each other by 1 nt.
Conclusion: For each library, approximately 90% of the reads match perfectly to a designed TO construct. The percentage increases slightly to ~95% when mismatches and indels are included, but using perfect matches seems like a reasonable set to proceed with.
Finally, let’s save these perfect match counts as a csv for use in Fig. 8 and supplementary figures.
<- bind_rows(lib_1_counts_genes,lib_2_counts_genes,lib_3_counts_genes)
df_TO_abundances
write_csv(df_TO_abundances, "df_TO_abundances.csv")
sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] viridis_0.6.2 viridisLite_0.4.1 Biostrings_2.66.0
## [4] GenomeInfoDb_1.34.4 XVector_0.38.0 IRanges_2.32.0
## [7] S4Vectors_0.36.0 BiocGenerics_0.44.0 tidysq_1.1.3
## [10] forcats_0.5.2 stringr_1.5.0 dplyr_1.1.0
## [13] purrr_0.3.5 readr_2.1.3 tidyr_1.2.1
## [16] tibble_3.1.8 ggplot2_3.4.0 tidyverse_1.3.2
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-7 fs_1.5.2 lubridate_1.9.0
## [4] bit64_4.0.5 httr_1.4.4 tools_4.2.0
## [7] backports_1.4.1 bslib_0.4.1 utf8_1.2.2
## [10] R6_2.5.1 DBI_1.1.3 colorspace_2.0-3
## [13] withr_2.5.0 tidyselect_1.2.0 gridExtra_2.3
## [16] bit_4.0.5 compiler_4.2.0 cli_3.4.1
## [19] rvest_1.0.3 xml2_1.3.3 labeling_0.4.2
## [22] sass_0.4.4 scales_1.2.1 checkmate_2.1.0
## [25] digest_0.6.30 rmarkdown_2.18 pkgconfig_2.0.3
## [28] htmltools_0.5.4 dbplyr_2.2.1 fastmap_1.1.0
## [31] highr_0.9 rlang_1.0.6 readxl_1.4.1
## [34] rstudioapi_0.14 jquerylib_0.1.4 generics_0.1.3
## [37] farver_2.1.1 jsonlite_1.8.3 vroom_1.6.0
## [40] googlesheets4_1.0.1 RCurl_1.98-1.9 magrittr_2.0.3
## [43] GenomeInfoDbData_1.2.9 Rcpp_1.0.9 munsell_0.5.0
## [46] fansi_1.0.3 lifecycle_1.0.3 stringi_1.7.8
## [49] yaml_2.3.6 zlibbioc_1.44.0 grid_4.2.0
## [52] parallel_4.2.0 crayon_1.5.2 haven_2.5.1
## [55] hms_1.1.2 knitr_1.41 pillar_1.8.1
## [58] reprex_2.0.2 glue_1.6.2 evaluate_0.18
## [61] modelr_0.1.10 vctrs_0.5.2 tzdb_0.3.0
## [64] cellranger_1.1.0 gtable_0.3.1 assertthat_0.2.1
## [67] cachem_1.0.6 xfun_0.35 broom_1.0.1
## [70] googledrive_2.0.0 gargle_1.2.1 timechange_0.1.1
## [73] ellipsis_0.3.2