import numpy as np
import random
import string
import pandas as pd
import holoviews as hv
from holoviews import opts,dim
import Bio.Seq as Seq
import Bio.SeqIO
#from plotnine import *
import inspect

import wgregseq
%load_ext autoreload
%autoreload 2

hv.extension('bokeh')

pd.options.display.max_colwidth = 200


ori = np.mean([3923767,3923998])
ori

3923882.5


lines = inspect.getsource(wgregseq.get_replichore)
print(lines)

def get_replichore(pos, ori = 3923882.5, ter = 1590250.5 ):
    
    """
    Determine the replichore of a bacterial chromosome for a certain position. Requires origin and terminus positions. Assumes E. coli like organization.
    
    pos : int
        Genomic coordinate of interest.
    ori : float
        Genomic coordinate of the origin of replication. 
    ter : float
        Genomic coordinate of the replication terminus.
    """
    
    pos = int(pos)
    
    if((pos<0)| (pos>4641652)):
        raise TypeError("position must be within genome.")
    
    if((pos > ori) | (pos<ter)):
       rep = 1
    elif((pos<ori) & (pos>ter)):
       rep = 2
    
    return rep


print('pos 0 = ', wgregseq.get_replichore(pos = 0))
print('pos 2M = ', wgregseq.get_replichore(pos = 2000000))

pos 0 =  1
pos 2M =  2


lines = inspect.getsource(wgregseq.get_target_oligo)
print(lines)

def get_target_oligo(left_pos, right_pos, genome, homology = 90, attB_dir = '+', attB_fwd_seq = 'ggcttgtcgacgacggcggtctccgtcgtcaggatcat',  verbose = False):
    """
    Given a set of parameters, get an ORBIT oligo that targets the lagging strand. 
    Left and right positions are absolute genomic coordinates that specify the final nucleotides to keep unmodified in the genome, 
    everything in between will be replaced by attB. In other words the left position nucleotide is the final nt before attB in the oligo.
    The right position nt is the first nt after attB in the oligo.
    
    This function determines the lagging strand by calling `get_replichore()` on the left_pos.
    Typically attB_dir should be set to the same direction as the gene of interest, such that the integrating plasmid will insert with payload facing downstream.
    attB_fwd_seq can be modified, and the total homology can be modified, but should be an even number since homology arms are symmetric. 
    
    Verbose prints helpful statements for testing functionality.
    
    Parameters
    -----------------
    left_pos : int
        Left genomic coordinate of desired attB insertion. attB is added immediately after this nt.
    right_pos : int
        Right genomic coordinate of desired attB insertion. attB is added immediately before this nt.
    genome : str
        Genome as a string.
    homology : int (even)
        Total homology length desired for oligo. Arm length = homology / 2.
    attB_dir : chr ('+' or '-')
        Desired direction of attB  based on genomic strand. Typically same direction as gene.
    attB_fwd_seq : str
        Sequence of attB to insert between homology arms.
    verbose : bool
        If true, prints details about genomic positions and replichore.
    Returns
    ---------------
    oligo : str
        Targeting oligo against lagging strand, including the attB sequence in the correct orientation.
    """
    
    left_pos = int(left_pos)
    
    right_pos = int(right_pos)
    
    # Arm length is 1/2 total homology. Arms are symmetric
    arm_len = int(homology / 2)
    
    # Arms from genome string. Note 0 indexing of string vs. 1 indexing of genomic coordinates.
    # As written, should be inclusive.
    left_arm = genome[(left_pos - arm_len):left_pos]
    
    right_arm = genome[(right_pos - 1):(right_pos - 1 + arm_len)]

    # Generate attB reverse sequence
    seq_attB = Seq(attB_fwd_seq)
    attB_rev_seq = str(seq_attB.reverse_complement())
    
    # Replichore 1
    if get_replichore(left_pos) == 1:
        
        rep = 1
        
        # Reverse complement replichore 1 sequences.
        left_arm_seq = Seq(left_arm)
        left_arm_prime = str(left_arm_seq.reverse_complement())
        
        right_arm_seq = Seq(right_arm)
        right_arm_prime = str(right_arm_seq.reverse_complement())
        
        # Determine attB direction and paste fwd/rev seq accordingly
        if attB_dir == '+':
            
            oligo = right_arm_prime + attB_rev_seq + left_arm_prime
            
        elif attB_dir == '-':
            
            oligo = right_arm_prime + attB_fwd_seq + left_arm_prime
    
    # Replichore 2
    elif get_replichore(left_pos) == 2:
        
        rep = 2
        
        # '+' arm sequence used. Determine attB direction and paste accordingly.
        if attB_dir == '+':
            
            oligo = left_arm + attB_fwd_seq + right_arm
        
        elif attB_dir == '-':
            
            oligo = left_arm + attB_rev_seq + right_arm    
            
    # Verbose print statements
    if verbose:
        
        print('left_arm_coord = ', left_pos - arm_len,' : ', left_pos)
        print('right_arm_coord = ', right_pos - 1, ' : ', right_pos -1 + arm_len)
        print('Replichore = ', rep)
    
    return oligo


for record in Bio.SeqIO.parse('sequencev3.fasta', "fasta"):
    genome = str(record.seq)
    
print("Length genome: {}".format(len(genome)))
print("First 100 bases: {}".format(genome[:100]))

Length genome: 4641652
First 100 bases: AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAAT


wgregseq.get_target_oligo(4, 5, genome, 8, '+',' ', True)

left_arm_coord =  0  :  4
right_arm_coord =  4  :  8
Replichore =  1

'GAAA AGCT'


cbrC_start = 3898022

new_cbrC = wgregseq.get_target_oligo(3898022 + 43,3898022 + 44 + 10, genome, 90, "+", "" , True)
mod_cbrC = 'TATGACTCAAAATATCAGGCCGTTACCCCAATTCAAATATCATCCGAAACAGGCGCATTTGAACAGGATAAAACCGTAGAGTGCGATTGC'

print( 'custom cbrC: ', new_cbrC)
print( 'modest cbrC: ', mod_cbrC)
print( 'Equivalent? ', new_cbrC==mod_cbrC)

left_arm_coord =  3898020  :  3898065
right_arm_coord =  3898075  :  3898120
Replichore =  2
custom cbrC:  TATGACTCAAAATATCAGGCCGTTACCCCAATTCAAATATCATCCGAAACAGGCGCATTTGAACAGGATAAAACCGTAGAGTGCGATTGC
modest cbrC:  TATGACTCAAAATATCAGGCCGTTACCCCAATTCAAATATCATCCGAAACAGGCGCATTTGAACAGGATAAAACCGTAGAGTGCGATTGC
Equivalent?  True


asnA_start = 3927155

new_asnA = wgregseq.get_target_oligo(3927155 + 43,3927155 + 44 + 10, genome, 90, "+", "" , True)
mod_asnA = 'CTGGACTTCGATCAGCCCCAGACGTTCTTCCAGTTGACGAGAAAAACGAAGCTAATTTGACGTTGTTTGGCAATGTAAGCGGTTTTCATT'

print( 'custom asnA: ', new_asnA)
print( 'modest asnA: ', mod_asnA)
print( 'Equivalent? ', new_asnA==mod_asnA)

left_arm_coord =  3927153  :  3927198
right_arm_coord =  3927208  :  3927253
Replichore =  1
custom asnA:  CTGGACTTCGATCAGCCCCAGACGTTCTTCCAGTTGACGAGAAAAACGAAGCTAATTTGACGTTGTTTGGCAATGTAAGCGGTTTTCATT
modest asnA:  CTGGACTTCGATCAGCCCCAGACGTTCTTCCAGTTGACGAGAAAAACGAAGCTAATTTGACGTTGTTTGGCAATGTAAGCGGTTTTCATT
Equivalent?  True


cysB_start = 1333855

new_cysB = wgregseq.get_target_oligo(1333855 + 43,1333855 + 44 + 10, genome, 90, "+", "", True)
mod_cysB = 'GATCCCGGGTTGTGATGTGTAAAGTCCTTCCGCTGTTGATGAGACTGATTGACCACCTCAACAATATAGCGAAGTTGTTGTAATTTCATG'

print( 'custom cysB: ', new_cysB)
print( 'modest cysB: ', mod_cysB)
print( 'Equivalent? ', new_cysB==mod_cysB)

left_arm_coord =  1333853  :  1333898
right_arm_coord =  1333908  :  1333953
Replichore =  1
custom cysB:  GATCCCGGGTTGTGATGTGTAAAGTCCTTCCGCTGTTGATGAGACTGATTGACCACCTCAACAATATAGCGAAGTTGTTGTAATTTCATG
modest cysB:  GATCCCGGGTTGTGATGTGTAAAGTCCTTCCGCTGTTGATGAGACTGATTGACCACCTCAACAATATAGCGAAGTTGTTGTAATTTCATG
Equivalent?  True


manA_start = 1688576

new = wgregseq.get_target_oligo(1688576 + 43,1688576 + 44 + 10, genome, 90, "+", "" , True)
mod = 'CATGCAAAAACTCATTAACTCAGTGCAAAACTATGCCTGGGGCAGTTGACTGAACTTTATGGTATGGAAAATCCGTCCAGCCAGCCGATG'

print( 'custom manA: ', new)
print( 'modest manA: ', mod)
print( 'Equivalent? ', new==mod)

left_arm_coord =  1688574  :  1688619
right_arm_coord =  1688629  :  1688674
Replichore =  2
custom manA:  CATGCAAAAACTCATTAACTCAGTGCAAAACTATGCCTGGGGCAGTTGACTGAACTTTATGGTATGGAAAATCCGTCCAGCCAGCCGATG
modest manA:  CATGCAAAAACTCATTAACTCAGTGCAAAACTATGCCTGGGGCAGTTGACTGAACTTTATGGTATGGAAAATCCGTCCAGCCAGCCGATG
Equivalent?  True


rstB_start = 1682882

new = wgregseq.get_target_oligo(1682882 + 43,1682882 + 44 + 10, genome, 90, "+", "", True )
mod = 'GATGAAAAAACTGTTTATCCAGTTTTACCTGTTATTGTTTGTCTGATGTCTCTGCTGGTTGGGCTGGTGTACAAATTTACCGCCGAACGC'

print( 'custom rstB: ', new)
print( 'modest rstB: ', mod)
print( 'Equivalent? ', new==mod)

left_arm_coord =  1682880  :  1682925
right_arm_coord =  1682935  :  1682980
Replichore =  2
custom rstB:  GATGAAAAAACTGTTTATCCAGTTTTACCTGTTATTGTTTGTCTGATGTCTCTGCTGGTTGGGCTGGTGTACAAATTTACCGCCGAACGC
modest rstB:  GATGAAAAAACTGTTTATCCAGTTTTACCTGTTATTGTTTGTCTGATGTCTCTGCTGGTTGGGCTGGTGTACAAATTTACCGCCGAACGC
Equivalent?  True


lacI_mod = 'TGGCATGATAGCGCCCGGAAGAGAGTCAATTCAGGGTGGTGAATGAACGTTATACGATGTCGCAGAGTATGCCGGTGTCTCTTATCAGAC'

lacI_oli = wgregseq.get_target_oligo(367510-11, 367510, genome , 90, "-", "", True)

print('lacI: ', lacI_oli)
print('mod : ', lacI_mod)
print('Equivalent? ', lacI_oli == lacI_mod)

left_arm_coord =  367454  :  367499
right_arm_coord =  367509  :  367554
Replichore =  1
lacI:  TGGCATGATAGCGCCCGGAAGAGAGTCAATTCAGGGTGGTGAATGAACGTTATACGATGTCGCAGAGTATGCCGGTGTCTCTTATCAGAC
mod :  TGGCATGATAGCGCCCGGAAGAGAGTCAATTCAGGGTGGTGAATGAACGTTATACGATGTCGCAGAGTATGCCGGTGTCTCTTATCAGAC
Equivalent?  True


ligA_mod = 'TCATGATGGCGAAGCGTCGTTCGCAGTTCTGTCAGTTGTTGTTCGTATCGCACCATCAATGCTAAAAACCCCCGACAAGCGGGGGTTCGA'
ligA_oli = wgregseq.get_target_oligo(2530176-11, 2530176, genome , 90, "-", "", True)

print('ligA: ',ligA_oli )
print('mod : ', ligA_mod)
print('Equivalent? ', ligA_oli == ligA_mod)

left_arm_coord =  2530120  :  2530165
right_arm_coord =  2530175  :  2530220
Replichore =  2
ligA:  TCATGATGGCGAAGCGTCGTTCGCAGTTCTGTCAGTTGTTGTTCGTATCGCACCATCAATGCTAAAAACCCCCGACAAGCGGGGGTTCGA
mod :  TCATGATGGCGAAGCGTCGTTCGCAGTTCTGTCAGTTGTTGTTCGTATCGCACCATCAATGCTAAAAACCCCCGACAAGCGGGGGTTCGA
Equivalent?  True


print('cbrC: ', wgregseq.get_target_oligo(cbrC_start + 2, cbrC_start + 3, genome , 90, "+", "tttt"))
print('asnA: ', wgregseq.get_target_oligo(asnA_start + 2, asnA_start + 3, genome , 90, "+", "tttt"))

cbrC:  TACTTTATCTTTGGGCTACTCAAAAGCAGACAGGATGTTTCTATGttttACTCAAAATATCAGGCCGTTACCCCAATTCAAATATCATCCCAAG
asnA:  TTTCACGAAGCTAATTTGACGTTGTTTGGCAATGTAAGCGGTTTTaaaaCATTTTTTATACTCCTGCGTCCTGTTGCTTATGATTAAGCAACAA


print('lacI: ', wgregseq.get_target_oligo(367510-3, 367510-2, genome , 90, "-", "tttt"))
print('ligA: ', wgregseq.get_target_oligo(2530176-3, 2530176-2, genome , 90, "-", "tttt"))

lacI:  GCATGATAGCGCCCGGAAGAGAGTCAATTCAGGGTGGTGAATGTGttttAAACCAGTAACGTTATACGATGTCGCAGAGTATGCCGGTGTCTCT
ligA:  GCGAAGCGTCGTTCGCAGTTCTGTCAGTTGTTGTTCGATTGATTCaaaaCATATCGCACCATCAATGCTAAAAACCCCCGACAAGCGGGGGTTC


print('lacI: ', wgregseq.get_target_oligo_2(367510-3, 367510-2, genome , 90, "+", "tttt", attB_lock = True))
print('ligA: ', wgregseq.get_target_oligo_2(2530176-3, 2530176-2, genome , 90, "+", "tttt", attB_lock = True))

lacI:  GCATGATAGCGCCCGGAAGAGAGTCAATTCAGGGTGGTGAATGTGttttAAACCAGTAACGTTATACGATGTCGCAGAGTATGCCGGTGTCTCT
ligA:  GCGAAGCGTCGTTCGCAGTTCTGTCAGTTGTTGTTCGATTGATTCttttCATATCGCACCATCAATGCTAAAAACCCCCGACAAGCGGGGGTTC


print('cbrC: ', wgregseq.get_target_oligo_2(cbrC_start + 2, cbrC_start + 3, genome , 90, "+", "tttt", attB_lock = True))
print('asnA: ', wgregseq.get_target_oligo_2(asnA_start + 2, asnA_start + 3, genome , 90, "+", "tttt", attB_lock = True))

cbrC:  TACTTTATCTTTGGGCTACTCAAAAGCAGACAGGATGTTTCTATGttttACTCAAAATATCAGGCCGTTACCCCAATTCAAATATCATCCCAAG
asnA:  TTTCACGAAGCTAATTTGACGTTGTTTGGCAATGTAAGCGGTTTTttttCATTTTTTATACTCCTGCGTCCTGTTGCTTATGATTAAGCAACAA


df = pd.read_csv("Curated_DNA_binding_transcriptional_regulators.txt", sep = '\t')

df.drop(['Product Name', 'GO terms (molecular function)'], axis = 1)


print('Unique\n',df.nunique())

print('\nNulls\n',df.isnull().sum())

Unique
 Gene Name                        300
Product Name                     300
GO terms (molecular function)    169
Left-End-Position                300
Right-End-Position               300
Direction                          2
dtype: int64

Nulls
 Gene Name                         0
Product Name                      0
GO terms (molecular function)    10
Left-End-Position                 0
Right-End-Position                0
Direction                         0
dtype: int64


df['length'] = df['Right-End-Position'] - df['Left-End-Position']

df['length'].describe()

count     300.000000
mean      804.446667
std       429.946112
min       200.000000
25%       575.000000
50%       758.000000
75%       932.000000
max      3962.000000
Name: length, dtype: float64


scatter = hv.Scatter(df, 'Left-End-Position', 'length').opts(width = 500)*hv.HLine(575).opts(color = 'red') 
dist = hv.Distribution(df, 'length' ).opts(width = 500, bandwidth = 0.1) * hv.VLine(575).opts(color = 'red')

scatter + dist


df_short = df.loc[df['length']<=575]

df_short.drop(['Product Name', 'GO terms (molecular function)'], axis = 1)


df_long = df.loc[df['length']>575]
df_long.drop(['Product Name', 'GO terms (molecular function)'], axis = 1)


df_ovlp = pd.read_csv("Ecoli_overlaps.txt", sep = '\t', skiprows = 12, skipfooter=12)
df_ovlp

<ipython-input-25-16c057dcf895>:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'.
  df_ovlp = pd.read_csv("Ecoli_overlaps.txt", sep = '\t', skiprows = 12, skipfooter=12)


df_ovlp['gene'] = 'gene'

print(df_ovlp['Overlap'].describe())

hv.Distribution(df_ovlp, 'Overlap').opts(width = 800, height = 400)

count    1504.000000
mean        2.905585
std        27.991047
min       -26.000000
25%       -11.000000
50%         0.000000
75%         4.000000
max       263.000000
Name: Overlap, dtype: float64


df['left_codon'] = df['Left-End-Position'] + 2
df['right_codon'] = df['Right-End-Position'] - 2

df.drop(['Product Name', 'GO terms (molecular function)'], axis = 1)


df.loc[df['Direction']=='+', 'left_avd_ovlp'] = df['Left-End-Position'] + 5
df.loc[df['Direction']=='-', 'left_avd_ovlp'] = df['Left-End-Position'] + 20

df.loc[df['Direction']=='+', 'right_avd_ovlp'] = df['Right-End-Position'] - 20
df.loc[df['Direction']=='-', 'right_avd_ovlp'] = df['Right-End-Position'] - 5

df['gene'] = df['Gene Name']

#df['right_avd_ovlp'] = df['Right-End-Position'] - 17

df.drop(['Product Name', 'GO terms (molecular function)'], axis = 1)


(
    ggplot(df.head()) + 
    geom_segment(aes(x = 'Left-End-Position', xend = 'Right-End-Position', y = 'Gene Name', yend = 'Gene Name')) + 
    geom_point(aes(x = 'Left-End-Position', y = 'Gene Name'), shape = '|', size = 5) + 
    geom_point(aes(x = 'Right-End-Position', y = 'Gene Name'), shape = '|', size = 5)+
    geom_point(aes(x = 'left_codon', y = 'Gene Name'), color = 'red', shape = '<', size = 3, position = position_nudge(y = 0.2))+
    geom_point(aes(x = 'right_codon', y = 'Gene Name'), color = 'red', shape = '>', size = 3, position = position_nudge(y = 0.2))+
    geom_point(aes(x = 'left_avd_ovlp', y = 'Gene Name'), color = 'blue', shape = '<', size = 3, position = position_nudge(y = -0.2))+
    geom_point(aes(x = 'right_avd_ovlp', y = 'Gene Name'), color = 'blue', shape = '>', size = 3, position = position_nudge(y = -0.2))+
    facet_wrap('~gene + Direction',nrow = 5, scales = 'free') +theme(dpi = 200, figure_size=(5,4))
)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-29-1a194fa664c3> in <module>
      1 (
----> 2     ggplot(df.head()) +
      3     geom_segment(aes(x = 'Left-End-Position', xend = 'Right-End-Position', y = 'Gene Name', yend = 'Gene Name')) +
      4     geom_point(aes(x = 'Left-End-Position', y = 'Gene Name'), shape = '|', size = 5) +
      5     geom_point(aes(x = 'Right-End-Position', y = 'Gene Name'), shape = '|', size = 5)+

NameError: name 'ggplot' is not defined


(
    ggplot(df) + 
    geom_segment(aes(x = 'Left-End-Position', xend = 'Right-End-Position', y = 'Gene Name', yend = 'Gene Name')) + 
    geom_point(aes(x = 'Left-End-Position', y = 'Gene Name'), shape = '|', size = 5) + 
    geom_point(aes(x = 'Right-End-Position', y = 'Gene Name'), shape = '|', size = 5)+
    geom_point(aes(x = 'left_codon', y = 'Gene Name'), color = 'red', shape = '<', size = 3, position = position_nudge(y = 0.2))+
    geom_point(aes(x = 'right_codon', y = 'Gene Name'), color = 'red', shape = '>', size = 3, position = position_nudge(y = 0.2))+
    geom_point(aes(x = 'left_avd_ovlp', y = 'Gene Name'), color = 'blue', shape = '<', size = 3, position = position_nudge(y = -0.2))+
    geom_point(aes(x = 'right_avd_ovlp', y = 'Gene Name'), color = 'blue', shape = '>', size = 3, position = position_nudge(y = -0.2))+
    facet_wrap('~gene + Direction',nrow = 30, scales = 'free') +
    theme(dpi = 300, figure_size=(30,30))
)

#p.make()

# Then you can alter its properties
#p.set_size_inches(15, 5, forward=True)
#p.set_dpi(100)
#p.fig

# And display the final figure


lines = inspect.getsource(wgregseq.get_target_oligo_df)
print(lines)

def get_target_oligo_df(df, left_pos_col, right_pos_col, dir_col, genome, homology = 90, attB_fwd_seq = 'ggcttgtcgacgacggcggtctccgtcgtcaggatcat'):
    
    """
    Apply get_target_oligo to a dataframe of genomic coordinates and directions. Iterates through df rows calling get_target_oligo given the parameters specified in each column.
    
    Given a set of parameters, get an ORBIT oligo that targets the lagging strand. 
    Left and right positions are absolute genomic coordinates that specify the final nucleotides to keep unmodified in the genome, 
    everything in between will be replaced by attB. In other words the left position nucleotide is the final nt before attB in the oligo.
    The right position nt is the first nt after attB in the oligo.
    
    This function determines the lagging strand by calling `get_replichore()` on the left_pos.
    Typically attB_dir should be set to the same direction as the gene of interest, such that the integrating plasmid will insert with payload facing downstream.
    attB_fwd_seq can be modified, and the total homology can be modified, but should be an even number since homology arms are symmetric. 
        
    Parameters
    -----------------
    df : pd.DataFrame
        Pandas dataframe containing the required genomic coordinates, and gene directions.
    left_pos_col : str
        Column name of left genomic coordinate of desired attB insertion. attB is added immediately after this nt. 
    right_pos_col : str
        Column name of right genomic coordinate of desired attB insertion. attB is added immediately after this nt. 
    dir_col : str
        Column name of desired direction of attB based on genomic strand. Typically same direction as gene.
    genome : str
        Genome as a string.
    homology : int (even)
        Total homology length desired for oligo. Arm length = homology / 2.   
    attB_fwd_seq : str
        Sequence of attB to insert between homology arms.
    verbose : bool
        If true, prints details about genomic positions and replichore.
    Returns
    ---------------
    df_results : pd.DataFrame
        Adds column 'oligo' to input df. 'oligo' contains a string of the targeting oligo sequence against lagging strand, including the attB sequence in the correct orientation.
    """
    
    df_tmp = pd.DataFrame()
    df_results = pd.DataFrame()
    
    for i,row in df.iterrows():
        
        left_pos = row[left_pos_col]
        right_pos = row[right_pos_col]
        attB_dir = row[dir_col]
        
        oligo = get_target_oligo(left_pos, right_pos, genome, homology, attB_dir, attB_fwd_seq)

        df_tmp = df.iloc[[i],:]
        
        df_tmp['oligo'] = oligo
        
        df_results = pd.concat([df_results,df_tmp])
    
    return df_results


df_first_last = wgregseq.get_target_oligo_df(df, 'left_codon', 'right_codon', 'Direction',genome)
df_first_last.drop(['Product Name', 'GO terms (molecular function)'], axis = 1)

/Users/tomroschinger/git/Reg-Seq2/software_module/wgregseq/orbit.py:347: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmp['oligo'] = oligo


df_avd_ovlp = wgregseq.get_target_oligo_df(df, 'left_avd_ovlp', 'right_avd_ovlp', 'Direction',genome)
df_avd_ovlp.drop(['Product Name', 'GO terms (molecular function)'], axis = 1)

/Users/tomroschinger/git/Reg-Seq2/software_module/wgregseq/orbit.py:347: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmp['oligo'] = oligo


df_2 = df.copy()
df_2['Direction'] = '+'
df_2.head()


df_first_last_2 = wgregseq.get_target_oligo_df_2(df_2, 'left_codon', 'right_codon','Direction',genome, attB_lock = True)
df_first_last_2.drop(['Product Name', 'GO terms (molecular function)'], axis = 1)

/Users/tomroschinger/git/Reg-Seq2/software_module/wgregseq/orbit.py:406: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmp['oligo'] = oligo


df_avd_ovlp_2 = wgregseq.get_target_oligo_df_2(df_2, 'left_avd_ovlp', 'right_avd_ovlp', 'Direction',genome,attB_lock = True)
df_avd_ovlp_2.drop(['Product Name', 'GO terms (molecular function)'], axis = 1)

/Users/tomroschinger/git/Reg-Seq2/software_module/wgregseq/orbit.py:406: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmp['oligo'] = oligo


df_first_last_short_2 = df_first_last_2.loc[df_first_last_2['length']<575].reset_index()
df_first_last_short_2.drop(['Product Name', 'GO terms (molecular function)'], axis = 1)


df_first_last_long_2 = df_first_last_2.loc[df_first_last_2['length']>=575].reset_index()
df_first_last_long_2.drop(['Product Name', 'GO terms (molecular function)'], axis = 1)


df_avd_ovlp_short_2 = df_avd_ovlp_2.loc[df_avd_ovlp_2['length']<575].reset_index()
df_avd_ovlp_short_2.drop(['Product Name', 'GO terms (molecular function)'], axis = 1)


df_avd_ovlp_long_2 = df_avd_ovlp_2.loc[df_avd_ovlp_2['length']>=575].reset_index()
df_avd_ovlp_long_2.drop(['Product Name', 'GO terms (molecular function)'], axis = 1)


df_first_last_short_2.to_csv("twist_orbit_tf_del_FL_short.csv")
df_first_last_long_2.to_csv("twist_orbit_tf_del_FL_long.csv")

df_avd_ovlp_short_2.to_csv("twist_orbit_tf_del_AO_short.csv")
df_avd_ovlp_long_2.to_csv("twist_orbit_tf_del_AO_long.csv")


%load_ext watermark
%watermark -v -p wgregseq,numpy,pandas

CPython 3.8.5
IPython 7.19.0

wgregseq 0.0.1
numpy 1.18.1
pandas 1.2.0

Replichore	gene dir	5' abs-rel pos	3' abs-rel pos	attB
1	+	right-down	left-up	rev
1	-	right-up	left-down	fwd
2	+	left-up	right-down	fwd
2	-	left-down	right-up	rev

	Gene Name	Left-End-Position	Right-End-Position	Direction
0	aaeR	3389520	3390449	+
1	abgR	1404741	1405649	+
2	acrR	485761	486408	+
3	ada	2309341	2310405	-
4	adiY	4337168	4337929	-
...	...	...	...	...
295	yqhC	3154262	3155218	-
296	ytfH	4434113	4434493	+
297	zntR	3438705	3439130	-
298	zraR	4203320	4204645	+
299	zur	4259488	4260003	-

	Gene Name	Left-End-Position	Right-End-Position	Direction	length
10	alpA	2758644	2758856	+	212
16	argR	3384703	3385173	+	470
17	ariR	1216369	1216635	+	266
18	arsR	3648528	3648881	+	353
20	asnC	3926545	3927003	-	458
...	...	...	...	...	...
283	yjdC	4362733	4363308	-	575
289	ylbG	529645	530016	-	371
296	ytfH	4434113	4434493	+	380
297	zntR	3438705	3439130	-	425
299	zur	4259488	4260003	-	515

	Gene Name	Left-End-Position	Right-End-Position	Direction	length
0	aaeR	3389520	3390449	+	929
1	abgR	1404741	1405649	+	908
2	acrR	485761	486408	+	647
3	ada	2309341	2310405	-	1064
4	adiY	4337168	4337929	-	761
...	...	...	...	...	...
292	ypdC	2501130	2501987	+	857
293	yphH	2682863	2684056	+	1193
294	yqeI	2988502	2989311	+	809
295	yqhC	3154262	3155218	-	956
298	zraR	4203320	4204645	+	1325

	Bnum	Name	Str	Start	Stop	Bnum.1	Name.1	Str.1	Start.1	Stop.1	Overlap	New start	Unnamed: 12
0	b0002	thrA	1	337	2799	b0003	thrB	1	2801	3733	-1	NaN	NaN
1	b0003	thrB	1	2801	3733	b0004	thrC	1	3734	5020	0	NaN	NaN
2	b0013	yaaI	2	11382	11786	b0011	NaN	2	10643	11356	-25	11384.0	NaN
3	b0022	insA_1	2	20233	20508	b0021	insB_1	2	19811	20314	82	20342.0	NaN
4	b0024	NaN	1	21181	21399	b0025	yaaC	1	21407	22348	-7	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...
1499	b4380	yjjI	2	4613084	4614634	b4379	yjjW	2	4612249	4613112	29	4613140.0	NaN
1500	b4389	sms	1	4623481	4624863	b4390	nadR	1	4624863	4626116	1	NaN	NaN
1501	b4397	creA	1	4633090	4633563	b4398	creB	1	4633576	4634265	-12	NaN	NaN
1502	b4398	creB	1	4633576	4634265	b4399	creC	1	4634265	4635689	1	NaN	NaN
1503	b4405	NaN	1	3975603	3976217	b3793	rffT	1	3976214	3977566	4	NaN	NaN

TWIST: ORBIT TF Deletion¶

`get_target_oligo()`¶

Complementarity issue¶

TF gene import¶

Gene length considerations¶

Gene overlap considerations¶

`get_target_oligo_df()`¶

Modify to avoid complementarity¶

Final QC and output¶

Computational Environment¶

	Gene Name	Product Name	GO terms (molecular function)	Left-End-Position	Right-End-Position	Direction	length	left_codon	right_codon	left_avd_ovlp	right_avd_ovlp	gene
0	aaeR	LysR-type transcriptional regulator AaeR	transcription regulatory region sequence-specific DNA binding // DNA-binding transcription factor activity // DNA binding	3389520	3390449	+	929	3389522	3390447	3389525.0	3390429.0	aaeR
1	abgR	putative LysR-type DNA-binding transcriptional regulator AbgR	transcription regulatory region sequence-specific DNA binding // DNA binding // DNA-binding transcription factor activity	1404741	1405649	+	908	1404743	1405647	1404746.0	1405629.0	abgR
2	acrR	DNA-binding transcriptional repressor AcrR	transcription regulatory region sequence-specific DNA binding // protein binding // DNA binding // bacterial-type RNA polymerase transcription regulatory region sequence-specific DNA binding // to...	485761	486408	+	647	485763	486406	485766.0	486388.0	acrR
3	ada	DNA-binding transcriptional dual regulator / DNA repair protein Ada	protein binding // transferase activity // methyltransferase activity // metal ion binding // sequence-specific DNA binding // zinc ion binding // catalytic activity // DNA binding // DNA-binding ...	2309341	2310405	+	1064	2309343	2310403	2309361.0	2310400.0	ada
4	adiY	DNA-binding transcriptional activator AdiY	sequence-specific DNA binding // DNA binding // DNA-binding transcription factor activity	4337168	4337929	+	761	4337170	4337927	4337188.0	4337924.0	adiY

TWIST: ORBIT TF Deletion¶

get_target_oligo()¶

Complementarity issue¶

TF gene import¶

Gene length considerations¶

Gene overlap considerations¶

get_target_oligo_df()¶

Modify to avoid complementarity¶

Final QC and output¶

Computational Environment¶

`get_target_oligo()`¶

`get_target_oligo_df()`¶