[docs]defoverlapping_windows(sequence,L):""" Returns overlapping windows of size `L` from sequence `sequence` :param sequence: the nucleotide or protein sequence to scan over :param L: the length of the windows to yield """windows=[]forindex,residueinenumerate(sequence):if(index+L)<(len(sequence)+1):window=sequence[index:L+index]windows.append(window)returnwindows
[docs]defcompute_rep_vector(sequence,N):""" Computes the repetition vector (as seen in Wooton, 1993) from a given sequence of a biopolymer with `N` possible residues. :param sequence: the nucleotide or protein sequence to generate a repetition vector for. :param N: the total number of possible residues in the biopolymer `sequence` belongs to. """encountered_residues=set()repvec=[]forresidueinsequence:ifresiduenotinencountered_residues:residue_count=sequence.count(residue)repvec.append(residue_count)encountered_residues.add(residue)iflen(encountered_residues)==N:breakwhilelen(repvec)<N:repvec.append(0)returnsorted(repvec,reverse=True)
[docs]defcomplexity(sequence,N):""" Computes the Shannon Entropy of a given sequence of a biopolymer with `N` possible residues. See (Wooton, 1993) for more. :param sequence: the nucleotide or protein sequence whose Shannon Entropy is to calculated. :param N: the total number of possible residues in the biopolymer `sequence` belongs to. """repvec=compute_rep_vector(sequence,N)L=len(sequence)entropy=sum([-1*(n/L)*log((n/L),N)forninrepvecifn!=0])returnentropy
[docs]defmask_low_complexity(seq_rec,maskchar="N",N=20,L=12):""" Masks low-complexity nucleic/amino acid sequences with a given mask character. :param seq_rec: a string :param maskchar: Character to mask low-complexity residues with. :param N: Number of residues to expect in the sequence. (20 for AA, 4 for DNA) :param L: Length of sliding window that reads the sequence. """windows=overlapping_windows(seq_rec,L)rep_vectors=[(window,compute_rep_vector(window,N))forwindowinwindows]window_complexity_pairs=[(rep_vector[0],complexity(rep_vector[1],N))forrep_vectorinrep_vectors]complexities=np.array([complexity(rep_vector[1],N)forrep_vectorinrep_vectors])avg_complexity=complexities.mean()std_complexity=complexities.std()k1_cutoff=min([avg_complexity+std_complexity,avg_complexity-std_complexity])alignment=[[]foriinrange(0,len(seq_rec))]forwindow_offset,window_complexity_pairinenumerate(window_complexity_pairs):ifwindow_complexity_pair[1]<k1_cutoff:window="".join([maskcharforiinrange(0,L)])else:window=window_complexity_pair[0]forresidue_offset,residueinenumerate(window):i=window_offset+residue_offsetalignment[i].append(residue)new_seq=[]forresidue_arrayinalignment:ifresidue_array.count(maskchar)>3:new_seq.append(maskchar)else:new_seq.append(residue_array[0])new_seq="".join(new_seq)# avoid that there are isolated N'sforiinrange(1,len(new_seq)-1):ifnew_seq[i]=='N'andnew_seq[i]!='N'andnew_seq[i]!='N':new_seq[i]=seq_rec[i]returnnew_seq