Source code for malva.complexity

from math import log
import numpy as np

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


[docs] def overlapping_windows(sequence, L): """ Returns overlapping windows of size `L` from sequence `sequence` :param sequence: the nucleotide or protein sequence to scan over :param L: the length of the windows to yield """ windows = [] for index, residue in enumerate(sequence): if (index + L) < (len(sequence) + 1): window = sequence[index:L+index] windows.append(window) return windows
[docs] def compute_rep_vector(sequence, N): """ Computes the repetition vector (as seen in Wooton, 1993) from a given sequence of a biopolymer with `N` possible residues. :param sequence: the nucleotide or protein sequence to generate a repetition vector for. :param N: the total number of possible residues in the biopolymer `sequence` belongs to. """ encountered_residues = set() repvec = [] for residue in sequence: if residue not in encountered_residues: residue_count = sequence.count(residue) repvec.append(residue_count) encountered_residues.add(residue) if len(encountered_residues) == N: break while len(repvec) < N: repvec.append(0) return sorted(repvec, reverse=True)
[docs] def complexity(sequence, N): """ Computes the Shannon Entropy of a given sequence of a biopolymer with `N` possible residues. See (Wooton, 1993) for more. :param sequence: the nucleotide or protein sequence whose Shannon Entropy is to calculated. :param N: the total number of possible residues in the biopolymer `sequence` belongs to. """ repvec = compute_rep_vector(sequence, N) L = len(sequence) entropy = sum([-1*(n/L)*log((n/L), N) for n in repvec if n != 0]) return entropy
[docs] def mask_low_complexity(seq_rec, maskchar="N", N=20, L=12): """ Masks low-complexity nucleic/amino acid sequences with a given mask character. :param seq_rec: a string :param maskchar: Character to mask low-complexity residues with. :param N: Number of residues to expect in the sequence. (20 for AA, 4 for DNA) :param L: Length of sliding window that reads the sequence. """ windows = overlapping_windows(seq_rec, L) rep_vectors = [(window, compute_rep_vector(window, N)) for window in windows] window_complexity_pairs = [(rep_vector[0], complexity(rep_vector[1], N)) for rep_vector in rep_vectors] complexities = np.array([complexity(rep_vector[1], N) for rep_vector in rep_vectors]) avg_complexity = complexities.mean() std_complexity = complexities.std() k1_cutoff = min([avg_complexity + std_complexity, avg_complexity - std_complexity]) alignment = [[] for i in range(0, len(seq_rec))] for window_offset, window_complexity_pair in enumerate(window_complexity_pairs): if window_complexity_pair[1] < k1_cutoff: window = "".join([maskchar for i in range(0, L)]) else: window = window_complexity_pair[0] for residue_offset, residue in enumerate(window): i = window_offset+residue_offset alignment[i].append(residue) new_seq = [] for residue_array in alignment: if residue_array.count(maskchar) > 3: new_seq.append(maskchar) else: new_seq.append(residue_array[0]) new_seq = "".join(new_seq) # avoid that there are isolated N's for i in range(1, len(new_seq)-1): if new_seq[i] == 'N' and new_seq[i] != 'N' and new_seq[i] != 'N': new_seq[i] = seq_rec[i] return new_seq