Source code for idpconfgen.libs.libfilter

"""Contain functions to filter information from the DB."""
import itertools as it
import re
from functools import partial

import numpy as np

from idpconfgen import log
from idpconfgen.core.definitions import (
    bgeo_CaC,
    bgeo_CaCNp1,
    bgeo_CaCO,
    bgeo_Cm1NCa,
    bgeo_CNp1,
    bgeo_CO,
    bgeo_NCa,
    bgeo_NCaC,
    )
from idpconfgen.libs.libmulticore import pool_function
from idpconfgen.libs.libparse import make_list_if_not


# Matches the opening of an overlap (lookahead + capture group) construct,
# i.e. a literal '(?=(' followed by at least one character and a ')'.
REGEX_OVERLAP = re.compile(r'\(\?\=\(.+\)')
# Matches a brace quantifier in one of the forms {m,n}, {m}, {m,} or {,n}.
# read comments below
REGEX_RANGE = re.compile(r'(\{\d+\,\d+\}|\{\d+\}|\{\d+\,\}|\{\,\d+\})')
# a more general version of the above is: r'\{\d*\,*\d*\}' but this would
# accept also r'{}' which is not desired
# also we consider \{\d+\} for simplicity of the algorithm
# Matches what is being quantified: either a single word character followed
# by '{' (second group) or a bracketed set of word characters/spaces followed
# by '{' (third group).
REGEX_RANGE_CHAR = re.compile(r'((\w)\{|\[([\w ]+)\]\{)')


def make_overlap_regex(s, range_):
    """
    Build a lookahead regex that captures overlapping runs of characters.

    Parameters
    ----------
    s : str
        Characters placed inside the character class, for example
        ``"LHE"``.

    range_ : sequence of two int
        Minimum and maximum number of repetitions. Both values must be
        at least 1 and the maximum cannot be lower than the minimum.

    Returns
    -------
    str
        A pattern of the form ``(?=([LHE]{1,5}))``.

    Raises
    ------
    ValueError
        If any bound is lower than 1, or if the second bound is lower
        than the first.
    """
    lower, upper = range_

    for bound in range_:
        if bound < 1:
            raise ValueError(f"Range must be positive: {range_!r}")

    if upper < lower:
        raise ValueError(f"End must be higher than start: {range_!r}")

    # the pieces assemble into, e.g., (?=([LHE]{1,5}))
    pieces = (r"(?=([", s, r"]{", str(lower), ",", str(upper), r"}))")
    return "".join(pieces)
# Overlap-regex builders with the character class already bound; each one
# only needs the ``(start, end)`` repetition range.
make_loop_overlap_regex = partial(make_overlap_regex, "L")
make_helix_overlap_regex = partial(make_overlap_regex, "H")
make_strand_overlap_regex = partial(make_overlap_regex, "E")
make_any_overlap_regex = partial(make_overlap_regex, "LHE")
def aligndb(db, exact=False):
    """
    Align the IDPConfGen DB into contiguous, spacer-separated sequences.

    Every entry of `db` is trimmed of its first and last residues and
    appended to the output. Consecutive entries are separated by one
    spacer: a row of NaN in the numeric arrays and a ``'|'`` character in
    the joined strings. Entries whose trimmed sequences do not all have
    the same length are skipped and reported at debug level.

    The input `db` is never modified.

    Parameters
    ----------
    db : dict
        Maps an entry identifier to a dict with the keys ``'fasta'``,
        ``'dssp'``, ``'omega'``, ``'phi'`` and ``'psi'``. When `exact` is
        true, the eight bond geometry keys imported from
        `idpconfgen.core.definitions` are read as well.
        NOTE(review): ``'fasta'`` and ``'dssp'`` are presumably strings,
        as they are joined with ``'|'`` - confirm against the DB writer.

    exact : bool, optional
        Whether to also align bend angles and bond lengths.
        Defaults to ``False``.

    Returns
    -------
    tuple
        ``(pdbs, angles, dssp, resseq)`` when `exact` is false, and
        ``(pdbs, angles, bend_angs, bond_lens, dssp, resseq)`` otherwise.

        ``pdbs`` maps each accepted identifier to the `slice` locating
        its residues in every other returned item. ``angles`` is a
        float32 array with columns omega, phi, psi. ``bend_angs`` and
        ``bond_lens`` are float32 arrays with four columns each, in the
        order Cm1NCa, NCaC, CaCNp1, CaCO and NCa, CaC, CNp1, CO.
        ``dssp`` and ``resseq`` are the ``'|'``-joined trimmed strings.
    """
    NAN = np.nan

    # bond geometry keys are resolved only when requested
    if exact:
        bend_keys = (bgeo_Cm1NCa, bgeo_NCaC, bgeo_CaCNp1, bgeo_CaCO)
        bond_keys = (bgeo_NCa, bgeo_CaC, bgeo_CNp1, bgeo_CO)
    else:
        bend_keys = bond_keys = ()
    geo_keys = bend_keys + bond_keys

    omg, phi, psi = [], [], []
    dssp, resseq = [], []
    geo = [[] for _ in geo_keys]
    pdbs = {}

    # +1 because NAN are added as spacers
    spacer = 1  # algorithm definition
    current = 0

    for pdb, data in db.items():
        # the first and the last residues are discarded because these
        # residues lack the information for the three angles. The first
        # residue lacks information on the Omega and Phi angles, because
        # Ca-1--C-1--N--Ca does not exist and C-1--N--Ca--C does not
        # exist either.
        # Likewise, the last residue lacks information for the Psi
        # angle. Hence, the first and last residues for each protein are
        # discarded.
        fasta_truncated = data['fasta'][1:-1]
        dssp_truncated = data['dssp'][1:-1]

        # As described in
        # http://dunbrack.fccc.edu/bbdep2010/Tutorial.php
        # the phi dihedral angle for residue i is defined by
        # Ci-1-Ni-Cαi-Ci; the psi dihedral angle for residue i is defined
        # by Ni-Cαi-Ci-Ni+1; the omega dihedral angle for residue i is
        # defined by Cαi-1-Ci-1-Ni-Cαi.
        # The last omega and phi are discarded.
        # The first psi in the list is discarded.
        #
        # Example:
        # in a 6 residue protein, first and last residues are discarded
        # same for the DSSP information associated
        # M QWET Y
        # L EEEE L
        #
        # S -> psi
        # O -> omega
        # H -> phi
        #
        # letters identify angles in the backbone
        #
        #     S O H  S O H
        # N-CA-C-N-CA-C-N-CA-C
        omg_truncated = data['omega'][:-1]
        phi_truncated = data['phi'][:-1]
        psi_truncated = data['psi'][1:]

        # these are references to the lists stored in `db`; they are only
        # read from, never appended to
        geo_segments = [data[key] for key in geo_keys]

        len_segment = len(fasta_truncated)

        lists_to_compare = [
            dssp_truncated,
            phi_truncated,
            psi_truncated,
            omg_truncated,
            *geo_segments,
            ]

        if any(len(i) != len_segment for i in lists_to_compare):
            log.debug(
                'number of residues, SS chars and angles do not match, '
                f'ignoring... {pdb}'
                )
            continue

        pdbs[pdb] = slice(current, current + len_segment)
        # +1 because resseq will be concatenated with '|'
        # can't avoid +1 because of the need to set the next starting integer
        current += len_segment + spacer

        # the NAN spacer goes into the accumulators, not the source lists
        angle_pairs = (
            (omg, omg_truncated),
            (phi, phi_truncated),
            (psi, psi_truncated),
            )
        for accumulator, segment in (*angle_pairs, *zip(geo, geo_segments)):
            accumulator.extend(segment)
            accumulator.append(NAN)

        dssp.append(dssp_truncated)
        resseq.append(fasta_truncated)

    _resseq = '|'.join(resseq)
    _dssp = '|'.join(dssp)
    _angles = np.array((omg, phi, psi), dtype=np.float32).T

    if exact:
        n_bend = len(bend_keys)
        _bend_angs = np.array(geo[:n_bend], dtype=np.float32).T
        _bond_lens = np.array(geo[n_bend:], dtype=np.float32).T
        return pdbs, _angles, _bend_angs, _bond_lens, _dssp, _resseq

    return pdbs, _angles, _dssp, _resseq
# # regex to compute # forward with overlap # forward no overlap
def regex_range(sequence, regex_string, ncores=1):
    """
    Find slices of `sequence` where `regex_string` applies.

    The ranges in `regex_string` are expanded into individual fixed-count
    regexes, and each of those is searched for in `sequence`.

    Parameters
    ----------
    sequence : str
        The string to search in.

    regex_string : str
        The regex, with or without an overlap (lookahead) construct.

    ncores : int, optional
        Forwarded to `pool_function`. Defaults to 1.

    Returns
    -------
    list
        The slices found for all expanded regexes, concatenated in the
        order the results are yielded.
    """
    overlapping = regex_has_overlap(regex_string)

    # overlap regexes need the lookahead wrapper and the matching search
    if overlapping:
        search = regex_forward_with_overlap
        prefix, suffix = r'(?=(', r'))'
    else:
        search = regex_forward_no_overlap
        prefix, suffix = '', ''

    # all fixed-count regexes derived from the ranges in `regex_string`
    expanded = make_regex_combinations_from_ranges(
        regex_string,
        pre=prefix,
        suf=suffix,
        )

    results = pool_function(
        partial(search, sequence),
        expanded,
        ncores=ncores,
        )

    return list(it.chain.from_iterable(results))
def regex_has_overlap(regex_string, overlap_rex=REGEX_OVERLAP):
    """
    Tell whether `regex_string` defines overlap.

    Parameters
    ----------
    regex_string : str
        The regex string to inspect.

    overlap_rex : compiled regex, optional
        The pattern that identifies an overlap construct in
        `regex_string`. Defaults to `REGEX_OVERLAP`.

    Returns
    -------
    bool
        ``True`` if `overlap_rex` matches anywhere in `regex_string`.
    """
    return overlap_rex.search(regex_string) is not None
def make_regex_combinations_from_ranges(regex_string, **kwargs):
    """
    Expand the ranges of `regex_string` into fixed-count regexes.

    Parameters
    ----------
    regex_string : str
        Parsed with `make_ranges`.

    **kwargs
        Forwarded to `make_regex_combinations`.

    Returns
    -------
    list of str
        The output of `make_regex_combinations`.
    """
    return make_regex_combinations(*make_ranges(regex_string), **kwargs)
def make_ranges(
        regex_string,
        rang_rex=REGEX_RANGE,
        char_rex=REGEX_RANGE_CHAR,
        max_range=30,
        ):
    """
    Define a set of ranges and characters from `regex_string`.

    Parameters
    ----------
    regex_string : str
        The regex to parse.

    rang_rex : compiled regex, optional
        Finds the brace quantifiers. Defaults to `REGEX_RANGE`.

    char_rex : compiled regex, optional
        Finds the characters the quantifiers apply to. Defaults to
        `REGEX_RANGE_CHAR`.

    max_range : int, optional
        Upper bound used for open-ended quantifiers such as ``{3,}``.
        Defaults to 30.

    Returns
    -------
    tuple (list of range, list of str)
        One range and one character string per quantifier found.

    Examples
    --------
    >>> make_ranges('L{1}H{1,2}')
    ([range(1, 2), range(1, 3)], ['L', 'H'])

    >>> make_ranges('L{1,}H{,2}')
    ([range(1, 31), range(1, 3)], ['L', 'H'])
    """
    # requires
    assert isinstance(regex_string, str)

    # each match is a tuple of groups: index 1 holds the single character
    # case ('L{'), index 2 the bracketed set case ('[LH]{')
    chars = [found[1] or found[2] for found in char_rex.findall(regex_string)]

    ranges = []
    for quantifier in rang_rex.findall(regex_string):
        # examples of the stripped quantifier:
        # '1', '1,', '1,2', ',2'
        bounds = quantifier.strip('{}').split(',')

        # to make ranges compatible with regex, start defaults to 1;
        # this covers {,3} cases, that yield ['', '3']
        first = int(bounds[0] or 1)

        if len(bounds) > 1:
            # 'or max_range' covers {3,} cases, that yield ['3', '']
            last = int(bounds[1] or max_range)
        else:
            # covers {1} no-range situations
            last = first

        # regex ranges are inclusive, hence 1 is added to the stop
        ranges.append(range(first, last + 1))

    # ensures
    assert isinstance(ranges, list)
    assert isinstance(chars, list)
    assert len(ranges) == len(chars), (len(ranges), len(chars))
    return ranges, chars
def make_regex_combinations(ranges, chars, pre=None, suf=None):
    """
    Make combinations of regexes from `ranges` and `chars`.

    This function is not a general abstraction. It is instead a partial
    abstraction within the problem of IDPConfGen.

    Parameters
    ----------
    ranges : list of range objects
        Ranges where to start and stop searching in the regex. Ranges
        should follow regex conventions, usually 1-indexed and stop
        inclusive.

    chars : list of 1 letter chars
        The chars that will be searched in ranges. An entry longer than
        one character is wrapped in square brackets.

    pre, suf : str
        Strings to add as prefix and suffix of the generated regexes.
        A falsy value is treated as an empty string.

    Returns
    -------
    list of str
        One regex per element of the cartesian product of `ranges`.
    """
    prefix = pre or ''
    suffix = suf or ''

    def quantified(char, count):
        # multi-character entries become a character class
        token = f"[{char}]" if len(char) > 1 else char
        return token + '{' + str(count) + '}'

    chars = make_list_if_not(chars)

    return [
        f"{prefix}{''.join(quantified(c, n) for n, c in zip(counts, chars))}{suffix}"
        for counts in it.product(*ranges)
        ]
def regex_forward_no_overlap(sequence, regex):
    r"""
    Search for `regex` in `sequence`, forward and without overlap.

    Examples
    --------
    With ``r'L'``, every single ``'L'`` is returned as its own slice.

    With ``r'L{2}'``, whole ``'LL'`` stretches are returned without
    overlap, so ``'LLL'`` yields only ``slice(0, 2)``.

    With ``r'L{1,3}'``, stretches of up to three ``'L'`` are returned
    without overlap; a shorter terminal stretch is returned as well.

    Expressions defining overlap, such as ``r'(?=(L))'``, do not give
    correct results here. Use `regex_forward_with_overlap` instead.

    Parameters
    ----------
    sequence : str
        The string to search in.

    regex : str
        The regex; must not contain a lookahead construct.

    Returns
    -------
    list of slices
        Where slice.start and slice.stop follow Python conventions.
    """
    # lookahead regexes are rejected: their matches are zero-width, so
    # the span of the whole match would be meaningless
    assert '(?=' not in regex

    # the span of the whole match is used here, not of group 1 as in
    # regex_forward_with_overlap
    found = re.finditer(regex, sequence)
    return [slice(match.start(), match.end()) for match in found]
def regex_forward_with_overlap(sequence, regex):
    """
    Find overlapping matches for `regex` in `sequence`.

    Accepts only regex expressions that define overlap through a
    lookahead with a capture group, for example: r'(?=(L{3}))'.

    Parameters
    ----------
    sequence : str
        The string to search in.

    regex : str
        The overlap regex. Its first capture group delimits each hit.

    Returns
    -------
    list of slices
        Where slice.start and slice.stop follow Python conventions.
    """
    # the lookahead itself is zero-width; the hit is the first group
    found = re.finditer(regex, sequence)
    return [slice(*match.span(1)) for match in found]