# Source code for idpconfgen.libs.libfilter

"""Contain functions to filter information from the DB."""
import itertools as it
import re
from functools import partial

import numpy as np

from idpconfgen import log
from idpconfgen.core.definitions import (
    bgeo_CaC,
    bgeo_CaCNp1,
    bgeo_CaCO,
    bgeo_Cm1NCa,
    bgeo_CNp1,
    bgeo_CO,
    bgeo_NCa,
    bgeo_NCaC,
    )
from idpconfgen.libs.libmulticore import pool_function
from idpconfgen.libs.libparse import make_list_if_not


# Matches a literal '(?=(' followed by at least one character and a closing
# ')'. It is the default pattern used by `regex_has_overlap` to decide
# whether a regex string defines overlapping matches.
REGEX_OVERLAP = re.compile(r'\(\?\=\(.+\)')
# Matches a repetition quantifier in any of the forms '{n,m}', '{n}',
# '{n,}' or '{,m}'. Read the comments below.
REGEX_RANGE = re.compile(r'(\{\d+\,\d+\}|\{\d+\}|\{\d+\,\}|\{\,\d+\})')
# a more general version of the above is: r'\{\d*\,*\d*\}' but this would
# accept also r'{}' which is not desired
# also we consider \{\d+\} for simplicity of the algorithm
#
# Captures what a quantifier applies to: either a single word character
# followed by '{' (second group) or the content of a '[...]' class made of
# word characters and spaces followed by '{' (third group).
REGEX_RANGE_CHAR = re.compile(r'((\w)\{|\[([\w ]+)\]\{)')


def make_overlap_regex(s, range_):
    """
    Make an overlap regex.

    Parameters
    ----------
    s : str
        The characters to place inside the regex character class,
        for example ``"L"`` or ``"LHE"``.

    range_ : sequence of two int
        The minimum and maximum number of repetitions, in this order.

    Returns
    -------
    str
        A regex string with the form ``(?=([LHE]{1,5}))``.

    Raises
    ------
    ValueError
        If any of the two values in `range_` is lower than 1, or if
        the second value is lower than the first.
    """
    start, end = range_
    if start < 1 or end < 1:
        raise ValueError(f"Range must be positive: {range_!r}")
    if end < start:
        raise ValueError(f"End must be higher than start: {range_!r}")
    # The capturing group sits inside a lookahead, (?=(...)), so that
    # consecutive matches are allowed to overlap.
    # (?=([LHE]{1,5})), for example.
    return f"(?=([{s}]{{{start},{end}}}))"


# Convenience builders: `make_overlap_regex` with its first argument (the
# characters of the regex class) already filled in. Each of them takes only
# the `range_` argument.
make_loop_overlap_regex = partial(make_overlap_regex, "L")
make_helix_overlap_regex = partial(make_overlap_regex, "H")
make_strand_overlap_regex = partial(make_overlap_regex, "E")
# accepts any of the three characters above
make_any_overlap_regex = partial(make_overlap_regex, "LHE")


def aligndb(db, exact=False):
    """
    Align IDPConfGen DB.

    Concatenates the data of every entry in `db` into flat containers
    that share the same indexing: consecutive entries are separated by
    a single ``'|'`` in the returned strings and by a single row of
    NaN in the returned arrays.

    The input `db` is never modified.

    Parameters
    ----------
    db : dict
        Maps an entry name to a dict with the keys ``'fasta'``,
        ``'dssp'``, ``'omega'``, ``'phi'`` and ``'psi'``. If `exact` is
        true, the eight ``bgeo_*`` keys imported from
        ``idpconfgen.core.definitions`` are read as well.

    exact : bool
        Whether to also align the data stored under the ``bgeo_*``
        keys. Defaults to ``False``.

    Returns
    -------
    tuple
        If `exact` is false: ``(pdbs, angles, dssp, resseq)``.
        If `exact` is true:
        ``(pdbs, angles, bend_angs, bond_lens, dssp, resseq)``.

        ``pdbs`` maps each retained entry name to the slice it occupies
        in the other containers. ``angles`` is a float32 array with
        columns omega, phi, psi. ``bend_angs`` and ``bond_lens`` are
        float32 arrays with four columns each, following the order
        Cm1NCa, NCaC, CaCNp1, CaCO and NCa, CaC, CNp1, CO,
        respectively. ``dssp`` and ``resseq`` are strings.

    Notes
    -----
    Entries for which the truncated sequences do not all have the same
    length are skipped, and a debug message is logged.
    """
    NAN = np.nan

    # one NaN row is added after each entry as spacer; it plays the role
    # that '|' plays in the concatenated strings
    spacer = 1  # algorithm definition

    # accumulators follow the column order of the returned arrays
    angle_accs = ([], [], [])  # omega, phi, psi
    dssp, resseq = [], []
    pdbs = {}

    if exact:
        # the first four keys end up in the bend angles array, the last
        # four in the bond lengths array
        bgeo_keys = (
            bgeo_Cm1NCa,
            bgeo_NCaC,
            bgeo_CaCNp1,
            bgeo_CaCO,
            bgeo_NCa,
            bgeo_CaC,
            bgeo_CNp1,
            bgeo_CO,
            )
    else:
        bgeo_keys = ()
    bgeo_accs = tuple([] for _ in bgeo_keys)

    current = 0
    for pdb, data in db.items():

        # the first and the last residues are discarded because these
        # residues lack the information for the three angles. The first
        # residue lack information on the Omega and Phi angles, because
        # Ca-1--C-1--N--Ca does not exist and C-1--N--Ca--C does not
        # exist also.
        # Likewise, the last residue lacks information for the Psi
        # angle. Hence, the first and last residues for each protein are
        # discarded.
        fasta_truncated = data['fasta'][1:-1]
        dssp_truncated = data['dssp'][1:-1]

        # As described in
        # http://dunbrack.fccc.edu/bbdep2010/Tutorial.php
        # the phi dihedral angle for residue i is defined by
        # Ci-1-Ni-Cαi-Ci; the psi dihedral angle for residue i is defined
        # by Ni-Cαi-Ci-Ni+1; the omega dihedral angle for residue i is
        # defined by Cαi-1-Ci-1-Ni-Cαi.
        # The last omega and phi are discarded.
        # The first psi in the list is discarded.
        # Example:
        # in a 6 residue protein, first and last residues are discarded
        # same of the DSSP information associated
        # M  QWET  Y
        # L  EEEE  L
        #
        # S -> psi
        # O -> omega
        # H -> phi
        #
        # letters identify angles in the backbone
        #
        #     S O H  S O H
        # N-CA-C-N-CA-C-N-CA-C
        torsions = (
            data['omega'][:-1],
            data['phi'][:-1],
            data['psi'][1:],
            )

        # bgeo sequences are used as stored, without truncation
        bgeos = tuple(data[key] for key in bgeo_keys)

        len_segment = len(fasta_truncated)
        lists_to_compare = (dssp_truncated,) + torsions + bgeos

        if any(len(i) != len_segment for i in lists_to_compare):
            log.debug(
                'number of residues, SS chars and angles do not match, '
                f'ignoring... {pdb}'
                )
            continue

        pdbs[pdb] = slice(current, current + len_segment)
        # +1 because resseq will be concatenated with '|'
        # can't avoid +1 because of the need to set the next starting integer
        current += len_segment + spacer

        # The spacer is appended to the accumulators and never to the
        # sequences read from `db`: the bgeo sequences are the very
        # objects stored in `db`, appending to them would corrupt the
        # caller's data and make a second call discard every entry.
        for acc, values in zip(angle_accs + bgeo_accs, torsions + bgeos):
            acc.extend(values)
            acc.append(NAN)

        dssp.append(dssp_truncated)
        resseq.append(fasta_truncated)

    _resseq = '|'.join(resseq)
    _dssp = '|'.join(dssp)
    _angles = np.array(angle_accs, dtype=np.float32).T

    if exact:
        _bend_angs = np.array(bgeo_accs[:4], dtype=np.float32).T
        _bond_lens = np.array(bgeo_accs[4:], dtype=np.float32).T

        return pdbs, _angles, _bend_angs, _bond_lens, _dssp, _resseq

    return pdbs, _angles, _dssp, _resseq

# Regex search strategies implemented below:
# - forward with overlap
# - forward without overlap


def regex_search(sequence, regex_string, rex_range=REGEX_RANGE, **kwargs):
    """
    Search for regex in sequence.

    Parameters
    ----------
    sequence : str
        The sequence where to apply the regex.

    regex_string : str
        The regex to search for.

    rex_range : compiled regex
        A regex to apply in `regex_string` to identify if `regex_string`
        refers to a range. Defaults to `REGEX_RANGE`.

    **kwargs
        Forwarded to `regex_range` when `regex_string` refers to a
        range; ignored otherwise.

    Return
    ------
    list of slices
    """
    assert isinstance(sequence, str)
    assert isinstance(regex_string, str), type(regex_string)

    # if a range exists in regex_string
    # range is defined by default by: L{1}, L{1,5} situations
    # the following function ensures the search goes both directions
    if rex_range.search(regex_string):
        result = regex_range(sequence, regex_string, **kwargs)

    elif regex_has_overlap(regex_string):
        result = regex_forward_with_overlap(sequence, regex_string)

    else:
        result = regex_forward_no_overlap(sequence, regex_string)

    assert isinstance(result, list)
    assert all(isinstance(S, slice) for S in result)  # heavy and slow!!
    return result


def regex_range(sequence, regex_string, ncores=1):
    """
    Find slices of sequence where regex_string applies.

    Searches forward and backwards by means of exploring ranges
    as individual regexes.

    Parameters
    ----------
    sequence : str
        The sequence where to search.

    regex_string : str
        The regex containing ranges, for example ``L{1,5}``.

    ncores : int
        Passed to `pool_function` as its ``ncores`` argument.
        Defaults to 1.

    Returns
    -------
    list
        The results of all the individual regexes, concatenated.
    """
    # prefix and suffix regex for overlap
    if regex_has_overlap(regex_string):
        pre, suf, func = r'(?=(', r'))', regex_forward_with_overlap
    else:
        pre, suf, func = '', '', regex_forward_no_overlap

    # creates all possible regex combinations without ranges
    # from the ranges identified in the original regex_string
    regex_combinations = make_regex_combinations_from_ranges(
        regex_string,
        pre=pre,
        suf=suf,
        )

    # NOTE(review): presumably yields one result of `func` per regex in
    # `regex_combinations` - confirm against libmulticore.pool_function
    exec_pool = pool_function(
        partial(func, sequence),
        regex_combinations,
        ncores=ncores,
        )

    # flattens the per-regex results into a single list
    return list(it.chain.from_iterable(exec_pool))


def regex_has_overlap(regex_string, overlap_rex=REGEX_OVERLAP):
    """
    Find if a `regex_string` defines overlap.

    Parameters
    ----------
    regex_string : str
        The regex string.

    overlap_rex : compiled regex
        The regex to find overlap in `regex_string`.
        Defaults to `REGEX_OVERLAP`.

    Returns
    -------
    bool
    """
    # a single hit is enough, there is no need to collect all matches
    return overlap_rex.search(regex_string) is not None


def make_regex_combinations_from_ranges(regex_string, **kwargs):
    """
    Make all range-free regexes described by `regex_string`.

    Extracts the ranges and the characters from `regex_string` with
    `make_ranges` and combines them with `make_regex_combinations`.

    Parameters
    ----------
    regex_string : str
        The regex containing ranges, for example ``L{1,3}H{2}``.

    **kwargs
        Forwarded to `make_regex_combinations`.

    Returns
    -------
    list
        What `make_regex_combinations` returns.
    """
    ranges, chars = make_ranges(regex_string)
    return make_regex_combinations(ranges, chars, **kwargs)


def make_ranges(
        regex_string,
        rang_rex=REGEX_RANGE,
        char_rex=REGEX_RANGE_CHAR,
        max_range=30,
        ):
    """
    Define a set of ranges and characters from `regex_string`.

    Parameters
    ----------
    regex_string : str
        The regex containing ranges.

    rang_rex : compiled regex
        The regex that finds the ranges in `regex_string`.
        Defaults to `REGEX_RANGE`.

    char_rex : compiled regex
        The regex that finds the characters the ranges apply to.
        Defaults to `REGEX_RANGE_CHAR`.

    max_range : int
        The end (inclusive) given to open-ended ranges such as ``{3,}``.
        Defaults to 30.

    Returns
    -------
    tuple (list of range, list of str)
        The ranges and the characters they apply to, in the order they
        appear in `regex_string`.

    Examples
    --------
        >>> make_ranges('L{1}H{1,2}')
        ([range(1, 2), range(1, 3)], ['L', 'H'])

        >>> make_ranges('L{1,}H{,2}')
        ([range(1, 31), range(1, 3)], ['L', 'H'])
    """
    # requires
    assert isinstance(regex_string, str)

    # each hit is a tuple of groups where either the second (single
    # char, as in 'L{') or the third (char class, as in '[LH]{') is
    # filled
    chars = [hit[1] or hit[2] for hit in char_rex.findall(regex_string)]

    ranges = []
    for found in rang_rex.findall(regex_string):
        # examples of `found` after stripping the braces:
        # '1', '1,', '1,2', ',2'
        # to make ranges compatible with regex, start should be equal to 1
        # and because regex ranges are inclusive, 1 must be added to stop
        ts = found.strip('{}').split(',')

        # 'or 1' covers {,3} cases, that yield ['', '3']
        start = int(ts[0] or 1)

        if len(ts) > 1:
            # 'or max_range' covers {3,} cases, that yield ['3', '']
            end = int(ts[1] or max_range)
        else:
            # covers {1} no-range situations
            end = start

        ranges.append(range(start, end + 1))

    # ensures
    assert isinstance(ranges, list)
    assert isinstance(chars, list)
    assert len(ranges) == len(chars), (len(ranges), len(chars))
    return ranges, chars


def make_regex_combinations(ranges, chars, pre=None, suf=None):
    """
    Make combinations of regexes from `ranges` and `chars`.

    This function is not a general abstraction. Is instead a partial
    abstraction within the problem of IDPConfGen.

    Parameters
    ----------
    ranges : list of range objects
        Ranges where to start and stop searching in the regex.
        Ranges should follow regex conventions, usually 1-indexed
        and stop inclusive.

    chars : list of str
        The chars that will be searched in ranges. An item with more
        than one character is written as a character class, ``[...]``.
        `chars` goes through `make_list_if_not` before being used
        (project helper, presumably wraps a non-list value in a list -
        confirm against libparse).

    pre, suf : str
        Strings to add as prefix and suffix of the generated regexes.
        ``None`` (default) means empty string.

    Returns
    -------
    list of str
        One regex for each element of the cartesian product of
        `ranges`. For ``ranges=[range(1, 3)]`` and ``chars=['L']``
        these are ``'L{1}'`` and ``'L{2}'``.
    """
    pre = pre or ''
    suf = suf or ''

    def make_group(c):
        # several chars must be enclosed in a class to share the range
        return f"[{c}]" if len(c) > 1 else c

    chars = make_list_if_not(chars)

    regex_combinations = []
    for counts in it.product(*ranges):
        body = ''.join(
            make_group(c) + '{' + str(count) + '}'
            for count, c in zip(counts, chars)
            )
        regex_combinations.append(f"{pre}{body}{suf}")
    return regex_combinations


def regex_forward_no_overlap(sequence, regex):
    r"""
    Search for regex forward without overlap.

    Parameters
    ----------
    sequence : str
        The sequence where to search.

    regex : str
        The regex to search for. It must not contain ``(?=``.

    Returns
    -------
    list of slices
        One slice per match, where slice.start and slice.stop follow
        Python conventions.

    Examples
    --------

        r'L'
        r'L{2}'
        r'L{1,3}'

    In the first example, returns all indexes of single char L without
        overlap.

    On the second example, returns all indexes of entire 'LL' sequences
        without overlap. So, 'LLL' returns only slice(0, 2).

    On the third example, returns all sequences with 'LLL' without
        overlap, if a terminal 'LL' is found, returns that. Same for 'L'
        if found at the end.

    Using expressions such as r'(?=(L))' give not correct results.
    Use `regex_forward_with_overlap` instead.
    """
    # lookahead regexes yield zero-width matches, for which m.span()
    # is meaningless; the assert causes an error in case one is given
    # here unwanted
    assert '(?=' not in regex

    # m.span() is used for regexes without overlap
    # using m.start(1) would not work here.
    # See regex_forward_with_overlap
    regex_c = re.compile(regex)
    return [slice(*m.span()) for m in regex_c.finditer(sequence)]


def regex_forward_with_overlap(sequence, regex):
    """
    Find matches for regex in sequence of chars.

    Considers regex defines overlap.

    Accepts only regex expressions with overlap, for example:

        r'(?=(L{3}))'

    Parameters
    ----------
    sequence : str
        The sequence where to search.

    regex : str
        The regex to search for. It must define at least one capturing
        group.

    Returns
    -------
    list of slices
        Where slice.start and slice.stop follow Python conventions.
    """
    regex_c = re.compile(regex)
    # the lookahead itself matches an empty string, hence m.span() would
    # be zero-width; the matched text is the one captured by group 1
    return [slice(m.start(1), m.end(1)) for m in regex_c.finditer(sequence)]