Source code for idpconfgen.libs.libbuild

"""Tools for conformer building operations."""
import re
from collections import Counter, defaultdict, namedtuple
from functools import partial
from itertools import cycle

import numpy as np
from numba import njit

# import idpcpp, imported locally at init_faspr_sidechains
from idpconfgen import log
from idpconfgen.core.build_definitions import (
    backbone_atoms,
    bonds_equal_3_inter,
    bonds_le_2_inter,
    distances_C_Np1,
    distances_CA_C,
    distances_N_CA,
    )
from idpconfgen.core.definitions import bgeo_CaCNp1, bgeo_Cm1NCa, bgeo_NCaC
from idpconfgen.libs.libcalc import (
    calc_all_vs_all_dists_njit,
    multiply_upper_diagonal_raw_njit,
    sum_upper_diagonal_raw_njit,
    )
from idpconfgen.libs.libenergyij import (
    default_post_calc_option,
    energycalculator_ij,
    init_coulomb_calculator,
    init_lennard_jones_calculator,
    )
from idpconfgen.libs.libfilter import regex_forward_with_overlap
from idpconfgen.libs.libfunc import make_iterable
from idpconfgen.libs.libmulticore import pool_function
from idpconfgen.libs.libparse import get_mers, translate_seq_to_3l
from idpconfgen.logger import S


# See TODO at init_faspr_sidechains
# Variables related to the sidechain building process.
# # Variables for the FASPR algorithm.
# faspr_sc = idpcpp.faspr_sidechains
# faspr_dun2010_bbdep_str = str(faspr_dun2010bbdep_path)


ConfMasks = namedtuple(
    'ConfMaks',
    [
        'all',
        'bb3',
        'bb4',
        'NHs',
        'COs',
        'Hterm',
        'OXT1',
        'OXT2',
        'cterm',
        'non_Hs',
        'non_Hs_non_OXT',
        'non_NHs_non_OXT',
        'non_sidechains',  # carbon beta included
        'all_sidechains',  # carbon beta included
        'H2_N_CA_CB',
        ]
    )


ConfLabels = namedtuple(
    'ConfLabels',
    [
        'atom_labels',
        'res_nums',
        'res_labels',
        ]
    )
"""
Contain label information for a protein/conformer.

Attributes
----------
atom_labels : np.array
res_nums : np.array
res_labels : np.array
"""


[docs]def build_regex_substitutions(
        s,  # str
        options,  # dict
        pre_treatment=list,
        post_treatment=''.join,
        ):  # -> str
    """
    Build character replacements in regex string.

    Example
    -------
    >>> build_regex_substitutions('ASD', {'S': 'SE'})
    'A[SE]D'

    >>> build_regex_substitutions('ASDS', {'S': 'SE'})
    'A[SE]D[SE]'

    >>> build_regex_substitutions('ASDS', {})
    'ASDS'

    Parameters
    ----------
    s : regex string

    options : dict
        Dictionary of char to multichar substitutions

    pre_treatment : callable, optional
        A treatment to apply in `s` before substitution.
        `pre_treatment` must return a list-like object.
        Default: list because it expects s to be a string.

    post_treatment : callable, optional
        A function to apply on the resulting list-like object
        before returning.
        Default: ''.join, to return a string.
    """
    s_treat = pre_treatment(s)
    for i, char in enumerate(s_treat):
        if char in options:
            s_treat[i] = f'[{options[char]}]'

    return post_treatment(s_treat)


[docs]def init_confmasks(atom_labels):
    """
    Create a ConfMask object (namedtuple).

    ConfMask is a named tuple which attributes are integer masks for the
    respective groups.

    Parameters
    ----------
    atom_labels : array-like
        The atom names of the protein.

    Returns
    -------
    namedtuple
        ConfMasks object.

    Notes
    -----
    ConfMask attributes map to the following atom groups::

        bb3 : N, CA, C
        bb4 : N, CA, C, O
        NHs : amide protons
        Hterm : N-terminal protons
        OXT1 : O atom of C-terminal carboxyl group
        OXT2 : OXT atom of the C-terminal carboxyl group
        cterm : (OXT2, OXT1)
        non_Hs : all but hydrogens
        non_Hs_non_OXT : all but hydrogens and the only OXT atom
        non_NHs_non_OXT : all but NHs and OXT atom
        H2_N_CA_CB : these four atoms from the first residue
                     if Gly, uses HA3.
        non_sidechains : all atoms except sidechains beyond CB
        all_sidechain : all sidechain atoms including CB and HA
    """
    _all = np.ones(len(atom_labels), dtype=bool)

    bb3 = np.where(np.isin(atom_labels, ('N', 'CA', 'C')))[0]
    assert len(bb3) % 3 == 0

    bb4 = np.where(np.isin(atom_labels, ('N', 'CA', 'C', 'O')))[0]
    assert len(bb4) % 4 == 0

    NHs = np.where(atom_labels == 'H')[0]

    # last O belongs to the C-term carboxyl, we don't want it in the carbonyl
    # mask
    COs = np.where(atom_labels == 'O')[0][:-1]

    OXT1 = np.where(atom_labels == 'O')[0][-1]
    # the actual OXT atom, this is only one O of the C-term carboxyl pair
    OXT2 = np.where(atom_labels == 'OXT')[0][0]  # int instead of list

    cterm = [OXT2, OXT1]

    rr = re.compile(r'H+')
    hs_match = np.vectorize(lambda x: bool(rr.match(x)))
    non_Hs = np.where(np.logical_not(hs_match(atom_labels)))[0]
    non_Hs_non_OXT = non_Hs[:-1]

    non_NHs_non_OXT = np.where(np.logical_not(np.isin(atom_labels, ('H', 'OXT'))))[0]  # noqa E:501
    _s = set(atom_labels[non_NHs_non_OXT])
    assert not _s.intersection({'H', 'OXT'}), _s

    non_sidechains = np.where(np.isin(atom_labels, backbone_atoms,))[0]
    all_sidechains = np.where(np.in1d(
        atom_labels,
        ('N', 'CA', 'C', 'O', 'H1', 'H2', 'H3', 'OXT', 'H'),
        invert=True,
        ))

    # for the first residue
    _N_CA_idx = np.where(np.isin(atom_labels, ('N', 'CA')))[0][:2]
    assert len(_N_CA_idx) == 2, _N_CA_idx

    _final_idx = np.where(np.isin(atom_labels, ('CB', 'HA3')))[0][0:1]
    assert len(_final_idx) == 1

    # used to rotate the N-terminal Hs to -60 degrees to  HA during
    # the building process. Only used for Pro and Gly
    # here we capture H2 because H2 exists in all atoms, so we don't break
    # the flow (API). For Pro and Gly, these attributes are not used and
    # that is it.
    _H2_idx = np.where(atom_labels == 'H2')[0]
    assert len(_H2_idx) == 1
    H2_N_CA_CB = list(_H2_idx) + list(_N_CA_idx) + list(_final_idx)
    assert len(H2_N_CA_CB) == 4

    Hterm = np.where(np.isin(atom_labels, ('H1', 'H2', 'H3')))[0]
    assert len(Hterm) in (2, 3)  # proline as no H1.

    conf_mask = ConfMasks(
        all=_all,
        bb3=bb3,
        bb4=bb4,
        NHs=NHs,
        COs=COs,
        Hterm=Hterm,
        OXT1=OXT1,
        OXT2=OXT2,
        cterm=cterm,
        non_Hs=non_Hs,
        non_Hs_non_OXT=non_Hs_non_OXT,
        non_NHs_non_OXT=non_NHs_non_OXT,
        non_sidechains=non_sidechains,
        H2_N_CA_CB=H2_N_CA_CB,
        all_sidechains=all_sidechains,
        )

    return conf_mask


[docs]def init_conflabels(*args, **kwargs):
    """
    Create atom and residue labels from sequence.

    Parameters
    ----------
    *args, **kwargs
        Whichever `:func:create_conformer_labels` accepts.

    Returns
    -------
    namedtuple
        ConfLabels named tuple populated according to input sequence.

    See Also
    --------
    create_conformer_labels()
    ConfLabels
    """
    return ConfLabels(*create_conformer_labels(*args, **kwargs))


[docs]def create_conformer_labels(
        input_seq,
        atom_names_definition,
        transfunc=translate_seq_to_3l
        ):
    """
    Create all atom/residue labels model based on an input sequence.

    The labels are those expected for a all atom model PDB file. Hence,
    residue labels are repeated as needed in order to exist one residue
    label/number per atom.

    Parameters
    ----------
    input_seq : str
        The protein input sequence in 1-letter code format.

    atom_names_definition : dict
        Keys are residue identity and values are list/tuple of strings
        identifying atoms. Atom names should be sorted by the desired
        order.

    transfunc : func
        Function used to translate 1-letter input sequence to 3-letter
        sequence code.

    Returns
    -------
    tuple (atom labels, residue numbers, residue labels)
        Each is a np.ndarray of types: '<U4', int, and '<U3' and
        shape (N,) where N is the number of atoms.
        The three arrays have the same length.
    """
    input_seq_3_letters = transfunc(input_seq)  # note HIS treated as HIP

    # change "H" to "p" as idpconfgen builds histidines as HIP,
    # see `translate_seq_to_3l`
    # can be changed later after TODO for `translate_seq_to_3l` is completed
    input_seq = input_seq.replace("H", "p")

    # /
    # prepares data based on the input sequence
    # considers sidechain all-atoms
    atom_labels = np.array(
        make_list_atom_labels(
            input_seq,
            atom_names_definition,
            )
        )
    num_atoms = len(atom_labels)

    # /
    # per atom labels
    residue_numbers = np.empty(num_atoms, dtype=int)
    residue_labels = np.empty(num_atoms, dtype='<U3')

    # generators
    _res_nums_gen = gen_residue_number_per_atom(atom_labels, start=1)
    _res_labels_gen = \
        gen_3l_residue_labels_per_atom(input_seq_3_letters, atom_labels)

    # fills empty arrays from generators
    _zipit = zip(range(num_atoms), _res_nums_gen, _res_labels_gen)
    for _i, _num, _label in _zipit:
        residue_numbers[_i] = _num
        residue_labels[_i] = _label

    # maniatic cleaning from pre-function isolation
    del _res_labels_gen, _res_nums_gen, _zipit

    # ensure
    assert len(residue_numbers) == num_atoms
    assert len(residue_labels) == num_atoms, (len(residue_labels), num_atoms)
    # ?
    return atom_labels, residue_numbers, residue_labels


[docs]def make_list_atom_labels(input_seq, atom_labels_dictionary):
    """
    Make a list of the atom labels for an `input_seq`.

    Considers the N-terminal to be protonated H1 to H3, or H1 only
    for the case of Proline. Adds also 'OXT' terminal label.

    Parameters
    ----------
    input_seq : str
        1-letter amino-acid sequence.

    atom_labels_dictionary : dict
        The ORDERED atom labels per residue.

    Returns
    -------
    list
        List of consecutive atom labels for the protein.
    """
    labels = []
    LE = labels.extend

    first_residue_atoms = atom_labels_dictionary[input_seq[0]]

    # the first residue is a special case, we add here the three protons
    # for consistency with the forcefield
    # TODO: parametrize? multiple forcefields?
    for atom in first_residue_atoms:
        # however, pro.pdb does not have an "H" atom.
        if atom == 'H' and input_seq[0] != "P":
            LE(('H1', 'H2', 'H3'))
        else:
            labels.append(atom)

    if input_seq[0] == "P":
        LE(("H2", "H3"))

    for residue in input_seq[1:]:
        LE(atom_labels_dictionary[residue])

    labels.append('OXT')

    assert Counter(labels)['N'] == len(input_seq)
    assert labels[-1] == 'OXT'
    if input_seq[0] != "P":
        assert 'H1' in labels
    assert 'H2' in labels
    assert 'H3' in labels
    return labels


[docs]def gen_residue_number_per_atom(atom_labels, start=1):
    """
    Create a list of residue numbers based on atom labels.

    This is a contextualized function, not an abstracted one.
    Considers `N` to be the first atom of the residue.

    Yields
    ------
    ints
        The integer residue number per atom label.
    """
    assert atom_labels[0] == 'N', atom_labels[0]

    # creates a seamless interface between human and python 0-indexes
    start -= 1
    for al in atom_labels:
        if al == 'N':
            start += 1
        yield start


[docs]def gen_3l_residue_labels_per_atom(
        input_seq_3letter,
        atom_labels,
        ):
    """
    Generate residue 3-letter labels per atom.

    Parameters
    ----------
    input_seq_3letter : list of 3letter residue codes
        Most not be a generator.

    atom_labels : list or tuple of atom labels
        Most not be a generator.

    Yields
    ------
    String of length 3
         The 3-letter residue code per atom.
    """
    _count_labels = Counter(atom_labels)['N']
    _len = len(input_seq_3letter)
    assert _count_labels == _len, (_count_labels, _len)

    counter = -1
    for atom in atom_labels:
        if atom == 'N':
            counter += 1
        yield input_seq_3letter[counter]


[docs]def get_cycle_distances_backbone():
    """
    Return an inifinite iterator of backbone atom distances.

    Sampling, in order, distances between atom pairs:
        - N - Ca, used for OMEGA
        - Ca - C, used for PHI
        - C - N(+1), used for PSI
    """
    return cycle((
        distances_N_CA,  # used for OMEGA
        distances_CA_C,  # used for PHI
        distances_C_Np1,  # used for PSI
        ))


[docs]def get_cycle_bond_type():
    """
    Return an infinite interator of the bond types.

    Labels returns are synced with bgeo library.
    See `core.definitions.bgeo_*`.
    """
    return cycle((
        bgeo_Cm1NCa,  # used for OMEGA
        bgeo_NCaC,  # used for PHI
        bgeo_CaCNp1,  # used for PSI
        ))


[docs]def prepare_slice_dict(
        primary,
        input_seq,
        csss=False,
        dssp_regexes=None,
        secondary=None,
        mers_size=(1, 2, 3, 4, 5),
        res_tolerance=None,
        ncores=1,
        ):
    """
    Prepare a dictionary mapping fragments to slices in `primary`.

    Protocol:

    1) The input sequence is split into all different possible smaller peptides
    according to the `mers_size` tuple. Let's call these smaller peptides,
    XMERS.

    2) We search in the `primary` string where are all possible sequence matches
    for each of the XMERS. And, we consider possible residue substitution in the
    XMERS according to the `res_tolerance` dictionary, if given.

    2.1) the found positions are saved in a list of slice objects that populates
    the final dictionary (see Return section).

    2.2) We repeat the process but considering the XMER can be followed by a
    proline.

    2.3) We save in the dictionary only those entries for which we find matches
    in the `primary`.

    3) optional if `csss` is given. Here, we rearrange the output dictionary so
    it becomes compatible with the build process. The process goes as follows:

    3.1) For each slice found in 2) we inspect the `secondary` string if it
    matches any of the `dssp_regex`. If it matches we consider that slice to the
    fragment-size, the XMER identify, the DSSP key. This allow us in the build
    process to specify which SS to sample for specific regions of the conformer.

    Parameters
    ----------
    primary : str
        A concatenated version of all primary sequences in the database.
        In the form of "QWERY|IPASDF", etc.

    input_seq : str
        The 1-letter code amino-acid sequence of the conformer to construct.

    csss : bool
        Whether to update the output according ot the CSSS probabilities of
        secondary structures per amino acid residue position. Will only be used
        when CSSS is activated.

    dssp_regexes : list-like
        List of all DSSP codes to look for in the sequence.
        Will only be used when `csss` is True.

    secondary : str
        A concatenated version of secondary structure codes that correspond to
        primary. In the form of "LLLL|HHHH", etc. Only needed if `csss` True.

    mers_size : iterable
        A iterable of integers denoting the size of the fragments to search
        for. Defaults from 1 to 5.

    res_tolerance : dict
        A dictionary mapping residue tolerances, for example:
        {"A": "AIL"}, noting Ala can be replaced by Ile and Leu in the
        search (this is a dummy example).

    ncores : int
        The number of processors to use.

    Return
    ------
    dict
        A dict with the given mapping:

            1) First key-level of the dict is the length of the fragments,
            hence, integers.

            2) The second key level are the residue fragments found in the
            `primary`. A fragment in input_seq but not in `primary` is removed
            from the dict.

            3) only if `csss` is True. Adds a new layer organizing the slice
            objects with the SS keys.
    """
    res_tolerance = res_tolerance or {}

    log.info('preparing regex xmers')

    # gets all possible xmers from the input sequence (with overlap)
    mers_size = make_iterable(mers_size)
    xmers = [tuple(get_mers(input_seq, i)) for i in mers_size]

    combined_dssps = make_combined_regex(dssp_regexes)
    log.info(S(f"searching database with DSSP regexes: {combined_dssps}"))

    consume = partial(
        populate_dict_with_database,
        res_tolerance=res_tolerance,
        primary=primary,
        secondary=secondary,
        combined_dssps=combined_dssps,
        )

    execute = pool_function(consume, xmers, ncores=ncores)
    slice_dict = {}
    for lmer_size, slice_dict_ in execute:
        slice_dict[lmer_size] = slice_dict_

    # add single residue in case not selected by the user
    if 1 not in slice_dict:
        xmers_1 = tuple(get_mers(input_seq, 1))
        _lmer_ignore, _pop = populate_dict_with_database(
            xmers_1,
            res_tolerance=None,
            primary=primary,
            secondary=secondary,
            combined_dssps="L",
            )
        slice_dict[1] = _pop

    # from pprint import pprint
    # with open('superdict', 'w') as fout:
    #     pprint(slice_dict, stream=fout)

    if csss:
        ss_dict = {}
        csss_slice_dict = defaultdict(dict)
        for lmer in slice_dict:  # lmer = 1, 2, 3, 4, 5
            for aa in slice_dict[lmer]:  # aa = altered_mer or altered_mer_P
                # each sequence should have a new ss_dict
                ss_dict = defaultdict(list)
                for sd in slice_dict[lmer][aa]:  # sd is a slice() object
                    # here we cannot use the trick with `combined_dssps`
                    # because we need to know exactly which dssp we have found
                    for ss in dssp_regexes:
                        if re.fullmatch(f"[{ss}]+", secondary[sd]):
                            # save sd to ss_dict
                            ss_dict[ss].append(sd)
                csss_slice_dict[lmer][aa] = ss_dict

        return csss_slice_dict

    return slice_dict


[docs]def prepare_energy_function(
        atom_labels,
        residue_numbers,
        residue_labels,
        forcefield,
        lj_term=True,
        coulomb_term=False,
        energy_type_ij=default_post_calc_option,
        **kwnull,
        ):
    """
    Prepare energy function.

    Parameters
    ----------
    lj_term : bool
        Whether to compute the Lennard-Jones term during building and
        validation. If false, expect a physically meaningless result.

    coulomb_term : bool
        Whether to compute the Coulomb term during building and
        validation. If false, expect a physically meaningless result.

    energy_type_ij : str
        How to calculate the energy for `ij` pairs. See
        `libs.libenergyij.post_calc_options`.
    """
    # this mask identifies covalently bonded pairs and pairs two bonds apart
    bonds_le_2_mask = create_bonds_apart_mask_for_ij_pairs(
        atom_labels,
        residue_numbers,
        residue_labels,
        forcefield.bonds_le2_intra,
        bonds_le_2_inter,
        base_bool=False,
        )

    # this mask identifies pairs exactly 3 bonds apart
    bonds_exact_3_mask = create_bonds_apart_mask_for_ij_pairs(
        atom_labels,
        residue_numbers,
        residue_labels,
        forcefield.bonds_eq3_intra,
        bonds_equal_3_inter,
        )

    # /
    # assemble energy function
    energy_func_terms = []

    if lj_term:

        acoeff, bcoeff = create_LJ_params_raw(
            atom_labels,
            residue_numbers,
            residue_labels,
            forcefield.forcefield,
            )

        # 0.2 as 0.4
        _lj14scale = float(forcefield.forcefield['lj14scale'])
        acoeff[bonds_exact_3_mask] *= _lj14scale * 0.2
        bcoeff[bonds_exact_3_mask] *= _lj14scale * 0.2
        acoeff[bonds_le_2_mask] = np.nan
        bcoeff[bonds_le_2_mask] = np.nan

        lf_calc = init_lennard_jones_calculator(
            acoeff,
            bcoeff,
            postf=energy_type_ij,
            )
        energy_func_terms.append(lf_calc)
        log.info('prepared lj')

    if coulomb_term:

        charges_ij = create_Coulomb_params_raw(
            atom_labels,
            residue_numbers,
            residue_labels,
            forcefield.forcefield,
            )

        charges_ij[bonds_exact_3_mask] *= float(forcefield.forcefield['coulomb14scale'])  # noqa: E501
        charges_ij[bonds_le_2_mask] = np.nan

        coulomb_calc = init_coulomb_calculator(charges_ij, postf=energy_type_ij)
        energy_func_terms.append(coulomb_calc)
        log.info('prepared Coulomb')

    # in case there are iji terms, I need to add here another layer
    calc_energy = energycalculator_ij(
        calc_all_vs_all_dists_njit,
        energy_func_terms,
        )
    log.info('done preparing energy func')
    return calc_energy


[docs]def create_bonds_apart_mask_for_ij_pairs(
        atom_labels,
        residue_numbers,
        residue_labels,
        bonds_intra,
        bonds_inter,
        base_bool=False,
        ):
    """
    Create bool mask array identifying the pairs X bonds apart in ij pairs.

    Given `bonds_intra` and `bonds_inter` criteria, idenfities those ij
    atom pairs in N*(N-1)/2 condition (upper all vs all diagonal) that
    agree with the described bonds.

    Inter residue bonds are only considered for consecutive residues.

    Parameters
    ----------
    atom_labels : iterable, list or np.ndarray
        The protein atom labels. Ex: ['N', 'CA, 'C', 'O', 'CB', ...]

    residue_numbers : iterable, list or np.ndarray
        The protein residue numbers per atom in `atom_labels`.
        Ex: [1, 1, 1, 1, 1, 2, 2, 2, 2, ...]

    residue_labels : iterable, list or np.ndarray
        The protein residue labels per atom in `atom_labels`.
        Ex: ['Met', 'Met', 'Met', ...]

    See Also
    --------
    :py:func:`gen_ij_pairs_upper_diagonal`
    :py:func:`gen_atom_pair_connectivity_masks`
    """
    atom_labels_ij_gen = gen_ij_pairs_upper_diagonal(atom_labels)
    residue_numbers_ij_gen = gen_ij_pairs_upper_diagonal(residue_numbers)
    residue_labels_ij_gen = gen_ij_pairs_upper_diagonal(residue_labels)

    bonds_indexes_gen = gen_atom_pair_connectivity_masks(
        residue_labels_ij_gen,
        residue_numbers_ij_gen,
        atom_labels_ij_gen,
        bonds_intra,
        bonds_inter,
        )

    num_ij_pairs = len(atom_labels) * (len(atom_labels) - 1) // 2
    other_bool = not base_bool
    bonds_mask = np.full(num_ij_pairs, base_bool)

    for idx in bonds_indexes_gen:
        bonds_mask[idx] = other_bool

    return bonds_mask


[docs]def gen_ij_pairs_upper_diagonal(data):
    """
    Generate upper diagonal ij pairs in tuples.

    The diagonal is not considered.

    Yields
    ------
    tuple of length 2
        IJ pairs in the form of N*(N-1) / 2.
    """
    for i in range(len(data) - 1):
        for j in range(i + 1, len(data)):
            yield (data[i], data[j])


[docs]def gen_atom_pair_connectivity_masks(
        res_names_ij,
        res_num_ij,
        atom_names_ij,
        connectivity_intra,
        connectivity_inter,
        ):
    """
    Generate atom pair connectivity indexes.

    Given atom information for the ij pairs and connectivity criteria,
    yields the index of the ij pair if the pair is connected according
    to the connectivity criteria.

    For example, if the ij pair is covalently bonded, or 3 bonds apart,
    etc.

    Parameters
    ----------
    res_names_ij
    res_num_ij,
    atom_names_ij, iterables of the same length and synchronized information.

    connectivity_intra,
    connectivity_inter, dictionaries mapping atom labels connectivity

    Depends
    -------
    `are_connected`
    """
    zipit = zip(res_names_ij, res_num_ij, atom_names_ij)
    counter = 0
    for (rn1, _), (n1, n2), (a1, a2) in zipit:

        found_connectivity = are_connected(
            int(n1),
            int(n2),
            rn1,
            a1,
            a2,
            connectivity_intra,
            connectivity_inter,
            )

        if found_connectivity:
            yield counter

        counter += 1


[docs]def create_LJ_params_raw(
        atom_labels,
        residue_numbers,
        residue_labels,
        force_field,
        ):
    """Create ACOEFF and BCOEFF parameters."""
    sigmas_ii = extract_ff_params_for_seq(
        atom_labels,
        residue_numbers,
        residue_labels,
        force_field,
        'sigma',
        )

    epsilons_ii = extract_ff_params_for_seq(
        atom_labels,
        residue_numbers,
        residue_labels,
        force_field,
        'epsilon',
        )

    num_ij_pairs = len(atom_labels) * (len(atom_labels) - 1) // 2
    # sigmas
    sigmas_ij_pre = np.empty(num_ij_pairs, dtype=np.float64)
    sum_upper_diagonal_raw_njit(np.array(sigmas_ii), sigmas_ij_pre)
    #
    # epsilons
    epsilons_ij_pre = np.empty(num_ij_pairs, dtype=np.float64)
    multiply_upper_diagonal_raw_njit(
        np.array(epsilons_ii),
        epsilons_ij_pre,
        )
    #

    # mixing rules
    epsilons_ij = epsilons_ij_pre ** 0.5
    # mixing + nm to Angstrom converstion
    # / 2 and * 10
    sigmas_ij = sigmas_ij_pre * 5

    acoeff = 4 * epsilons_ij * (sigmas_ij ** 12)
    bcoeff = 4 * epsilons_ij * (sigmas_ij ** 6)

    return acoeff, bcoeff


[docs]def create_Coulomb_params_raw(
        atom_labels,
        residue_numbers,
        residue_labels,
        force_field,
        ):
    """."""
    charges_i = extract_ff_params_for_seq(
        atom_labels,
        residue_numbers,
        residue_labels,
        force_field,
        'charge',
        )

    num_ij_pairs = len(atom_labels) * (len(atom_labels) - 1) // 2
    charges_ij = np.empty(num_ij_pairs, dtype=np.float64)
    multiply_upper_diagonal_raw_njit(charges_i, charges_ij)
    charges_ij *= 0.25  # dielectic constant

    return charges_ij


[docs]def extract_ff_params_for_seq(
        atom_labels,
        residue_numbers,
        residue_labels,
        force_field,
        param,
        ):
    """
    Extract a parameter from forcefield dictionary for a given sequence.

    Parameters
    ----------
    atom_labels, residue_numbers, residue_labels
        As returned by `:func:create_conformer_labels`.

    forcefield : dict

    param : str
        The param to extract from forcefield dictionary.

    See Also
    --------
    :py:func:`create_conformer_labels`
    """
    params_l = []
    params_append = params_l.append

    zipit = zip(atom_labels, residue_numbers, residue_labels)
    for atom_name, res_num, res_label in zipit:

        # adds C to the terminal residues
        if res_num == residue_numbers[-1]:
            res = 'C' + res_label
            was_in_C_terminal = True
            assert res.isupper() and len(res) == 4, res

        elif res_num == residue_numbers[0]:
            res = 'N' + res_label
            was_in_N_terminal = True
            assert res.isupper() and len(res) == 4, res

        else:
            res = res_label

        # TODO:
        # define protonation state in parameters
        if res_label.endswith('HIS'):
            res_label = res_label[:-3] + 'HIP'

        try:
            # force field atom type
            atype = force_field[res][atom_name]['type']

        # TODO:
        # try/catch is here to avoid problems with His...
        # for this purpose we are only using side-chains
        except KeyError:
            raise KeyError(tuple(force_field[res].keys()))

        params_append(float(force_field[atype][param]))

    assert was_in_C_terminal, \
        'The C terminal residue was never computed. It should have.'
    assert was_in_N_terminal, \
        'The N terminal residue was never computed. It should have.'

    assert isinstance(params_l, list)
    return params_l


[docs]def are_connected(n1, n2, rn1, a1, a2, bonds_intra, bonds_inter):
    """
    Detect if a certain atom pair is bonded accordind to criteria.

    Considers only to the self residue and next residue
    """
    # requires
    assert isinstance(n1, int) and isinstance(n2, int), (type(n1), type(n2))
    assert all(isinstance(i, str) for i in (rn1, a1, a2)), \
        (type(i) for i in (rn1, a1, a2))
    assert all(isinstance(i, dict) for i in (bonds_intra, bonds_inter)), \
        (type(i) for i in (bonds_intra, bonds_inter))

    answer = (
        (n1 == n2 and a2 in bonds_intra[rn1][a1])
        or (
            n1 + 1 == n2
            and (
                a1 in bonds_inter  # this void KeyError
                and a2 in bonds_inter[a1]
                )
            )
        )

    assert isinstance(answer, bool)
    return answer


[docs]def create_sidechains_masks_per_residue(
        residue_numbers,
        atom_labels,
        backbone_atoms,
        ):
    """
    Create a map of numeric indexing masks pointing to side chains atoms.

    Create separate masks per residue.

    Parameters
    ----------
    residue_numbers : np.ndarray, shape (N,)
        The atom residue numbers of the protein.

    atom_labels : np.ndarray, shape (N,)
        The atom labels of the protein.

    backbone_atoms : list or tuple
        The labels of all possible backbone atoms.

    Returns
    -------
    list of tuples of length 2
        List indexes refer to protein residues, index 0 is residue 1.
        Per residue, a tuple of length 2 is given. Tuple index 0 are
        the indexes of that residue sidechain atoms mapped to an array
        of the `atom_labels` and `residue_numbers` characteristics.
        The tuple index 1 is an array of length M, where M is the number
        of sidechain atoms for that residue, defaults to np.nan.
    """
    assert residue_numbers.size == atom_labels.size
    assert type(backbone_atoms) in (list, tuple, np.ndarray)

    ss = []
    ssa = ss.append

    is_backbone = np.isin(atom_labels, backbone_atoms)
    is_sidechain = np.logical_not(is_backbone)

    float64 = np.float64
    full = np.full
    logical_and = np.logical_and
    nan = np.nan
    where = np.where

    # sorted here is mandatory because ss indexes must follow the residue
    # numbering
    for resnum in sorted(list(set(residue_numbers))):
        is_residue = residue_numbers == resnum
        bool_result = logical_and(is_residue, is_sidechain)
        ssa((
            where(bool_result)[0],
            full((np.sum(bool_result), 3), nan, dtype=float64)
            ))

    return ss


# njit available
[docs]def get_indexes_from_primer_length(
        sequence,
        plen,
        current_residue,
        ):
    """Get sequence fragment based on position and length."""
    if plen == 1:
        return current_residue
    elif plen == 2:
        return sequence[current_residue: current_residue + 2]
    elif plen == 3:
        return sequence[current_residue - 1: current_residue + 3]
    elif plen == 4:
        return sequence[current_residue - 1: current_residue + 4]
    elif plen == 5:
        return sequence[current_residue - 2: current_residue + 5]
    elif plen == 6:
        return sequence[current_residue - 2: current_residue + 6]
    elif plen == 7:
        return sequence[current_residue - 3: current_residue + 7]


get_idx_primer_njit = njit(get_indexes_from_primer_length)


[docs]def make_combined_regex(regexes):
    """
    Make a combined regex with ORs.

    To be used with re.fullmatch.
    """
    if len(regexes) > 1:
        regexes_ = (f'[{r}]*' if len(r) > 1 else f'{r}+' for r in regexes)
        return "(" + '|'.join(regexes_) + ")"
    if len(regexes) == 1:
        return "([" + regexes[0] + "]+)"
    else:
        raise AssertionError("Regexes are expected to have something.")


[docs]def populate_dict_with_database(
        xmers,
        res_tolerance,
        primary,
        secondary,
        combined_dssps):
    """
    Identify sampling positions.

    Identifies slices in `primary` and `secondary` where the `combined_dssps`
    regexes apply. Considers also residue tolerance.

    This function is used internally for `prepare_slice_dict` with
    multiprocessing.

    Parameters
    ----------
    xmers : list
        The list of all protein fragments we want to search in the `primary`
        and `secondary` "database" strings. This list should contain only
        sequence of the same length.

    combined_dssps : string
        A string regex prepared with all the combined DSSPs that need to
        be searched: see code in `prepare_slice_dict` and `make_combined_regex`.

    others
        Other parameters are like described in `prepare_slice_dict`.

    Returns
    -------
    int, dict
        The length of the xmers in `xmers` list.
        The dictionary with the identified slice positions.
    """
    slice_dict = {}
    lmer = len(xmers[0])  # all xmers are expected to have the same length
    for mer in xmers:

        if res_tolerance:
            altered_mer = build_regex_substitutions(mer, res_tolerance)
        else:
            altered_mer = mer

        merregex = f'(?=({altered_mer}))'

        slices_ = regex_forward_with_overlap(primary, merregex)
        lmer_alter_list = slice_dict.setdefault(altered_mer, [])
        for slc_ in slices_:  # ultra-slow
            if re.fullmatch(combined_dssps, secondary[slc_]):
                lmer_alter_list.append(slc_)
        if not lmer_alter_list:
            slice_dict.pop(altered_mer)

        # this is a trick to find the sequence that are proceeded
        # by Proline. Still, these are registered in the "lmer" size
        # without consider the proline addition.
        # this is a combo with get_adjancent_angles
        merregex_P = f'(?=({altered_mer}P))'
        altered_mer_P = f'{altered_mer}_P'

        slices_ = regex_forward_with_overlap(primary, merregex_P)
        lmer_alter_list = slice_dict.setdefault(altered_mer_P, [])
        for slc_ in slices_:  # ultra-slow
            if re.fullmatch(combined_dssps, secondary[slc_]):
                lmer_alter_list.append(slc_)
        if not lmer_alter_list:
            slice_dict.pop(altered_mer_P)

    return lmer, slice_dict