Source code for idpconfgen.libs.libvalidate

"""
Tools to validate conformers.

Recognition of this module should be grated to:
    - @AlaaShamandy
    - @joaomcteixeira

    Please see:
    https://github.com/julie-forman-kay-lab/IDPConformerGenerator/pull/23
"""
import numpy as np
from scipy.spatial import distance

from idpconfgen import Path
from idpconfgen.core.build_definitions import (
    average_distance_C_Np1,
    average_distance_CA_C,
    average_distance_N_CA,
    )
from idpconfgen.core.definitions import vdW_radii_dict
from idpconfgen.libs.libstructure import (
    Structure,
    col_element,
    col_name,
    col_resSeq,
    cols_coords,
    generate_backbone_pairs_labels,
    )


# Notes on improving distance calculation and vdW energy function
# https://stackoverflow.com/questions/37298710/fastest-way-to-compute-distance-beetween-each-points-in-python
# https://stackoverflow.com/questions/37794849/efficient-and-precise-calculation-of-the-euclidean-distance


[docs]def vdw_clash_by_threshold_common_preparation( protein_atoms, protein_elements, residue_numbers, atoms_to_consider=False, elements_to_consider=False, residues_apart=2, vdW_radii='tsai1999', ): """ Prepare masks for vdW clash calculation. Masks are prepared considering all-to-all distances will be computed using scipy.distance.cdist, so a (N, 3) array originates a (N, N) distance result. `atoms_to_consider` and `elements_to_consider` are evaluated with logical AND, that is, only entries that satisfy both are considered. Parameters ---------- protein_atoms : np.array, of shape (N,), dtype string A sequence of the protein atom names. protein_elements : np.array, of shape (N,), dytpe compatible with '<U2' A sequence of the protein atom elements aligned with `protein_atoms`. residue_numbers : np.array of shape (N,), dtype=int atoms_to_consider : sequence, optional A tuple of the atom names to consider in the calculation. Defaults to FALSE, considers all atoms. elements_to_consider : sequence, optional A tuple of the element types to consider in the calculation. Defaults to FALSE, considers all elements. See Also -------- :py:func:`vdw_clash_by_threshold_calc`. """ # uses all options if values evaluate to FALSE atoms_to_consider = atoms_to_consider or protein_atoms elements_to_consider = elements_to_consider or protein_elements atoms_to_analyze = np.logical_and( np.isin(protein_elements, elements_to_consider), np.isin(protein_atoms, atoms_to_consider), ) # computes only pairs of atoms where both atoms are in atoms_to_consider atc_mask = np.logical_and.outer(atoms_to_analyze, atoms_to_analyze) vdW_vector = np.zeros(protein_atoms.size) for atom, radius in vdW_radii_dict[vdW_radii].items(): vdW_vector[protein_elements == atom] = radius pure_radii_sum = np.add.outer(vdW_vector, vdW_vector) residue_distances = np.subtract.outer(residue_numbers, residue_numbers) return atc_mask, pure_radii_sum, residue_distances >= residues_apart
[docs]def vdw_clash_by_threshold_calc( coords, atc_mask, pure_radii_sum, distances_apart, vdW_overlap=0.0, ): """ Calculate van der Waals clashes from a pure sphere overlap. Other masks used as parameters will be applied to the result of: scipy.distance.cdist(coords, coords, 'euclidean') Parameters ---------- coords : np.array, dtype=np.float, shape (N, 3) The protein XYZ coordinates. atc_mask : np.array, dtype=np.bool, shape (N, N) A boolean masks to filter only the atoms relevant to report. Usually this mask is prepared beforehand and can contain different considerations, such as residues apart and especific atom types. If `coords` contain only the coordinates desired to compute, then `atc_mask` should contain only TRUE entries. pure_radii_sum : np.array, dtype=float, shape (N, N) An all-to-all sum of the vdW radii. In other words, the threeshold after which a clash is considered to exist for each atom pair, before applying `vdW_overlap` allowance. distances_apart : np.array, dtype=int, shape (N, N) An all-to-all atom-to-atom residue to residue distance matrix. vdW_overlap : float The vdW overlap tolerance to apply. Returns ------- tuple rows, cols : of cdist applied to `coords` where clashes where found. distances found for those clashes computed distance threshold overlap, computed overlap distance between threshold and distance """ distances = distance.cdist(coords, coords, 'euclidean') # computes clahses based on overlap, so we can apply a overlap # threshold overlap = pure_radii_sum - distances clashes_raw = overlap >= vdW_overlap rows_cols = np.logical_and( np.logical_and(clashes_raw, atc_mask), distances_apart, ).nonzero() rows, cols = rows_cols return ( rows, cols, distances[rows_cols], pure_radii_sum[rows_cols], overlap[rows_cols], )
[docs]def vdw_clash_by_threshold( coords, protein_atoms, protein_elements, atoms_to_consider, elements_to_consider, residue_numbers, residues_apart=2, vdW_radii='tsai1999', vdW_overlap=0.0, # 07/ago/2020, I have decided not to use **kwargs here # even if that requires a duplication of parameters in this function ): """ Calculate vdW clashes from XYZ coordinates and identity masks. Parameters ---------- coordinates : numpy array, dtype=float, shape (N, 3) The atom XYZ coordinates. protein_atoms : numpy array, dtype=str, shape (N,) The protein atom names. protein_elements : numpy array, dtype=str, shape(N,) The protein atom elements. atoms_to_consider : list-like The atoms in `protein_atoms` to consider in the vdW clash analysis. elements_to_consider : list-like The elements in `protein_elements` to consider in the vdW clash analysis. residue_number : numpy array, dtype=int, shape (N,) The residue number corresponding to each atom. residues_apart : int, optional The minimum number of residues apart to consider for a clash. Defaults to 2. vdW_radii : str, optional The VDW radii set to consider. Defaults to 'tsai1999'. vdW_overlap : float, optional An overlap allowance in Angstroms. Defaults to 0.0, any distance less than vdW+vdW is considered a clash. Returns ------- Same :func:`vdw_clash_by_threshold_calc` returns. """ # The VDW clash analysis is separated in two steps, # 1) the preparation of the data and masks # 2) the actual clash calculation atc_mask, pure_radii_sum, distances_apart = \ vdw_clash_by_threshold_common_preparation( protein_atoms, protein_elements, residue_numbers, atoms_to_consider=atoms_to_consider, elements_to_consider=elements_to_consider, residues_apart=residues_apart, vdW_radii=vdW_radii, ) return vdw_clash_by_threshold_calc( coords, atc_mask, pure_radii_sum, distances_apart, vdW_overlap=vdW_overlap, )
[docs]def evaluate_vdw_clash_by_threshold_from_disk( name, pdb_data, atoms_to_consider, elements_to_consider, **kwargs, ): """ Evaluate clashes in a structure. Created to evaluate conformers from disk. See Also -------- :py:func:`validate_vdw_clashes`. """ s = Structure(pdb_data) s.build() da = s.data_array atom_names = da[:, col_name] atom_elements_pure = da[:, col_element] atom_elements_from_names = atom_names.astype('<U1') # elements_dont_exist = np.logical_not(atom_elements_pure.astype(np.bool)) # # ValueError: invalid literal for int() with base 10: # really? okay, lets do a list comprehension... elements_dont_exist = np.array([not bool(i) for i in atom_elements_pure]) # considers PDBs that lab element information best_observation_ele = np.zeros(atom_elements_pure.shape, dtype='<U2') best_observation_ele[:] = atom_elements_pure[:] best_observation_ele[elements_dont_exist] = \ atom_elements_from_names[elements_dont_exist] res_numbers = da[:, col_resSeq].astype(int) coords = s.coords rows, cols, distances, radii_sum, overlap = vdw_clash_by_threshold( coords, atom_names, # protein atoms best_observation_ele, # protein elements atoms_to_consider, # atoms to consider in the validation elements_to_consider, # elements to consider in the validation res_numbers, **kwargs, ) report = report_vdw_clash(da, rows, cols, distances, radii_sum, overlap) # rows.size is the number of clashes return name, rows.size, report
[docs]def validate_bb_bond_len(coords, tolerance=0.01): """ Validate backbone bond lengths of `coords`. Considers `coords` are already sorted to (N, CA, C) per residue. Considers only (N, CA, C) atoms are present in `coords`. Evalutes against N-CA, CA-C and C-Np1 distances used in IDPConfGen. Parameters ---------- tolerance : float A tolerance in the same units as `coords`. Conflicts under the tolerance are consider valid. Returns ------- np.array, dtype=bool, shape (N-1,) True if bond length is invalid. False if bond length is valid. np.array, dtype=np.float, shape (N-1,) The computed bond distances. np.array, dtype=np.float, shape (N-1,) The expected bond lengths """ expected_bond_length = np.tile( [ average_distance_N_CA, average_distance_CA_C, average_distance_C_Np1 ], coords.shape[0] // 3 ) # https://stackoverflow.com/questions/1401712 # bond_distances = np.linalg.norm(coords[:-1] - coords[1:], axis=1) _coords = coords[:-1] - coords[1:] bond_distances = np.sqrt(np.einsum("ij,ij->i", _coords, _coords)) # true when bond length is too different invalid = np.invert(np.isclose( bond_distances, expected_bond_length[:-1], atol=tolerance, )) return invalid, bond_distances, expected_bond_length[:-1]
[docs]def validate_bb_bonds_len_from_disk( name=None, pdb_data=None, tolerance=0.1, ): """Validate backbone bond lengths of a structure stored in disk.""" # read structure pdb = Path(name) if name else pdb_data s = Structure(pdb) s.build() s.add_filter_backbone(minimal=True) fa = s.filtered_atoms coords = s.get_sorted_minimal_backbone_coords(filtered=False) invalid, bond_distances, expected_bond_length = \ validate_bb_bond_len(coords, tolerance) labels = generate_backbone_pairs_labels(fa) report = report_sequential_bon_len( bond_distances, expected_bond_length, invalid, labels, ) return name, np.sum(invalid), report
[docs]def eval_bb_bond_length_distribution(name, pdb_data): """.""" s = Structure(pdb_data) s.build() coords = s.get_sorted_minimal_backbone_coords() _coords = coords[:-1] - coords[1:] return np.sqrt(np.einsum('ij,ij->i', _coords, _coords))
[docs]def report_vdw_clash(data_array, pair1, pair2, distances, radii_sum, overlap): """ Prepare a report of the identified clashes. Parameters ---------- data_array : np.ndarray, shape (N, M) A numpy data_array as given by :py:attr:`idpconfgen.libs.libstructure.Structure.data_array`. pair1, pair2 : ordered iterable of integers The row indexes where to retrieve atom information from `data_array´. pair1 and pair2 must be aligned in order for the report to make sense, that it, the first item of pair1 clashes with the first item of pair2. distances, threshold : indexable of length equal to pair1/2 length Contain distances and clash thresholds for the different identified clashes. Returns ------- str The report. """ report = [] for i, (p1, p2) in enumerate(zip(pair1, pair2)): n1, r1, a1 = data_array[p1, cols_coords] n2, r2, a2 = data_array[p2, cols_coords] report.append( f"{n1:>5} {r1:>3} {a1:>5} - " f"{n2:>5} {r2:>3} {a2:>5} - " f'r: {radii_sum[i]:.3f} - ' f'd: {distances[i]:.3f} - ' f'o: {overlap[i]:.3f}' ) return '\n'.join(report)
[docs]def report_sequential_bon_len( bond_distances, expected_bond_length, invalid_bool, labels, ): """ Generate a report from bond distances of sequential bonds. Returns ------- string """ BD = bond_distances[invalid_bool] EB = expected_bond_length[invalid_bool] DIFF = np.abs(BD - EB) LABELS = labels[invalid_bool] rows = ( f'{l} - f: {bd:.3f} e: {eb:.3f} - {d:.3f}' for bd, eb, d, l in zip(BD, EB, DIFF, LABELS) ) return '\n'.join(rows)
[docs]def validate_conformer_for_builder( coords, atom_labels, residue_numbers, bb_mask, carbonyl_mask, LOGICAL_NOT=np.logical_not, ISNAN=np.isnan, ): """.""" # no need to compute isnan in whole coordinates because coordinates # either are all nan or are all numbers, hence we use index 0 coords_in_usagee = LOGICAL_NOT(ISNAN(coords[:, 0])) assert coords_in_usagee.shape == (coords.shape[0], ) elements = atom_labels[coords_in_usagee].astype('<U1') # numeric elements refer to the different H atoms as described # in core/build_definitions.py elements[np.char.isnumeric(elements)] = 'H' rows, *_ = vdw_clash_by_threshold( coords[coords_in_usagee, :], atom_labels[coords_in_usagee], elements, False, False, residue_numbers[coords_in_usagee], residues_apart=2, ) if np.any(rows): # not valid return 1 # simulates positive energy else: return -1 # simulates negative energy