# Source code for idpconfgen.libs.libpdb

"""Contain  handlers of PDB information."""
import functools
import os
import re
from collections import defaultdict

from idpconfgen import Path, log
from idpconfgen.core import exceptions as EXCPTS
from idpconfgen.core.definitions import aa3to1
from idpconfgen.logger import S


# Raw strings: `\s` and `\d` are regex classes, not Python string escapes.
# Matches a whole `MODEL <number>` line, at the start of the text or after
# a newline.
RE_MODEL = re.compile(r'(\n|^)MODEL\s*\d+\s*\n')
# Matches a whole `ENDMDL` line that is preceded by a newline.
RE_ENDMDL = re.compile(r'\nENDMDL\s*\n')


# Templates used to write the atom name:
# `_3` left-aligns the name in 3 columns after one leading space,
# `_4` left-aligns the name in 4 columns.
_3 = ' {:<3s}'
_4 = '{:<4s}'
# Template lookup: outer key is the length of the element symbol,
# inner key is the length of the atom name.
# Any other combination of lengths is an error.
_atom_format_dict = {
    1: {1: _3, 2: _3, 3: _3, 4: _4},
    2: dict.fromkeys((1, 2, 3, 4), _4),
    }


def delete_insertions(lines):
    """
    Delete insertions.

    Residues carrying an insertion code, or repeating an already seen
    chain + residue number combination, lose their insertion code and
    are renumbered so that residue numbers keep increasing.

    Adapted from pdbtools and optimized for this context.

    Visit pdb-tools at:
    https://github.com/haddocking/pdb-tools/blob/master/pdbtools/pdb_delinsertion.py

    Parameters
    ----------
    lines : iterable of str
        PDB coordinate lines. Every line is indexed up to column 27
        and its columns 23-26 are converted with `int`, so shorter
        lines or lines without a residue number raise `IndexError` or
        `ValueError` respectively.

    Yields
    ------
    str
        The input line with the residue number shifted by the running
        offset and, where applicable, the insertion code blanked.
    """  # noqa: E501
    # number of residue positions inserted so far; never reset
    offset = 0
    # `resname + chain + resid + icode` of the residue being processed
    prev_resi = None
    # chain + resid combinations (original numbering) already yielded
    seen_ids = set()
    clean_icode = False
    for line in lines:
        res_uid = line[17:27]  # resname, chain, resid
        id_res = line[21] + line[22:26].strip()  # A99, B12
        icode = line[26]

        # unfortunately, this is messy but not all PDB files follow a nice
        # order of ' ', 'A', 'B', ... when it comes to insertion codes..
        if prev_resi != res_uid:  # new residue
            if id_res in seen_ids:
                # chain + resid seen before: this is an insertion, with or
                # without an explicit icode ('A' ... ' ' ... 'B')
                clean_icode = True
                offset += 1
            else:
                # first time this chain + resid shows up: keep its number,
                # only blank an explicit icode if there is one
                clean_icode = icode != ' '

            prev_resi = res_uid

        if clean_icode:
            line = f'{line[:26]} {line[27:]}'

        resid = int(line[22:26]) + offset
        line = f'{line[:22]}{str(resid).rjust(4)}{line[26:]}'
        seen_ids.add(id_res)
        yield line


def format_atom_name(atom, element, AFD=_atom_format_dict):
    """
    Format PDB Record line Atom name.

    Further Reading:

    * https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html

    Parameters
    ----------
    atom : str
        The atom name.

    element : str
        The atom element code.

    AFD : dict, optional
        Nested mapping ``{len(element): {len(atom): template}}`` that
        provides the format template for the atom name.
        Defaults to the module-level `_atom_format_dict`.

    Returns
    -------
    str
        Formatted atom name.

    Raises
    ------
    KeyError
        If `AFD` has no template for the lengths of the stripped
        `element` and `atom`.
    """
    atm = atom.strip()
    len_atm = len(atm)
    len_ele = len(element.strip())

    try:
        return AFD[len_ele][len_atm].format(atm)
    except KeyError as err:
        emsg = f'Could not format this atom:type -> {atom}:{element}'
        # raising KeyError assures that no context in IDPConfGen
        # will handle it. @joaomcteixeira never handles pure Python
        # exceptions, those are treated as bugs.
        raise KeyError(emsg) from err


def format_chainid(chain):
    """
    Format chain identifier to one letter.

    This is required to receive chain IDs from mmCIF files, which
    may have more than one letter.

    Parameters
    ----------
    chain : str
        The chain identifier, possibly padded with whitespace.

    Returns
    -------
    str
        The first non-whitespace character of `chain`.

    Raises
    ------
    IndexError
        If `chain` is empty or contains only whitespace.
    """
    return chain.strip()[0]


# Slices that cut a fixed-width atom line into its fields.
# order matters
atom_slicers = [
    slice(0, 6),
    slice(6, 11),
    slice(12, 16),
    slice(16, 17),
    slice(17, 20),
    slice(21, 22),
    slice(22, 26),
    slice(26, 27),
    slice(30, 38),
    slice(38, 46),
    slice(46, 54),
    slice(54, 60),
    slice(60, 66),
    slice(72, 76),
    slice(76, 78),
    slice(78, 80),
    ]

# One name per slicer, bound in the same order as the list above.
(
    atom_record,
    atom_serial,
    atom_name,
    atom_altLoc,
    atom_resName,
    atom_chainID,
    atom_resSeq,
    atom_iCode,
    atom_x,
    atom_y,
    atom_z,
    atom_occ,
    atom_temp,
    atom_segid,
    atom_element,
    atom_model,
) = atom_slicers


# Format template for one atom line: 16 fields, and the literal blanks
# embedded after some of them are part of the output line.
atom_line_formatter = "".join((
    "{:6s}",
    "{:5d} ",
    "{}",
    "{:1s}",
    "{:3s} ",
    "{:1s}",
    "{:4d}",
    "{:1s}   ",
    "{:8.3f}",
    "{:8.3f}",
    "{:8.3f}",
    "{:6.2f}",
    "{:6.2f}      ",
    "{:<4s}",
    "{:>2s}",
    "{:2s}",
    ))


# functions to apply to each format field in atom line string
def _nothing(x):
    """Return `x` unchanged."""
    return x


# One converter per field of the atom line; positions presumably parallel
# `atom_slicers` and `atom_line_formatter` (16 entries each).
atom_format_funcs = [
    _nothing,
    int,
    _nothing,
    _nothing,
    _nothing,
    format_chainid,
    int,
    _nothing,
    float,
    float,
    float,
    float,
    float,
    _nothing,
    _nothing,
    _nothing,
    ]


# USED OKAY
def get_fasta_from_PDB(pdbid):
    """
    Extract FASTA from PDB.

    Parameters
    ----------
    pdbid : sequence of two items
        ``pdbid[0]`` is the name or path of the PDB, ``pdbid[1]`` is
        the PDB text, either as `bytes` (decoded as UTF-8) or as `str`.

    Returns
    -------
    tuple of (str, str)
        The stem of ``pdbid[0]`` and the one-letter sequence.
        Residue names missing from `aa3to1` are written as ``X``.
    """
    pdbdata = pdbid[1]
    try:
        text = pdbdata.decode('utf_8')
    except AttributeError:
        # `str` has no `decode`: the data is already text
        text = pdbdata
    lines = text.split(os.linesep)

    # One entry per residue number string, in order of first appearance.
    # The key is the residue number only, so lines sharing a number
    # collapse into one entry that keeps the last residue name seen.
    rn = {line[atom_resSeq].strip(): line[atom_resName] for line in lines}
    fasta = (aa3to1.get(f, 'X') for f in rn.values())
    return Path(pdbid[0]).stem, ''.join(fasta)


def is_pdb(datastr):
    """
    Detect if `datastr` is a PDB format v3 file.

    Parameters
    ----------
    datastr : str
        The text to inspect.

    Returns
    -------
    bool
        True if at least one line other than the first starts with
        ``ATOM`` followed by a space.
    """
    assert isinstance(datastr, str), \
        f'`datastr` is not str: {type(datastr)} instead'
    # the leading newline anchors the record name to the start of a line
    return '\nATOM ' in datastr


class PDBIDFactory:
    r"""
    Parse input for PDBID instantiation.

    Parameters
    ----------
    name : str or Path
        The code name or ID that identified the PDB.
        Possible formats:

            - XXXX
            - XXXXC\*
            - XXXX_C\*
            - XXXX_C_segS
            - \*.pdb

        where XXXX is the PDBID code, C is the chain ID, S is the
        segment number and * means any number of characters.
        PDB and chain ID codes are any digits, lower and upper case
        letters. A ``.pdb`` or ``.cif`` suffix is dropped before
        parsing.

    Returns
    -------
    :class:`PDBID` object.

    Raises
    ------
    PDBIDFactoryError
        If `name` matches none of the accepted formats.
    """

    rgx_XXXX = re.compile(r'^[0-9a-zA-Z]{4}(\s|$)')
    rgx_XXXXC = re.compile(r'^[0-9a-zA-Z]{5,}(\s|$)')
    rgx_XXXX_C = re.compile(r'^[0-9a-zA-Z]{4}_[0-9a-zA-Z]+(\s|$)')
    rgx_XXXX_C_segS = re.compile(r'^[0-9a-zA-Z]{4}_[0-9a-zA-Z]+_seg\d+(\s|$)')
    rgx_any = re.compile(r'\w+(\.pdb)?')

    def __new__(cls, name):
        """Construct class."""
        if isinstance(name, PDBID):
            return name

        namep = Path(name)
        if namep.suffix in ('.pdb', '.cif'):
            name = namep.stem
        else:
            # the parsers slice and split their input, which a Path object
            # does not support: always hand them a string
            name = str(name)

        # where XXXX is the PDBID and C the chain ID
        # order matters: the first regex that matches decides the parser
        pdb_filename_regex = (
            (cls.rgx_XXXX, cls._parse_XXXX),
            (cls.rgx_XXXXC, cls._parse_XXXXC),
            (cls.rgx_XXXX_C, cls._parse_XXXX_C),
            (cls.rgx_XXXX_C_segS, cls._parse_XXXX_C_segS),
            (cls.rgx_any, cls._parse_any_name),
            )

        for regex, parser in pdb_filename_regex:
            if regex.search(name):
                return PDBID(*parser(name))

        emsg = f"PDB code format not valid: {name}. No regex matches."
        raise EXCPTS.PDBIDFactoryError(emsg)

    @staticmethod
    def _parse_XXXX(pdbid):
        # PDBID code only, no chain
        return pdbid[:4], None

    @staticmethod
    def _parse_XXXXC(pdbid):
        # only the first whitespace-separated token carries the code;
        # everything after the 4th character is the chain ID
        pdbinfo = pdbid.split()[0]
        return pdbinfo[:4], pdbinfo[4:]

    @staticmethod
    def _parse_XXXX_C(pdbid):
        pdbid, chainid, *_ = pdbid.split()[0].split('_')
        return pdbid, chainid

    @staticmethod
    def _parse_XXXX_C_segS(pdbid):
        pdbid, chainid, segID = pdbid.split()[0].split('_')
        # drop the literal `seg` prefix; `str.lstrip` would strip a set of
        # characters instead of a prefix
        return pdbid, chainid, segID[len('seg'):]

    @staticmethod
    def _parse_any_name(pdbid):
        # fallback: the whole file stem becomes the PDBID name
        return [Path(pdbid).stem]


# USED OKAY
class PDBList:
    """
    List of PDBID objects.

    Stores the unique PDBIDs in a set and exposes them as a sorted
    sequence.

    Parameters
    ----------
    pdb_names : obj:iterator
        An iterator containing the PDB names.
        PDB names can be in the form accepted by PDBIDFactory or
        PDBID objects. Entries whose string form starts with ``#``
        are ignored.

    Attributes
    ----------
    set :
        The set of :class:`PDBID` objects.
    """

    def __new__(cls, pdb_names):  # noqa: D102
        # an existing PDBList is returned as is instead of being wrapped;
        # Python still runs `__init__` on it, which rebuilds `self.set`
        # from its own elements
        if isinstance(pdb_names, cls):
            return pdb_names
        else:
            return super().__new__(cls)

    def __init__(self, pdb_names):
        self.set = {
            PDBIDFactory(element)
            for element in pdb_names
            # str() because may receive Paths
            if not str(element).startswith('#')
            }

    def __repr__(self):
        return '{}(\n    {})\n'.format(
            self.__class__.__name__,
            ',\n    '.join(repr(x) for x in self),
            )

    def __str__(self):
        return '{} with {} element(s)'.format(
            self.__class__.__name__,
            len(self),
            )

    def __eq__(self, other):
        # compare against another PDBList, or directly against a plain set
        try:
            return self.set == other.set
        except AttributeError:
            return self.set == other

    def __iter__(self):
        return iter(self.to_tuple())

    def __getitem__(self, index):
        return self.to_tuple()[index]

    def __len__(self):
        return len(self.set)

    @property
    def pdbids(self):
        """Generate the PDBID names."""
        return (pdbid.name for pdbid in self)

    @property
    def name_chains_dict(self):
        """
        Export PDBIDs: Chains dictionary map.

        PDBIDs without a chain are present with an empty list.
        """
        name_chains = defaultdict(list)
        for pdbid in self:
            pc = pdbid.chain
            if pc:
                name_chains[pdbid.name].append(pc)
            else:
                # the bare lookup makes the defaultdict create the key
                name_chains[pdbid.name]
        return name_chains

    def to_tuple(self):
        """Convert PDBList to sorted tuple."""
        return tuple(sorted(self.set))

    def difference(self, other):
        """
        Difference between self and other.

        Parameters
        ----------
        other : PDBList
            Any object exposing its elements in a ``set`` attribute.

        Returns
        -------
        PDBList
        """
        return PDBList(tuple(self.set.difference(other.set)))

    def write(self, filename='PDBIDs.list'):
        """
        Write to a file the PDBIDs in the PDBList.

        PDBIDs are written sorted, one per line, without a trailing
        newline.

        Parameters
        ----------
        filename : str, optional
            The output file name.
        """
        with open(filename, 'w') as fh:
            fh.write('\n'.join(str(pdbid) for pdbid in self.to_tuple()))

        log.info(S(f'PDBIDs written to {filename}'))


# USED OKAY
@functools.total_ordering
class PDBID:
    """
    PDB object identifier.

    Identifies unique downloadable/stored units.

    In the current implementation each unit is one PDB chain,
    which is identified by the PDBID and its chain identifier.

    Ordering, equality and hashing are all based on the string
    representation, for example ``1ABC_A``.

    Parameters
    ----------
    name : obj:`str`
        The PDBID, for example: 1ABC

    chain : obj:`str`
        The chain identifier.
        Defaults to None.

    segment : obj:`str`
        The identifier of a fragment of the chain.
        Defaults to None.

    Attributes
    ----------
    name:
        The PDB identifier, upper-cased.

    chain:
        The chain identifier.

    identifiers:
        Dictionary with the ``chain`` and ``seg`` identifiers that
        were given, in this order.
    """

    # segment are fragments of the same chain
    def __init__(self, name, chain=None, segment=None):

        self.name = name.upper()
        self.chain = chain

        # made manual to completely control order
        ids = (
            ('chain', chain),
            ('seg', segment),
            )
        # keep only the identifiers that were actually given
        self.identifiers = {
            key: identifier
            for key, identifier in ids
            if identifier
            }

    def __repr__(self):

        iditems = self.identifiers.items()

        kwargs = ', '.join(f'{key}={val!r}' for key, val in iditems)

        if kwargs:
            kwargs = ', ' + kwargs

        return '{}(name={!r}{})'.format(
            self.__class__.__name__,
            self.name,
            kwargs,
            )

    def __lt__(self, other):
        return str(self) < str(other)

    def __hash__(self):
        return hash(str(self))

    def __eq__(self, other):
        # comparing strings also makes a PDBID equal to its plain `str` form
        return str(self) == str(other)

    def __str__(self):
        name = f'{self.name}'
        ids = '_'.join(self.identifiers.values())

        if ids:
            return f'{name}_' + ids
        else:
            return name