# Source code for polymerist.molfiles.pdb.pdbatoms

'''PDB file atom line formatting tools'''

__author__ = 'Timotej Bernat'
__email__ = 'timotej.bernat@colorado.edu'

from typing import Union

from dataclasses import dataclass, field
from collections import Counter


# Fixed-width layout of a PDB atom record, as: field name -> ((first column, last column), type to cast to)
# Column numbers are 1-indexed and inclusive, following the PDB spec
# (https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html)
PDB_ATOM_RECORD_TOKENS : dict[str, tuple[tuple[int, int], type]] = {
    # record and atom identification
    'Residue atom type': ((1, 6), str),
    'Atom serial number': ((7, 11), int),
    'Atom name': ((13, 16), str),
    'Alternate location indicator': ((17, 17), str),
    # residue and chain identification
    'Residue name': ((18, 20), str),
    'Chain identifier': ((22, 22), str),
    'Residue sequence number': ((23, 26), int),
    'Residue insertion code': ((27, 27), str),
    # Cartesian coordinates
    'X (angstrom)': ((31, 38), float),
    'Y (angstrom)': ((39, 46), float),
    'Z (angstrom)': ((47, 54), float),
    # per-atom numeric annotations
    'Occupancy': ((55, 60), float),
    'Temperature factor': ((61, 66), float),
    # trailing identifiers (kept as text, including the charge)
    'Segment identifier': ((73, 76), str),
    'Element symbol': ((77, 78), str),
    'Charge': ((79, 80), str),
}


def parse_pdb_atom_record(pdb_atom_record : str) -> dict[str, Union[str, int, float]]:
    '''
    Extract the fixed-width fields of a PDB "ATOM" or "HETATM" record, cast to their expected types

    Parameters
    ----------
    pdb_atom_record : str
        A single atom record line
        Lines shorter than 80 characters are treated as if right-padded
        with spaces, so trailing fields may be left off entirely

    Returns
    -------
    atom_info : dict[str, Union[str, int, float]]
        Field values keyed by the field names of PDB_ATOM_RECORD_TOKENS, in the same order
        Blank fields are returned as '' (str fields), 0 (int fields), or 0.0 (float fields)

    Raises
    ------
    ValueError
        If a non-blank field cannot be converted to its expected type;
        the message names the offending field, its columns, and the text found there
    '''
    padded_record = pdb_atom_record.ljust(80) # pad so every column range exists, even on short lines

    atom_info = {}
    for field_name, ((col_start, col_end), cast_type) in PDB_ATOM_RECORD_TOKENS.items():
        field_text = padded_record[col_start-1:col_end].strip() # columns are 1-indexed and inclusive
        if not field_text:
            # blank field: fall back to the type's "empty" value (str() == '', int() == 0, float() == 0.0)
            atom_info[field_name] = cast_type()
            continue

        try:
            atom_info[field_name] = cast_type(field_text)
        except ValueError as err:
            # re-raise with enough context to locate the malformed field in the record
            raise ValueError(
                f'Could not interpret {field_text!r} in columns {col_start}-{col_end} '
                f'("{field_name}") of PDB atom record as {cast_type.__name__}'
            ) from err

    return atom_info



@dataclass(frozen=True)
class SerialAtomLabeller:
    '''
    For assigning numbered atom names based on each atom's element
    and its order of appearance among atoms of that element

    Useful, for example, in generating atom names for a PDB file

    Parameters
    ----------
    atom_label_width : int, default 4
        Exact length allotted for any generated atom label
        Labels shorter than this are right-padded with spaces,
        while labels longer than this are truncated on the right

        NOTE: truncation drops the trailing characters of a label, so once an element's
        index needs more digits than the width leaves room for, distinct atoms can receive
        identical labels (e.g. with atom_label_width=4, carbon indices 1000-1009 all give "C100")

        Default of 4 matches the width of the PDB atom name field (columns 13-16)
        https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html
    include_elem_idx : bool, default True
        Whether to attach a numerical element-index postfix to atom labels

        E.g. with atom_label_width=4 and default_elem_idx=0, the fifth carbon
        will be labelled as "C004" with include_elem_idx=True,
        while labelled as "C   " with include_elem_idx=False
    default_elem_idx : int, default 0
        Index given to the first occurrence of each element
        By default, is 0-indexed; must be non-negative

    Raises
    ------
    ValueError
        If atom_label_width or default_elem_idx is negative
    '''
    atom_label_width : int = 4
    include_elem_idx : bool = True
    default_elem_idx : int = 0

    # per-element tally holding the index the NEXT atom of that element will receive
    # not settable at construction; its contents are updated on each get_atom_label() call
    # (frozen=True only blocks rebinding the attribute, not mutating the Counter it holds)
    element_counter : Counter = field(init=False, default_factory=Counter)

    def __post_init__(self) -> None:
        '''Check ranges on input values'''
        if self.atom_label_width < 0:
            raise ValueError(f'Must provide a non-negative atom label width (provided {self.atom_label_width})')

        if self.default_elem_idx < 0:
            raise ValueError(f'Must provide a non-negative starting index for element indices (provided {self.default_elem_idx})')

    def get_atom_label(self, elem_symbol : str) -> str:
        '''
        Obtain a numbered atom label for an atom based on its element,
        updating the underlying per-element tally in the process

        Parameters
        ----------
        elem_symbol : str
            Symbol of the atom's element (e.g. "C", "Cl"), used as the label prefix

        Returns
        -------
        atom_name : str
            Label of exactly atom_label_width characters: the element symbol, followed by
            the zero-padded element index (if include_elem_idx is set), then space-padded
            or truncated on the right to fit

        Raises
        ------
        TypeError
            If elem_symbol is not a str
        '''
        if not isinstance(elem_symbol, str):
            raise TypeError(f'Must pass symbol of atom\'s element as str (not type {type(elem_symbol).__name__})')

        # first occurrence of an element starts from the configured index; later ones reuse the running tally
        atom_idx = self.element_counter.setdefault(elem_symbol, self.default_elem_idx)

        atom_idx_label : str = ''
        if self.include_elem_idx:
            num_idx_digits = max(self.atom_label_width - len(elem_symbol), 0) # number of characters left over for an atom index
            atom_idx_label = f'{atom_idx:0{num_idx_digits}d}' # zero-padded; may still exceed the available digits for large indices

        # pad with spaces if too short, or truncate if too long
        atom_name = f'{elem_symbol}{atom_idx_label}'.ljust(self.atom_label_width, ' ')[:self.atom_label_width]
        assert len(atom_name) == self.atom_label_width # sanity check: pad-then-truncate must give exactly the requested width

        self.element_counter[elem_symbol] += 1 # tally advances even when the index is left out of the label

        return atom_name