# Source code for polymerist.molfiles.pdb.pdbatoms

'''PDB file atom line formatting tools'''

__author__ = 'Timotej Bernat'
__email__ = 'timotej.bernat@colorado.edu'

from typing import Union

from dataclasses import dataclass, field
from collections import Counter


# Fixed-width layout of the fields in a PDB atom line, keyed by field name
# Each value is ((first column, last column), type the field's text gets cast to); column numbers are 1-indexed and inclusive
# Layout taken from the PDB spec (https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html)
PDB_ATOM_RECORD_TOKENS : dict[str, tuple[tuple[int, int], type]] = {
    token_name : ((first_col, last_col), token_type)
        for token_name, first_col, last_col, token_type in (
            # field name                    first  last  type
            ('Residue atom type'           ,   1,    6,  str),
            ('Atom serial number'          ,   7,   11,  int),
            ('Atom name'                   ,  13,   16,  str),
            ('Alternate location indicator',  17,   17,  str),
            ('Residue name'                ,  18,   20,  str),
            ('Chain identifier'            ,  22,   22,  str),
            ('Residue sequence number'     ,  23,   26,  int),
            ('Residue insertion code'      ,  27,   27,  str),
            ('X (angstrom)'                ,  31,   38,  float),
            ('Y (angstrom)'                ,  39,   46,  float),
            ('Z (angstrom)'                ,  47,   54,  float),
            ('Occupancy'                   ,  55,   60,  float),
            ('Temperature factor'          ,  61,   66,  float),
            ('Segment identifier'          ,  73,   76,  str),
            ('Element symbol'              ,  77,   78,  str),
            ('Charge'                      ,  79,   80,  str),
        )
}

def parse_pdb_atom_record(pdb_atom_record : str) -> dict[str, Union[str, int, float]]:
    '''
    Extracts information (with correct type casting) from a PDB "ATOM" or "HETATM" record

    Parameters
    ----------
    pdb_atom_record : str
        A single fixed-width atom line from a PDB file
        Lines which end before a given field's columns are treated as having that field blank

    Returns
    -------
    atom_info : dict[str, Union[str, int, float]]
        The value of every field listed in PDB_ATOM_RECORD_TOKENS, keyed by field name
        and cast to the type registered for that field
        Blank fields are returned as '' for str fields, 0 for int fields, and 0.0 for float fields

    Raises
    ------
    ValueError
        If the text of a non-blank field can't be cast to the type expected for that field
    '''
    atom_info : dict[str, Union[str, int, float]] = {}
    for field_name, ((col_start, col_end), cast_type) in PDB_ATOM_RECORD_TOKENS.items():
        # columns in the spec are 1-indexed and inclusive, hence the offset on the start only
        # slicing past the end of a short line just yields '', so no padding of the line is needed
        field_text = pdb_atom_record[col_start-1:col_end].strip()

        if not field_text and cast_type in (int, float):
            # blank numeric fields default to zero (int('') and float('') would raise); blank str fields stay as ''
            atom_info[field_name] = cast_type(0)
            continue

        try:
            atom_info[field_name] = cast_type(field_text)
        except ValueError as cast_error:
            # re-raise the same exception type, but say WHICH part of the record is malformed
            raise ValueError(
                f'Could not interpret PDB atom record field "{field_name}" '
                f'(columns {col_start}-{col_end}) as {cast_type.__name__}: {field_text!r}'
            ) from cast_error

    return atom_info
@dataclass(frozen=True)
class SerialAtomLabeller:
    '''
    For assigning unique numbered atom names based on their
    order of appearance within a molecule and elemental class

    Useful, for example, in generating unique atom names for a PDB file

    Parameters
    ----------
    atom_label_width : int, default 4
        Exact length allotted for any generated atom label
        Labels shorter than this are right-padded with spaces,
        while labels longer than this are truncated

        Default of 4 is chosen to be compatible with the PDB
        specification ("Atom name: columns 13-16, left-justified")
        https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html
    include_elem_idx : bool, default True
        Whether to attach a numerical element-index postfix to atom labels

        E.g. with atom_label_width=4, the fifth carbon in a topology
        will be labelled as "C004" with include_elem_idx=True,
        while labelled as "C   " with include_elem_idx=False
    default_elem_idx : int, default 0
        Starting index for each element category
        By default, is 0-indexed; must be non-negative

    Raises
    ------
    ValueError
        On construction, if either atom_label_width or default_elem_idx is negative

    Notes
    -----
    Truncation is silent, so labels stop being unique once an index needs more digits
    than the label has room for, e.g. with atom_label_width=4 the carbons with indices
    100 and 1000 are both labelled "C100"
    '''
    atom_label_width : int  = 4
    include_elem_idx : bool = True
    default_elem_idx : int  = 0

    # running per-element tally; the dataclass is frozen, but the Counter itself is still mutated in-place by get_atom_label()
    element_counter : Counter = field(init=False, default_factory=Counter)

    def __post_init__(self) -> None:
        '''Check ranges on input values'''
        if self.atom_label_width < 0:
            raise ValueError(f'Must provide a non-negative atom label width (provided {self.atom_label_width})')
        if self.default_elem_idx < 0:
            raise ValueError(f'Must provide a non-negative starting index for element indices (provided {self.default_elem_idx})')

    def get_atom_label(self, elem_symbol : str) -> str:
        '''
        Obtain a numbered atom label for an atom based on its element,
        updating the underlying element context in the process

        Parameters
        ----------
        elem_symbol : str
            Symbol of the atom's element, used as the prefix of the label

        Returns
        -------
        atom_name : str
            Label which is exactly atom_label_width characters long

        Raises
        ------
        TypeError
            If elem_symbol is not a str
        '''
        if not isinstance(elem_symbol, str):
            raise TypeError(f'Must pass symbol of atom\'s element as str (not type {type(elem_symbol).__name__})')

        # first occurrence of an element starts counting from the configured index; later occurrences keep their tally
        atom_idx = self.element_counter.setdefault(elem_symbol, self.default_elem_idx)

        atom_idx_label : str = ''
        if self.include_elem_idx:
            num_idx_digits = max(self.atom_label_width - len(elem_symbol), 0) # number of symbols left over for an atom index
            atom_idx_label = f'{atom_idx:0{num_idx_digits}d}' # zero-padded to fill whatever room the symbol leaves

        # pad with spaces if too short, or truncate if too long; result is always exactly atom_label_width long
        atom_name = f'{elem_symbol}{atom_idx_label}'.ljust(self.atom_label_width, ' ')[:self.atom_label_width]
        self.element_counter[elem_symbol] += 1 # update tally with addition of new occurrence of a particular element

        return atom_name