# Source code for polymerist.polymers.monomers.repr

'''For representing monomer information'''

__author__ = 'Timotej Bernat'
__email__ = 'timotej.bernat@colorado.edu'

import logging
LOGGER = logging.getLogger(__name__)

from typing import Generator, Optional, Iterable, Union
from dataclasses import dataclass, field

from itertools import cycle
from collections import defaultdict

from rdkit import Chem

from .specification import compliant_mol_SMARTS
from ...genutils.fileutils.jsonio.jsonify import make_jsonifiable
from ...smileslib import Smarts, InvalidSMARTS, is_valid_SMARTS
from ...rdutils.bonding.portlib import get_num_ports


# MAIN REPRESENTATION CLASS
@make_jsonifiable
@dataclass
class MonomerGroup:
    '''
    Stores collections of residue-labelled monomer SMARTS

    Attributes
    ----------
    monomers : dict[str, Union[Smarts, list[Smarts]]]
        Monomer template SMARTS keyed by residue name
        Bare SMARTS strings passed at initialization are wrapped into lists by __post_init__,
        so after construction every value registered via add_monomer() is a list of SMARTS
    term_orient : dict[str, str]
        Keys are either "head" or "tail", values are the names of residues in "monomers"
    '''
    monomers : dict[str, Union[Smarts, list[Smarts]]] = field(default_factory=dict)
    term_orient : dict[str, str] = field(default_factory=dict) # keys are either "head" or "tail", values are the names of residues in "monomers"

    # MONOMER ADDITION AND VALIDATION
    def __post_init__(self) -> None:
        '''Encase bare SMARTS into lists and check that all monomer SMARTS are valid'''
        monomers_init = self.monomers # store inputted values
        self.monomers = {} # clear monomers and re-add one-at-a-time (rebinds a fresh dict; the inputted dict is not mutated)
        for resname, smarts in monomers_init.items():
            self.add_monomer(resname, smarts)
        # DEV: opted to forgo term_orient check for now, as modifying this violates the read-only data model aimed for here

    def _add_monomer(self, resname : str, smarts : Smarts) -> None:
        '''
        Add a new monomer to the templates already stored within, subject to validation checks

        Raises
        ------
        TypeError
            If the provided SMARTS is not a string
        InvalidSMARTS
            If the provided string is not deemed valid by is_valid_SMARTS()
        '''
        if not isinstance(smarts, str):
            raise TypeError(f'Values of monomers must be either SMARTS strings or lists of SMARTS strings, not "{type(smarts).__name__}"')
        # DEV: include check for empty string? (technically still a valid SMARTS string, but a pretty pathological one at that)
        if not is_valid_SMARTS(smarts):
            raise InvalidSMARTS(f'Provided invalid monomer SMARTS string for {resname}: "{smarts}"')
        smarts = compliant_mol_SMARTS(smarts) # enforce compliance with monomer template SMARTS specification - DEVNOTE: double-verifies valid_SMARTS (not bad, just redundant)

        if resname in self.monomers:
            existing_resgroup = self.monomers[resname]
            # duplicates (after compliance conversion) are skipped silently, as are non-list residue groups
            if isinstance(existing_resgroup, list) and (smarts not in existing_resgroup):
                LOGGER.debug(f'Extending existing residue category "{resname}" with SMARTS {smarts}')
                self.monomers[resname].append(smarts)
        else:
            LOGGER.debug(f'Creating new residue category "{resname}", containing singular SMARTS ["{smarts}"])')
            self.monomers[resname] = [smarts]

    def _add_monomers(self, resname : str, smarts_container : Iterable[Smarts]) -> None:
        '''Add new monomers to the templates already stored within, subject to validation checks, from an iterable container'''
        for smarts in smarts_container:
            self._add_monomer(resname, smarts)

    def add_monomer(self, resname : str, smarts : Union[Smarts, Iterable[Smarts]]) -> None:
        '''Register new monomers, either directly from SMARTS or from a container of SMARTS'''
        if isinstance(smarts, Iterable) and not isinstance(smarts, str): # don't want to insert one character at a time if a string is in fact provided
            self._add_monomers(resname, smarts)
        else:
            self._add_monomer(resname, smarts) # assume any other inputs are singular values or strings

    # DUNDER "MAGIC" METHODS
    def __getitem__(self, resname : str) -> list[Smarts]:
        '''Convenience method to access .monomers directly from instance'''
        return self.monomers[resname] # NOTE: deliberately avoid "get()" here to propagate KeyError
        # BUG: user can directly append to the returned value to forgo monomer validation checks;
        # this is not unique to __getitem__ but rather a consequence of thinly-wrapping builtin types

    def __setitem__(self, resname : str, smarts : Union[Smarts, Iterable[Smarts]]) -> None:
        '''Convenience method to register monomers directly from instance; routes through add_monomer(), so validation checks apply'''
        self.add_monomer(resname, smarts)

    def __hash__(self) -> int:
        '''Hash based on monomer SMARTS and terminal orientation in a canonical order'''
        # TOSELF: this is far from bulletproof, viz. canonicalization of SMARTS, list value sorting, etc
        return hash(f'{sorted(self.monomers.items())}{sorted(self.term_orient.items())}')

    # ATTRIBUTE PROPERTIES AND ALIASES
    @staticmethod
    def is_terminal(monomer : Chem.Mol) -> bool:
        '''Determine whether or not a monomer is terminal (i.e. has exactly 1 port)'''
        return get_num_ports(monomer) == 1

    @property
    def SMARTS(self) -> dict[str, list[Smarts]]:
        '''Alias of legacy "monomers" attribute'''
        return self.monomers # alias of legacy name for convenience

    # ITERATION OVER STORED MOLECULE FRAGMENTS
    def iter_rdmols(self, term_only : Optional[bool]=None) -> Generator[tuple[str, Chem.Mol], None, None]:
        '''
        Generate (residue name, RDKit Mol) pairs of all monomers present
        Simplifies iteration over internal lists of monomer Mols

        Can optionally filter by monomer termination:
            term_only=True  -> only terminal monomers
            term_only=False -> only middle monomers
            term_only=None  -> all monomers
        '''
        for resname, SMARTS_list in self.monomers.items():
            for SMARTS in SMARTS_list:
                monomer = Chem.MolFromSmarts(SMARTS)
                if (term_only is None) or (MonomerGroup.is_terminal(monomer) == term_only):
                    yield (resname, monomer)

    def rdmols(self, term_only : Optional[bool]=None) -> dict[str, list[Chem.Mol]]:
        '''
        Returns dict of RDKit Mol lists keyed by residue name
        Residue names with no monomers passing the filter are absent from the returned dict

        Can optionally filter by monomer termination:
            term_only=True  -> only terminal monomers
            term_only=False -> only middle monomers
            term_only=None  -> all monomers
        '''
        rdmol_dict = defaultdict(list)
        for resname, rdmol in self.iter_rdmols(term_only=term_only):
            rdmol_dict[resname].append(rdmol)

        return rdmol_dict

    def contributions(self, term_only : Optional[bool]=None) -> dict[str, list[int]]:
        '''Returns dict of the number of real (i.e. non-linker) atoms in each residue list'''
        return {
            resname : [mol.GetNumAtoms() - get_num_ports(mol) for mol in mol_list]
                for resname, mol_list in self.rdmols(term_only=term_only).items()
        }

    @property
    def n_monomers(self) -> int:
        '''
        Returns number of distinct monomer templates present
        Distinct monomers under the same residue name are counted separately
        '''
        return sum(1 for _ in self.iter_rdmols(term_only=None))

    # END GROUP DETERMINATION
    def linear_end_groups(self) -> dict[str, tuple[str, Chem.Mol]]:
        '''
        Returns head-and-tail end group residue names and Mol objects as defined by term_orient
        If term orient is undefined, will automatically take the first <= 2 terminal groups available to be the end groups

        Returns
        -------
        end_groups : dict[str, tuple[str, Chem.Mol]]
            A dict whose keys are any of {'head', 'tail'} and whose
            values are 2-tuples of residue names and Mols for the corresponding monomer

        Raises
        ------
        KeyError
            If term_orient is user-defined but names a residue which has no terminal monomers
        '''
        if self.term_orient and set(self.term_orient.keys()) == {'head', 'tail'}:
            LOGGER.info(f'Using user-defined terminal group orientation {self.term_orient}')
            monomer_iters = {
                resname : cycle(smarts_list)
                    for resname, smarts_list in self.rdmols(term_only=True).items()
            } # cycle handles degenerate end group case correctly

            return {
                head_or_tail : (resname, next(monomer_iters[resname])) # will raise KeyError if any of the resnames are not present
                    for head_or_tail, resname in self.term_orient.items()
            }
        else:
            term_orient_auto : dict[str, str] = {}
            end_groups_auto : dict[str, tuple[str, Chem.Mol]] = {}
            for head_or_tail, (resname, rdmol) in zip(['head', 'tail'], self.iter_rdmols(term_only=True)): # zip will bottom out early if fewer than 2 terminal monomers are present
                term_orient_auto[head_or_tail] = resname # populate purely for logging
                end_groups_auto[head_or_tail] = (resname, rdmol)
            LOGGER.warning(f'No valid terminal monomer orientations defined, auto-assigned orientations "{term_orient_auto}"; USER SHOULD VERIFY THIS YIELDS A CHEMICALLY-VALID POLYMER!')

            return end_groups_auto

    # COMPOSITION METHODS
    def __add__(self, other : 'MonomerGroup') -> 'MonomerGroup':
        '''
        Content-aware method of merging multiple sets of monomer info via the addition operator

        Residue names present in both operands have their SMARTS lists combined (duplicates skipped),
        rather than having the right operand's list silently replace the left operand's
        Neither operand is modified; terminal orientations are NOT carried over to the merged group

        Raises
        ------
        NotImplementedError
            If the other operand is not an instance of this class
        '''
        cls = self.__class__
        if not isinstance(other, cls):
            raise NotImplementedError(f'Can only merge {cls.__name__} with another {cls.__name__}, not object of type {type(other)}')
        # TODO: figure out how to handle combination of term group orientation gracefully (ignoring for now)
        merged = MonomerGroup(monomers=self.monomers) # __post_init__ re-registers into fresh lists, so self is left untouched
        for resname, smarts_group in other.monomers.items():
            merged.add_monomer(resname, smarts_group) # validates and de-duplicates per-residue

        return merged

    __radd__ = __add__ # support reverse addition

    # CHEMICAL INFORMATION
    def is_homopolymer(self) -> bool:
        '''Identify if a polymer is a homopolymer (i.e. only 1 type of middle monomer)'''
        return (len(self.rdmols(term_only=False)) == 1) # by definition, a homopolymer only has 1 unique class of middle monomer

    # GRAPH INFORMATION
    @property
    def is_branchable(self) -> bool:
        '''Whether it is possible to generate a branched polymer from this set of monomers'''
        return any(
            get_num_ports(monomer) > 2
                for (resname, monomer) in self.iter_rdmols(term_only=None)
        )

    @property
    def is_linear(self) -> bool:
        '''Whether a group of monomers can ONLY be assembled into a linear chain'''
        return not self.is_branchable

    @property
    def is_linear_homopolymer(self) -> bool:
        '''Identify if a polymer is a linear homopolymer'''
        # NOTE: is_homopolymer is a regular method (not a property) and so must be called explicitly;
        # the bare bound method is always truthy and would make this check vacuous
        return self.is_linear and self.is_homopolymer()

    @property
    def num_mid_and_term(self) -> tuple[int, int]:
        '''Counts of how many of the monomers are middle vs terminal, respectively'''
        group_counts = [0, 0]
        for (resname, monomer) in self.iter_rdmols(term_only=None): # TODO : consider reimplementing using new term group filtering option
            group_counts[self.is_terminal(monomer)] += 1 # index by bool (False -> 0 == middle, True -> 1 == terminal)

        return tuple(group_counts) # convert to tuple