Source code for polymerist.polymers.monomers.specification

'''Implementations of the canonical monomer substructure SMARTS specification defined in https://doi.org/10.26434/chemrxiv-2023-f2zxd-v2'''

__author__ = 'Timotej Bernat'
__email__ = 'timotej.bernat@colorado.edu'

import logging
LOGGER = logging.getLogger(__name__)

import re
from typing import Union

from rdkit import Chem
from rdkit.Chem import QueryAtom

from ...smileslib.cleanup import (
    Smiles, 
    Smarts,
    is_valid_SMARTS,
    InvalidSMARTS,
    # DEVNOTE: DON'T TOUCH! - expanded_SMILES import remains here purely for 
    # backwards-compatibility, as expanded_SMILES was originally defined in this module
    expanded_SMILES, 
)
from ...smileslib.primitives import RDKIT_QUERYBONDS_BY_BONDTYPE
from ...rdutils.chemlabel import (
    has_fully_mapped_atoms,
    has_uniquely_mapped_atoms,
    assign_ordered_atom_map_nums,
    mol_to_smiles_and_atom_permutation,
)


# REGEX TEMPLATES FOR COMPLIANT SMARTS
COMPLIANT_ATOM_SMARTS = re.compile( # pattern for specification-compliant query
    r'''    
    \[                              ## start atom entry: open bracket
    (?P<isotope>\d?)                ## isotope label: (possibly no) digits
    \#(?P<atomic_num>\d+?)          ## atomic number: literal octothorpe followed by at least 1 digit (non-greedy)
    (D(?P<degree>\d{1}))?           ## degree label: literal D (for number of explicit connections in SMARTS) and exactly one digit
    ([+-](?P<formal_charge>\d+))    ## formal charge: explicit sign (+ or -) followed by at least one digit
    :(?P<atom_map_num>\d+?)         ## atom map number: colon followed by at least one digit (empty not allowed, since explicitly-mapped atoms are reuiqred by the spec)
    \]                              ## end atom entry: close bracket
    ''',
    flags=re.VERBOSE,
)

ABERRANT_RDKIT_ATOM_SMARTS = re.compile( # pattern for a specification-compliant query which has been mangled by RDKit
    r'''
    \[                              ## start atom entry: open bracket
    \#(?P<atomic_num>\d+?)          ## atomic number: literal octothorpe followed by at least 1 digit (non-greedy)
    (&(?P<isotope>\d+?)\*)?         ## isotope label (totally optional): ampersand delimiter, one or more digits, and a literal asterisk (which RDKit for some reason inserts)
    (&D(?P<degree>\d{1}))?          ## degree label  (totally optional): ampersand delimiter, literal D (for number of explicit connections in SMARTS) and exactly one digit
    (&                              ## formal charge (totally optional): ampersand delimiter, followed by...
        (?P<charge_sign>[+-])           ### a charge sign (non-optional + or -)
        (?P<charge_magnitude>\d?)       ### a charge magnitude, as some number of digits (possibly none is valid, e.g. "+" is understood to mean "+1")
    )?  
    :(?P<atom_map_num>\d+?)         ## atom map number: colon followed by at least one digit
    \]                              ## end atom entry: close bracket
    ''',
    flags=re.VERBOSE,
)

[docs] def disambiguate_formal_charge(sign : str, magnitude : str) -> int: ''' Convert a (possibly implicit) sign and magnitude of a SMILES-compliant atom formal charge entry to an explicit signed integer value (e.g. "+" -> +1, "--" -> -2) Parameters ---------- sign : str ("", "+", or "-" The string (possibly empty) representing the sign of the formal charge magnitude : str ("" or digit string) The string (possibly empty) representing the magnitude of the formal charge (digits only) Returns ------- formal_charge : int The explicit signed integer value of the formal charge Will raise ValueError if sign and magnitude passed cannot be coerced into the appropriate types ''' if not magnitude: if not sign: sign, magnitude = '+0' # special case for when the field is totally blank (atoms considered neutral by default) else: magnitude = '1' return int(f'{sign}{magnitude}') # DEVNOTE: worth checking if magnitude is digit here? will be rejected by int conversion anyway if not
[docs] def chem_info_from_match(match : re.Match) -> dict[str, Union[int, str, None]]: '''Generate chemical information dict (with proper types) from an atom query regex match''' atom_info = match.groupdict() atom_info['formal_charge'] = disambiguate_formal_charge( sign=atom_info.pop('charge_sign'), # raise Exception f ancilliary data is not present, or else look up and remove it to avoid danling keys when passed as args magnitude=atom_info.pop('charge_magnitude'), # raise Exception f ancilliary data is not present, or else look up and remove it to avoid danling keys when passed as args ) for key, value in atom_info.items(): if isinstance(value, str) and value.isdigit(): atom_info[key] = int(value) # convert parsed strings to ints where possible return atom_info
# SMARTS ATOM QUERY GENERATION
[docs] def compliant_atom_query_from_info( atomic_num : int, degree : int, atom_map_num : int, formal_charge : int=0, isotope : int=0, as_atom : bool=False ) -> Union[str, QueryAtom]: '''Construct a monomer-spec compliant atom SMARTS string directly from chemical information''' if not isotope: # handles when isotope is literal 0 or NoneType isotope = "" # non-specific isotope is not explicitly written in string (left empty) if atomic_num == 0: # define dummy/wild atoms to be linkers # NOTE: to simplify info fetching from regex, require that dummy atoms be explicitly labelled as atomic number 0 (rather than "*") # atom_query = f'[{isotope}*:{atom_map_num}]' atom_query = f'[{isotope}#{atomic_num}:{atom_map_num}]' else: atom_query = f'[{isotope}#{atomic_num}D{degree}{formal_charge:+}:{atom_map_num}]' if as_atom: return Chem.AtomFromSmarts(atom_query) return atom_query
[docs] def compliant_atom_query_from_rdatom(rdatom : Chem.Atom, as_atom : bool=False) -> Union[str, QueryAtom]: '''Construct a monomer-spec compliant atom SMARTS string from an RDKit Atom''' return compliant_atom_query_from_info( atomic_num = rdatom.GetAtomicNum(), degree = rdatom.GetDegree(), # counts number of active bonds atom_map_num = rdatom.GetAtomMapNum(), # TODO : add check for nonzero map num formal_charge= rdatom.GetFormalCharge(), isotope = rdatom.GetIsotope(), as_atom=as_atom )
[docs] def compliant_atom_query_from_re_match(match : re.Match) -> str: '''Construct a monomer-spec compliant atom SMARTS string from a RegEx string match of a compliant or aberrant atom''' return compliant_atom_query_from_info(**chem_info_from_match(match), as_atom=False)
# CONVERSION METHODS
[docs] def compliant_mol_SMARTS(smarts : Union[Smiles, Smarts]) -> str: ''' Convert a generic SMARTS string into a monomer specification-compliant one For details on specification, see https://doi-org.colorado.idm.oclc.org/10.1021/acs.jcim.3c01691 Parameters ---------- smarts : Union[Smiles, Smarts] The SMARTS (or, by virtue of superset, SMILES) string to convert Returns ------- compliant_smarts : Smarts The structurally-correspondent monomer specification-compliant SMARTS string ''' # initialize Mol object from passed SMARTS if not is_valid_SMARTS(smarts): raise InvalidSMARTS(f'SMARTS string "{smarts}" cannot be interpreted as a valid SMARTS pattern') # downconvert from general SMARTS to SMILES for structure expansion (cannot be done on QueryMols) ## DEVNOTE: I know these SMILES/SMARTS conversions look superfluous, but they are absolutely necessary ## The cast from SMARTS to SMILES allows this function to digest SMARTS while also allowing structural operations ## (e.g. adding Hs to the intermediate mol), which otherwise wouldn't make sense on a naive SMARTS-based query Mol struct_smiles, atom_order = mol_to_smiles_and_atom_permutation( Chem.MolFromSmarts(smarts, mergeHs=False), allBondsExplicit=True, allHsExplicit=False, # kekuleSmiles=True, kekuleSmiles=False, canonical=False, ) ## in general, RDKit scrambled the order of atoms when writing to SMILES, so we un-scramble them both to ## respect the user's preference for atom order AND as a necessary condition for this function to be idempotent struct_mol = Chem.RenumberAtoms( Chem.MolFromSmiles(struct_smiles, sanitize=False), newOrder=atom_order, ) # make chemical info in atoms and bonds totally explicit struct_mol.UpdatePropertyCache() # required to set valence for explicit Hs check without otherwise performing sanitization struct_mol = Chem.AddHs(struct_mol, addCoords=False) Chem.Kekulize(struct_mol, clearAromaticFlags=True) # force kekulization (required by spec) Chem.SanitizeMol(struct_mol)#, sanitizeOps=Chem.SANITIZE_ALL & ~Chem.SANITIZE_ADJUSTHS & ~Chem.SANITIZE_KEKULIZE & ~Chem.SANITIZE_SETAROMATICITY) # ensure sanitization does not undo kekulization above # ensure all atoms are uniquely mapped (also required by spec); don;t overwrite perfectly-valid existing mapping if present # DEVNOTE: this MUST be done AFTER addition of Hs, so new unmapped H atoms don't get inserted after map number assignment if not (has_fully_mapped_atoms(struct_mol) and has_uniquely_mapped_atoms(struct_mol)): assign_ordered_atom_map_nums(struct_mol, start_from=1, in_place=True) # assign query info to atoms and bonds query_mol = Chem.MolFromSmarts( Chem.MolToSmiles(struct_mol, kekuleSmiles=True, allBondsExplicit=True, allHsExplicit=True), mergeHs=False, ) for atom in query_mol.GetAtoms(): atom_query = compliant_atom_query_from_rdatom(atom, as_atom=True) atom.SetQuery(atom_query) for bond in query_mol.GetBonds(): bond_query = RDKIT_QUERYBONDS_BY_BONDTYPE[bond.GetBondType()] bond.SetQuery(bond_query) unsanitized_smarts = Chem.MolToSmarts(query_mol) # RDKit does some mangling of queries which needs to be corrected here # sanitize away RDKit artifacts sanitized_smarts, num_repl = re.subn( pattern=ABERRANT_RDKIT_ATOM_SMARTS, repl=compliant_atom_query_from_re_match, string=unsanitized_smarts, count=query_mol.GetNumAtoms() # can't possibly replace more queries than there are atoms ) if num_repl > 0: LOGGER.debug(f'Cleaned {num_repl} SMARTS atom query aberrations introduced by RDKit') sanitized_smarts = sanitized_smarts.replace('#0', '*') # replace explicit atom number 0 calls with star (easier to do post-processing, as #0 is easier to implement) return sanitized_smarts