Source code for polymerist.smileslib.chemdbqueries

'''For querying chemical databases for information about molecules specified by SMILES string and other structures'''

__author__ = 'Timotej Bernat'
__email__ = 'timotej.bernat@colorado.edu'

import logging
LOGGER  = logging.getLogger(__name__)

from typing import Any, ClassVar, Container, Iterable, Optional, Sequence
from abc import ABC, abstractmethod

import requests
from requests.structures import CaseInsensitiveDict

from ..genutils.decorators.classmod import register_abstract_class_attrs, register_subclasses
from ..genutils.importutils.dependencies import requires_modules, MissingPrerequisitePackage


# CUSTOM EXCEPTIONS
class InvalidPropertyError(Exception):
    '''Signals that the requested property is not one a chemical database service can provide'''
class NullPropertyResponse(Exception):
    '''Signals that a chemical database query completed BUT yielded a NoneType where one is not permitted'''
class ChemicalDataQueryFailed(Exception):
    '''Signals that a chemical data query went unfulfilled by a service'''
# STRATEGIES BASE FOR QUERYING CHEMICAL DATA
@register_subclasses(key_attr='service_name')
@register_abstract_class_attrs('service_name')
class ChemDBServiceQueryStrategy(ABC):
    '''Implementation of queries from a particular chemical database'''
    @abstractmethod
    def _get_property(self, property_name : str, identifier : str, **kwargs) -> Optional[Any]:
        '''Service-specific implementation of a single property query
        Called by get_property() once the property name has been validated'''
        ...

    @classmethod
    def dependencies(cls) -> Iterable[str]:
        '''For internals, allows dynamic checking for package dependencies (useful for automating unit test boilerplate)'''
        ...

    @classmethod
    @abstractmethod
    def is_online(cls) -> bool:
        '''Check if the service being queried is online and can accept requests'''
        ...

    @classmethod
    @abstractmethod
    def queryable_properties(cls) -> Container[str]:
        '''List which chemical property names can be queried from the service'''
        ...

    @classmethod
    @abstractmethod
    def queryable_namespaces(cls) -> Container[str]:
        '''List which chemical identification types can be searched through by the service'''
        ...

    def validate_property(self, property_name : str) -> None:
        '''Pre-check to ensure that a property is queryable from a service before attempting HTTP query

        Raises InvalidPropertyError (after logging a shorter message) if "property_name"
        is not among the names reported by queryable_properties()
        '''
        queryable_props = self.queryable_properties() # fetch once and reuse for both the check and the error message
        if property_name in queryable_props:
            return

        prop_options_str = '\n'.join(sorted(queryable_props))
        prop_error_msg = f'Cannot query property "{property_name}" from {self.service_name}'
        LOGGER.error(prop_error_msg) # log briefer error message in cases where the ensuing InvalidPropertyError is bypassed
        raise InvalidPropertyError(f'{prop_error_msg};\nChoose from one of the following property names:\n{prop_options_str}')

    def get_property(
            self,
            property_name : str,
            identifier : str,
            namespace : Optional[str],
            keep_first_only : bool=True,
            allow_null_return : bool=False,
            **kwargs
        ) -> Optional[Any]:
        '''Fetch a property associated with a molecule from a chemical database query service

        Parameters
        ----------
        property_name : str
            Name of the property to fetch; casefolded before validation and querying
        identifier : str
            Identifier of the molecule, passed through to the service-specific _get_property()
        namespace : Optional[str]
            Identification type of "identifier", passed through to the service-specific _get_property()
        keep_first_only : bool, default True
            If True and the service returns a non-string container, return only its first element
        allow_null_return : bool, default False
            If False, a null result raises NullPropertyResponse instead of being returned
        **kwargs
            Forwarded verbatim to _get_property()

        Returns
        -------
        Optional[Any]
            The queried property value (None only when "allow_null_return" is True)

        Raises
        ------
        InvalidPropertyError
            If the service does not list "property_name" as queryable
        NullPropertyResponse
            If the result is null and "allow_null_return" is False
        '''
        property_name = property_name.casefold() # avoid needing to account for case-sensitivity in property name check
        LOGGER.info(f'Sent query request for property "{property_name}" to {self.service_name}')
        self.validate_property(property_name=property_name)

        prop_val = self._get_property(property_name=property_name, identifier=identifier, namespace=namespace, **kwargs)
        # cast empty lists, strings, etc to NoneType; numeric zeros and False are legitimate
        # property values (e.g. a count of 0) and must NOT be mistaken for a missing response
        if not isinstance(prop_val, (int, float, complex)) and not prop_val:
            prop_val = None

        if keep_first_only and isinstance(prop_val, Container) and not isinstance(prop_val, str): # avoid bug where first char of string response is returned
            if isinstance(prop_val, Sequence):
                prop_val = prop_val[0]
            elif isinstance(prop_val, Iterable):
                prop_val = next(iter(prop_val)) # containers such as sets are not indexable; known non-empty here, as empties were nulled above
            else:
                prop_val = prop_val[0] # neither a Sequence nor Iterable; fall back to indexing as a last resort

        if (prop_val is None) and (not allow_null_return): # NOTE: duplicated NoneType check is needed to catch empty containers which are cast to None above
            null_error_msg = f'{self.service_name} returned NoneType "{property_name}", which is declared invalid by call signature'
            LOGGER.error(null_error_msg)
            raise NullPropertyResponse(null_error_msg)

        LOGGER.info(f'Successfully received property "{property_name}" from {self.service_name}')
        return prop_val
# CONCRETE IMPLEMENTATIONS OF CHEMICAL DATABASE SERVICE QUERIES
## NIH CACTUS
# error object handed to requires_modules() for the methods below which import cirpy
cirpy_error = MissingPrerequisitePackage(
    importing_package_name=__spec__.name,
    use_case='Querying the NIH CACTUS Chemical Identifier Resolver (CIR)',
    install_link='https://cirpy.readthedocs.io/en/latest/guide/install.html',
    dependency_name='cirpy',
    dependency_name_formal='CIRpy',
)
class NIHCACTUSQueryStrategy(ChemDBServiceQueryStrategy):
    '''
    Implementation of chemical query requests to the NIH's CADD group
    Cheminformatics Tools and User Services (CACTUS) Chemical Identifier Resolver (CIR)
    '''
    service_name : ClassVar[str] = 'NIH CACTUS CIR'

    @classmethod
    def dependencies(cls):
        '''Names of the packages which must be installed for queries to this service to work'''
        return ['cirpy']

    @classmethod
    @requires_modules('cirpy', missing_module_error=cirpy_error)
    def queryable_properties(cls) -> set[str]:
        '''Set of property names which can be requested from CIR (fixed property names plus the file formats declared by cirpy)'''
        import cirpy

        _CIR_PROPS = { # see official docs for more info: https://cactus.nci.nih.gov/chemical/structure_documentation
            # Chemical Representations
            'smiles',
            'names',
            'iupac_name',
            'formula',
            'sdf',
            ## NIH NCI Identifiers
            'ficts',
            'ficus',
            'uuuuu',
            'hashisy',
            ## InChI-related properties with aliases
            'inchi',
            'inchikey',
            'stdinchi',
            'stdinchikey',
            ## Other formats
            'cas',
            'chemspider_id',
            # Chemical information
            'mw',
            'h_bond_donor_count',
            'h_bond_acceptor_count',
            'h_bond_center_count',
            'rule_of_5_violation_count',
            'rotor_count',
            'effective_rotor_count',
            'ring_count',
            'ringsys_count',
            ## these were not documented on CACTUS or by cirpy, but scraped from webchem: https://github.com/ropensci/webchem/blob/master/R/cir.R#L168-L174
            'deprotonable_group_count',
            'heavy_atom_count',
            'heteroatom_count',
            'hydrogen_atom_count',
            'monoisotopic_mass',
            'protonable_group_count',
            'xlogp2',
            # Documented but non-working queries
            # 'image', # for some reason, image query returns internal server error in testing
            # 'twirl', # NOTE: this is documented on the CIR site, but raises XML error in practice
        }
        return _CIR_PROPS | cirpy.FILE_FORMATS # see here for file formats: https://cirpy.readthedocs.io/en/latest/guide/gettingstarted.html#file-formats

    @classmethod
    def is_online(cls) -> bool:
        '''Check whether the CIR web service responds without a server-side error
        Returns False (rather than raising) if the service cannot be reached at all or does not respond in time'''
        try:
            response = requests.head('https://cactus.nci.nih.gov/chemical/structure', timeout=10) # bounded wait so an unresponsive server cannot hang the caller
        except requests.RequestException: # connection failures, timeouts, etc. all mean the service is not usable right now
            return False
        return response.status_code < 500 # NOTE: could also be more stringent and check == 200 for OK; enough to just check server-side error for now

    @classmethod
    def queryable_namespaces(cls) -> set[str]:
        '''Set of identifier types (CIR "resolvers") which can be searched by this service'''
        return { # obtained from https://cirpy.readthedocs.io/en/latest/guide/resolvers.html
            'smiles',
            'stdinchikey',
            'stdinchi',
            'ncicadd_identifier', # (for FICTS, FICuS, uuuuu)
            'hashisy',
            'cas_number',
            'name', # this is not documented but DOES work
            'name_by_opsin',
            'name_by_cir',
        }

    @requires_modules('cirpy', missing_module_error=cirpy_error)
    def _get_property(self, property_name : str, identifier : str, namespace : Optional[str]=None, **kwargs):
        '''Resolve "identifier" into the representation "property_name" via cirpy.resolve()
        When "namespace" is None, no resolver restriction is passed along to cirpy'''
        import cirpy

        return cirpy.resolve(
            input=identifier,
            representation=property_name,
            resolvers=None if namespace is None else [namespace], # a [None] resolver list is not a valid list of resolver names
            **kwargs,
        )
## PubChem
# error object handed to requires_modules() for the methods below which import pubchempy
pubchempy_error = MissingPrerequisitePackage(
    importing_package_name=__spec__.name,
    use_case='Querying the PubChem Compound database',
    install_link='https://pubchempy.readthedocs.io/en/latest/guide/install.html',
    dependency_name='pubchempy',
    dependency_name_formal='PubChemPy',
)
class PubChemQueryStrategy(ChemDBServiceQueryStrategy):
    '''
    Implementation of chemical query requests to PubChem via the PUG REST API
    (https://pubchem.ncbi.nlm.nih.gov/docs/pug-rest)
    '''
    service_name : ClassVar[str] = 'PubChem'

    @classmethod
    def dependencies(cls):
        '''Names of the packages which must be installed for queries to this service to work'''
        return ['pubchempy']

    @classmethod
    def is_online(cls) -> bool:
        '''Check whether the PUG REST API responds to a sample query without a server-side error
        Returns False (rather than raising) if the service cannot be reached at all or does not respond in time'''
        try:
            response = requests.get(
                'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/aspirin/property/IUPACName/TXT', # sample query which is well-formatted
                timeout=10, # bounded wait so an unresponsive server cannot hang the caller
            )
        except requests.RequestException: # connection failures, timeouts, etc. all mean the service is not usable right now
            return False
        return response.status_code < 500 # NOTE: enough to just check server-side error for now, but could be more stringent and check if ==200

    @classmethod
    @requires_modules('pubchempy', missing_module_error=pubchempy_error)
    def queryable_properties(cls) -> set[str]:
        '''Set of property names which can be requested from PubChem, built from the keys and values of pubchempy's PROPERTY_MAP'''
        from pubchempy import PROPERTY_MAP

        return set.union(
            set(PROPERTY_MAP.keys()),
            set(PROPERTY_MAP.values()),
            set(proper_name.casefold() for proper_name in PROPERTY_MAP.values()), # include case-insensitive versions of names for completeness
            {'fingerprint2d'}, # also taken from webchem: https://github.com/ropensci/webchem/blob/master/R/pubchem.R#L377C21-L392C55
        )

    @classmethod
    def queryable_namespaces(cls) -> set[str]:
        '''Set of identifier types which can be searched by this service'''
        return { # obtained from https://pubchem.ncbi.nlm.nih.gov/docs/pug-rest#section=Input
            'cid',
            'name',
            'smiles',
            'inchi',
            'sdf',
            'inchikey',
            'formula',
            'listkey',
        }

    @requires_modules('pubchempy', missing_module_error=pubchempy_error)
    def _get_property(self, property_name : str, identifier : str, namespace : Optional[str]='smiles', **kwargs) -> Optional[Any]:
        '''Query a property via pubchempy.get_properties() and extract just the requested field from each returned record

        Raises requests.HTTPError (chained to the original error) if pubchempy reports a failure
        '''
        import pubchempy as pcp

        try:
            pubchem_result = pcp.get_properties(
                properties=property_name,
                identifier=identifier,
                namespace=namespace,
                **kwargs,
            )
        except pcp.PubChemPyError as pubchem_error:
            # re-raised as a requests error to keep the Strategy interface oblivious to pubchempy (i.e. in case it is not installed);
            # the message and cause are carried along so the original failure remains diagnosable
            raise requests.HTTPError(str(pubchem_error)) from pubchem_error

        if pubchem_result:
            property_name_no_under = property_name.replace('_', '') # remove underscores to compatibilize naming hits (property names returned from PubChem will never contain underscores)
            pubchem_result = [ # extract the requested property field from the full return fields pubchempy returns
                case_insensitive_query_result[property_name_no_under] # extract property value from extraneous CID (and any other) info
                    for case_insensitive_query_result in map(CaseInsensitiveDict, pubchem_result) # allows case-insensitive matching to query names
                        if property_name_no_under in case_insensitive_query_result # skip if return doesn't contain the info we specifically requested (happens occasionally for some reason)
            ]
        return pubchem_result
# UTILITY FUNCTIONS EMPLOYING GENERIC STRATEG(Y/IES)
def get_chemical_property(
        property_name : str,
        identifier : str,
        namespace : str='smiles',
        keep_first_only : bool=True,
        allow_null_return : bool=False,
        fail_quietly : bool=False,
        services : Optional[Sequence['ChemDBServiceQueryStrategy']]=None,
        **kwargs,
    ) -> Optional[Any]:
    '''Attempt to fetch a molecular property from a variety of chemical database services, either
    provided manually (in the order they should be checked) or ALL implemented service queries by default

    Will return the first valid returned result or, if all services fail, raise Exception

    Parameters
    ----------
    property_name : str
        Name of the property to fetch
    identifier : str
        Identifier of the molecule to look up
    namespace : str, default 'smiles'
        Identification type of "identifier"
    keep_first_only : bool, default True
        Forwarded to each service's get_property()
    allow_null_return : bool, default False
        Forwarded to each service's get_property()
    fail_quietly : bool, default False
        If True, log an error and return None when no service fulfills the query, rather than raising
    services : Optional[Sequence[ChemDBServiceQueryStrategy]], default None
        Strategy instances (or strategy types, which are instantiated with no arguments) to try in order
        If None, one instance of every registered ChemDBServiceQueryStrategy subclass is tried
    **kwargs
        Forwarded to each service's get_property()

    Raises
    ------
    IndexError
        If there are no services to try
    TypeError
        If an entry of "services" is neither a ChemDBServiceQueryStrategy instance nor a type producing one
    ChemicalDataQueryFailed
        If no service fulfills the query and "fail_quietly" is False
    '''
    # determine services which should be queried
    if services is None:
        services = [chem_query_strat_type() for chem_query_strat_type in ChemDBServiceQueryStrategy.subclass_registry.values()]
    if not services: # check if "services" turns out to be an empty collection (either as-passed or because no subclasses are implemented when defaulting)
        raise IndexError('Must provide at least one chemical database querying strategy to "services"')
    n_services_to_try : int = len(services)

    # query services sequentially in order of appearance
    for i, service in enumerate(services, start=1):
        ## validate type of service strategies
        if isinstance(service, type):
            service = service() # allows ChemDBServiceQueryStrategy types to be passed in lieu of instances
        if not isinstance(service, ChemDBServiceQueryStrategy):
            # NOTE: the type name must be read off of type(service); the instance itself carries no such attribute
            raise TypeError(f'Services must be specified as {ChemDBServiceQueryStrategy.__name__} instances, not objects of type {type(service).__name__}')

        ## attempt to query result from service
        LOGGER.info(f'Attempting chemical property query to service {i}/{n_services_to_try} ("{service.service_name}"):')
        try:
            prop_val = service.get_property(
                property_name,
                identifier,
                namespace,
                keep_first_only=keep_first_only,
                allow_null_return=allow_null_return,
                **kwargs,
            )
        except requests.HTTPError:
            LOGGER.error(f'Query to {service.service_name} failed, either due to connection timeout or invalid request')
            continue
        except (InvalidPropertyError, NullPropertyResponse): # skip over invalid property names (keep trying other services rather than failing)
            # log messages baked in to respective raises for these custom exceptions
            continue
        else:
            return prop_val # first service to answer successfully wins

    # only reachable when none of the provided services turned up fruitful
    fail_msg = 'Query could not be fulfilled by any of the provided chemical query services'
    if fail_quietly:
        LOGGER.error(f'{fail_msg}; returning NoneType')
        return None
    # fail vocally if none of the services can fulfill the property request
    raise ChemicalDataQueryFailed(fail_msg)