# Source code for polymerist.smileslib.functgroups._daylight_scrape

'''Backend web-scraping to (re)build SMARTS lookup table from the Daylight SMARTS official site'''

__author__ = 'Timotej Bernat'
__email__ = 'timotej.bernat@colorado.edu'

from dataclasses import dataclass

import requests
from bs4 import BeautifulSoup

import pandas as pd



# [docs] (link marker left over from the rendered documentation page)
@dataclass(frozen=True)
class FnGroupSMARTSEntry:
    '''Immutable (and therefore hashable) record of a single SMARTS pattern listed in the Daylight SMARTS registry

    Attributes
    ----------
    category : str
        Identifier of the section the pattern is listed under (the scraper takes it from the "name" attribute of the preceding anchor tag)
    category_desc : str
        Heading text of that section
    group_type : str
        Sub-heading text the pattern is listed under
    group_name : str
        Name of the chemical group that the pattern matches
    SMARTS : str
        The SMARTS pattern itself
    SMARTS_desc : str
        Accompanying description of the pattern; empty string when none was provided
    '''
    # section-level info
    category: str
    category_desc: str

    # group-level info
    group_type: str
    group_name: str

    # pattern-level info
    SMARTS: str
    SMARTS_desc: str


# Address of the Daylight SMARTS examples page; used as the default page to scrape by scrape_SMARTS()
DAYLIGHT_URL = 'https://www.daylight.com/dayhtml_tutorials/languages/smarts/smarts_examples.html'


# [docs] (link marker left over from the rendered documentation page)
def scrape_SMARTS(url : str=DAYLIGHT_URL) -> pd.DataFrame:
    '''Scrape SMARTS strings and accompanying descriptions and categories off of Daylight SMARTS official site

    Parameters
    ----------
    url : str
        Address of the page to scrape; defaults to DAYLIGHT_URL

    Returns
    -------
    pd.DataFrame
        One row per unique FnGroupSMARTSEntry found on the page, with one column per field of that dataclass
        Rows appear in the order in which the entries were first encountered while parsing

    Raises
    ------
    requests.RequestException
        If the page can't be retrieved (connection failure, timeout, or HTTP error status)
    '''
    response = requests.get(url, timeout=30) # bound the wait so that an unresponsive server can't hang the caller indefinitely
    response.raise_for_status() # fail loudly rather than parsing an error page as though it were the SMARTS table
    soup = BeautifulSoup(response.content, 'html.parser')

    entries = {} # dict used as an insertion-ordered set: de-duplicates like a set, but keeps row order reproducible between runs
    for desc_list in soup.find_all('dl'):
        category = desc_list.find_previous('a')['name']
        category_desc = desc_list.find_previous('h2').text

        group_type = desc_list.find_previous('h3').text
        for desc_term in desc_list.find_all('dt')[::-1]: # deal with annoying nesting by iterating over innermost terms first (i.e. in reverse)
            term = desc_term.extract() # remove tag from tree to prevent it from occurring in duplicate in higher terms
            text = list(term.stripped_strings)

            if len(text) == 3:
                group_name, SMARTS, SMARTS_desc = text
            elif len(text) == 2: # no description was provided for this term
                group_name, SMARTS = text
                SMARTS_desc = ''
            else:
                # term doesn't follow the (name, SMARTS[, description]) layout; skip it, rather than
                # registering an entry built from undefined (or the previous term's leftover) values
                continue

            entry = FnGroupSMARTSEntry(category, category_desc, group_type, group_name, SMARTS, SMARTS_desc)
            entries[entry] = None

    return pd.DataFrame.from_records([
        vars(entry)
            for entry in entries
    ])