Source code for polymerist.smileslib.functgroups._daylight_scrape

'''Backend web-scraping to (re)build SMARTS lookup table from the Daylight SMARTS official site'''

__author__ = 'Timotej Bernat'
__email__ = 'timotej.bernat@colorado.edu'

from dataclasses import dataclass

import requests
from bs4 import BeautifulSoup

import pandas as pd


@dataclass(frozen=True)
class FnGroupSMARTSEntry:
    '''For encapsulating SMARTS group info from Daylight SMARTS registry

    Instances are frozen (immutable) and therefore hashable, which allows
    duplicate entries to be collapsed by storing them in a set or as dict keys.
    Field order is significant: it fixes the positional constructor signature.
    '''
    category: str       # identifier of the category the group is listed under
    category_desc: str  # human-readable description of that category
    group_type: str     # sub-heading (type of group) within the category
    group_name: str     # name of the individual functional group
    SMARTS: str         # SMARTS pattern string for the group
    SMARTS_desc: str    # free-text description of the pattern (may be empty)
# URL of the Daylight SMARTS examples page; used as the default `url` argument of scrape_SMARTS()
DAYLIGHT_URL = 'https://www.daylight.com/dayhtml_tutorials/languages/smarts/smarts_examples.html'
def scrape_SMARTS(url : str=DAYLIGHT_URL, timeout : float=30.0) -> pd.DataFrame:
    '''Scrape SMARTS strings and accompanying descriptions and categories off of Daylight SMARTS official site

    Parameters
    ----------
    url : str
        Address of the page to scrape; defaults to DAYLIGHT_URL
    timeout : float
        Number of seconds to wait for the server before giving up on the request

    Returns
    -------
    pd.DataFrame
        One row per unique FnGroupSMARTSEntry found on the page, with one column per entry field
        Rows appear in the order in which entries were first encountered during parsing

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out, fails, or the server answers with an HTTP error status
    '''
    response = requests.get(url, timeout=timeout) # never wait indefinitely on an unresponsive server
    response.raise_for_status() # don't try to parse an HTTP error page as if it were the registry
    soup = BeautifulSoup(response.content, 'html.parser')

    entries = {} # dict keys act as an insertion-ordered set: removes duplicates while keeping output order reproducible
    for desc_list in soup.find_all('dl'):
        category      = desc_list.find_previous('a')['name']
        category_desc = desc_list.find_previous('h2').text
        group_type    = desc_list.find_previous('h3').text

        # deal with annoying nesting by iterating over innermost terms first (i.e. in reverse)
        for desc_term in reversed(desc_list.find_all('dt')):
            term = desc_term.extract() # remove tag from tree to prevent it from occurring in duplicate in higher terms
            text = list(term.stripped_strings)

            if len(text) == 3:
                group_name, SMARTS, SMARTS_desc = text
            elif len(text) == 2: # no description was provided for this term
                group_name, SMARTS = text
                SMARTS_desc = ''
            else:
                # not a recognizable (name, SMARTS[, description]) term; skip it, rather than
                # fall through and build an entry from undefined or stale values of a previous term
                continue

            entry = FnGroupSMARTSEntry(category, category_desc, group_type, group_name, SMARTS, SMARTS_desc)
            entries[entry] = None

    return pd.DataFrame.from_records(
        vars(entry)
            for entry in entries
    )