Source code for polymerist.polymers.building.sequencing

'''For generating and manipulating sequences of symbols which correspond to monomer ordering in blocky and random copolymers'''

__author__ = 'Timotej Bernat'
__email__ = 'timotej.bernat@colorado.edu'

import logging
LOGGER = logging.getLogger(__name__)

from typing import Iterable, Optional
from dataclasses import dataclass, field, asdict

from ...genutils.textual.substrings import shortest_repeating_substring, repeat_string_to_length
from ...genutils.fileutils.jsonio.jsonify import make_jsonifiable
from ..exceptions import EndGroupDominatedChain, InsufficientChainLength, EmptyBlockSequence, PartialBlockSequence


[docs] @make_jsonifiable @dataclass class LinearCopolymerSequencer: ''' For encapsulating information about the sequence of repeat units in a periodic, linear copolymer Also covers, as trivial special cases, homopolymers and alternating copolymers Parameters ---------- sequence_kernel : str A sequence indicating a periodic ordering of monomers in a linear polymer block (e.g. "A", "ABAC", etc) Each unique symbol in the sequence corresponds to a distinct monomer in the block n_repeat_units : int The desired total number of monomers (including terminal monomers) in a polymer chain n_monomers_terminal : int The number of terminal monomers ("end groups") which are to be included in the chain in addition to the middle monomers described by "sequence" Raises ------ EmpyBlockSequence The sequence provided is empty (can't be used to define nonzero-length chain) End GroupDominatedChain The number of terminal monomers exceed the number of total monomers ''' sequence_kernel : str n_repeat_units : int n_repeat_units_terminal : int = 0 # Attribute checks and modifications def __post_init__(self) -> None: if not self.sequence_kernel: raise EmptyBlockSequence('Must provide non-empty sequence kernel to yield a valid (co)polymer sequence') if self.n_repeat_units_middle < 0: raise EndGroupDominatedChain( f'Number of terminal monomers exceeds requested chain length; ({self.n_repeat_units}-mer ' \ f'chain can\'t possibly contain {self.n_repeat_units_terminal} terminal monomers)' )
[docs] def copy(self) -> 'LinearCopolymerSequencer': '''Returns another equivalent instance of the current sequence info more efficiently than a complete deepcopy''' return self.__class__(**asdict(self))
[docs] def reduce(self) -> None: ''' Determines if there is a shorter repeating subsequence making up the current sequence kernel If there is, adjusts the sequence kernel to that minimal sequence; does nothing otherwise Reduction is idempotent, and guarantees that the smallest possible kernel is used when sequencing ''' minimal_subsequence = shortest_repeating_substring(self.sequence_kernel) kernel_period = self.block_size // len(minimal_subsequence) # account for any periodic shortening WITHIN the kernel if kernel_period == 1: LOGGER.info(f'Sequence kernel "{self.sequence_kernel}" is already fully reduced; no changes made') return else: LOGGER.info( f'Sequence kernel "{self.sequence_kernel}" can be further decomposed as {kernel_period}*"{minimal_subsequence}"; ' \ f'Setting kernel to minimal subsequence "{minimal_subsequence}"' ) self.sequence_kernel = minimal_subsequence
[docs] def reduced(self) -> 'LinearCopolymerSequencer': '''Return a sequence-reduced version of the current sequence info''' clone = self.copy() clone.reduce() return clone
# Properties derived from sequence kernel and target chain lengths @property def n_repeat_units_middle(self) -> int: '''Number of middle (i.e. non-terminal) repeat units''' return self.n_repeat_units - self.n_repeat_units_terminal # Whole sequence periods @property def block_size(self) -> int: '''Number of repeat units units in one whole iteration of the kernel block''' return len(self.sequence_kernel) period = block_size @property def n_full_periods(self) -> int: ''' Largest number of complete repetitions of the sequence kernel which, when taken together, contain no more repeats units than the specified number of middle units ''' return self.n_repeat_units_middle // self.block_size # Partial sequence residues @property def n_residual_repeat_units(self) -> int: ''' Difference between number of middle repeat units and units which would occur in maximal full periods of the kernel By construction, is no greater than the block size and is identically zero exactly when a whole number of kernel repeats ''' return self.n_repeat_units_middle % self.block_size n_residual_symbols = n_res = n_residual_repeat_units @property def has_residual(self) -> bool: ''' Whether or not the target number of middle repeat units can be attained by a whole number of kernel repeats ''' return bool(self.n_residual_repeat_units) @property def sequence_residual(self) -> str: '''Partial repeat of the kernel sequence needed to attain the speficied number of middle units''' return self.sequence_kernel[:self.n_residual_repeat_units] residual = sequence_residual ## PROCRUSTEAN sequence alignment
[docs] def procrustean_alignment(self, allow_partial_sequences : bool=False) -> tuple[str, int]: ''' PROCRUSTEAN: Periodic Recurrence Of Cyclic Repeat Unit Sequences, Truncated to an Exact and Arbitrary Number Stretches or truncates the sequence kernel to achieve a target sequence length, cycling through the kernel's period as many times as needed Algorithm produces a sequence string "P" and number of repeats "r" which, taken together, satisfy the following: - The number of units in r repeats of P plus the number of terminal monomers is precisely equal to the target number of monomers - The units in P cycle through the units in S, in the order they appear in S - The number of times S is cycled through in P is always a rational multiple of the length of S If no satisfiable sequence-count pair can be found, raises an appropriate informative exception ''' if not self.has_residual: # the case where the target length happens to consist of a whole-number of repeats of the kernel if self.n_full_periods < 1: # NOTE: if it were up to me, this would be < 0 to allow dimers, but mBuild has forced my hand raise InsufficientChainLength( f'{self.n_repeat_units}-monomer chain cannot accomodate both {self.n_repeat_units_terminal} end groups AND at least 1 middle monomer sequence' ) sequence_procrustean = self.sequence_kernel n_seq_repeats = self.n_full_periods else: if not allow_partial_sequences: raise PartialBlockSequence( f'Partial polymer block sequence required to meet target number of monomers ("{self.residual}" prefix of sequence "{self.sequence_kernel}");\n' \ 'If this is acceptable, set "allow_partial_sequences=True" and try calling build routine again' ) sequence_procrustean = repeat_string_to_length(self.sequence_kernel, target_length=self.n_repeat_units_middle, joiner='') n_seq_repeats = 1 # just repeat the entire mixed-fraction length sequence (no full sequence repeats to exploit) return sequence_procrustean, n_seq_repeats
[docs] def describe_order(self, end_group_names : Optional[Iterable[str]]=None, default_end_group_name : str='END-GROUP') -> str: '''Descriptive string presenting a condensed view of the order of repeat units in the final sequence''' # Assign names for end groups if end_group_names is None: end_group_names = [f'[{default_end_group_name}]']*self.n_repeat_units_terminal else: end_group_names = [f'[{end_group_name}]' for end_group_name in end_group_names] # unpack into list and enforce correct number of names if (num_names_provided := len(end_group_names)) != self.n_repeat_units_terminal: # DEV: consider supporting filling in missing names with default in future raise IndexError(f'Defined sequence info with {self.n_repeat_units_terminal} end groups, but only provided names for {num_names_provided}') # Insert middle omnomer parts as necessary sequence_middle = [] if self.n_full_periods != 0: ## Whole sequence strings sequence_middle.append(f'{self.n_full_periods}*[{self.sequence_kernel}]') if self.has_residual: ## Partial sequence strings sequence_middle.append(f'[{self.residual}]') # Abut with correct amount of end group indicators sequence_parts = end_group_names[:] sequence_parts[1:-1] = sequence_middle return ' + '.join(sequence_parts)
[docs] def describe_tally(self) -> str: '''Descriptive string indicating how all parts of the overall sequence contribute to the target number of repeat units''' desc_seq_counts_parts = [] if self.n_full_periods != 0: ## Whole sequence strings desc_seq_counts_parts.append(f'{self.n_full_periods} whole {self.block_size}-sequence repeat(s)') if self.has_residual: ## Partial sequence strings desc_seq_counts_parts.append(f'a partial {self.n_residual_repeat_units}/{self.block_size} sequence repeat') return ' and '.join(desc_seq_counts_parts)