Skip to content

molecular_synthesis

genlm.eval.domains.molecular_synthesis

MolecularSynthesisInstance

Bases: Instance

Schema for molecular synthesis instance.

Source code in genlm/eval/domains/molecular_synthesis.py
class MolecularSynthesisInstance(Instance):
    """Schema for molecular synthesis instance.

    One instance carries the group of molecule strings that the default
    prompt formatter renders as ``Molecule: ...`` lines in a single prompt.
    """

    # Molecule strings for this instance; `MolecularSynthesisDataset.from_smiles`
    # fills this with lines read from a SMILES file.
    molecules: list[str]

MolecularSynthesisDataset

Bases: Dataset[MolecularSynthesisInstance]

Dataset for molecular synthesis evaluation.

Source code in genlm/eval/domains/molecular_synthesis.py
class MolecularSynthesisDataset(Dataset[MolecularSynthesisInstance]):
    """Dataset for molecular synthesis evaluation.

    The dataset stores groups of molecule strings; iterating it wraps every
    group in a ``MolecularSynthesisInstance``.
    """

    def __init__(self, prompt_molecules):
        """Initialize the dataset with groups of molecules.

        Args:
            prompt_molecules: List of lists of molecules which will be used to generate prompts.
                Each inner list becomes one instance.
        """
        self.prompt_molecules = prompt_molecules

    def __len__(self):
        """Return the number of instances (molecule groups) in the dataset."""
        return len(self.prompt_molecules)

    @classmethod
    def from_smiles(cls, smiles_path, n_molecules=20, n_instances=100, seed=1234):
        """Load molecules from a SMILES file and sample prompt groups from them.

        Args:
            smiles_path (str): Path to the .smi file containing SMILES strings.
            n_molecules (int): Number of molecules to sample for each instance.
            n_instances (int): Number of instances to sample.
            seed (int): Seed for the random number generator.

        Returns:
            MolecularSynthesisDataset: Dataset initialized with molecules from the SMILES.

        Raises:
            OSError: If the file cannot be opened.
            ValueError: If ``n_molecules`` is larger than the number of lines in the file.
        """
        # Use a context manager so the file handle is closed deterministically
        # instead of being left for the garbage collector.
        with open(smiles_path) as smiles_file:
            # Lines are kept exactly as read, including any trailing newline.
            molecules = smiles_file.readlines()

        # Seed the module-level generator so the sampled groups are reproducible.
        random.seed(seed)
        # Each group is drawn without replacement; groups are drawn independently.
        prompt_molecules = [
            random.sample(molecules, n_molecules) for _ in range(n_instances)
        ]
        return cls(prompt_molecules)

    def __iter__(self):
        """Iterate over the instances of the dataset.

        Returns:
            Iterator[MolecularSynthesisInstance]: Iterator over molecular synthesis instances,
                with ``instance_id`` set to the position of the group in the dataset.
        """
        for i, molecules in enumerate(self.prompt_molecules):
            yield MolecularSynthesisInstance(molecules=molecules, instance_id=i)

    @property
    def schema(self):
        """Get the schema class for this dataset.

        Returns:
            type[MolecularSynthesisInstance]: The Pydantic model class for molecular synthesis instances.
        """
        return MolecularSynthesisInstance

__init__(prompt_molecules)

Initialize the dataset with a list of molecules.

Parameters:

Name Type Description Default
prompt_molecules

List of lists of molecules which will be used to generate prompts.

required
Source code in genlm/eval/domains/molecular_synthesis.py
def __init__(self, prompt_molecules):
    """Initialize the dataset with groups of molecules.

    Args:
        prompt_molecules: List of lists of molecules which will be used to generate prompts.
            The object is stored as-is (no copy) on ``self.prompt_molecules``.
    """
    self.prompt_molecules = prompt_molecules

from_smiles(smiles_path, n_molecules=20, n_instances=100, seed=1234) classmethod

Load molecules from a SMILES file.

Parameters:

Name Type Description Default
smiles_path str

Path to the .smi file containing SMILES strings.

required
n_molecules int

Number of molecules to sample for each instance.

20
n_instances int

Number of instances to sample.

100
seed int

Seed for the random number generator.

1234

Returns:

Name Type Description
MolecularSynthesisDataset

Dataset initialized with molecules from the SMILES.

Source code in genlm/eval/domains/molecular_synthesis.py
@classmethod
def from_smiles(cls, smiles_path, n_molecules=20, n_instances=100, seed=1234):
    """Load molecules from a SMILES file and sample prompt groups from them.

    Args:
        smiles_path (str): Path to the .smi file containing SMILES strings.
        n_molecules (int): Number of molecules to sample for each instance.
        n_instances (int): Number of instances to sample.
        seed (int): Seed for the random number generator.

    Returns:
        MolecularSynthesisDataset: Dataset initialized with molecules from the SMILES.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If ``n_molecules`` is larger than the number of lines in the file.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(smiles_path) as smiles_file:
        # Lines are kept exactly as read, including any trailing newline.
        molecules = smiles_file.readlines()

    # Seed the module-level generator so the sampled groups are reproducible.
    random.seed(seed)
    # Each group is drawn without replacement; groups are drawn independently.
    prompt_molecules = [
        random.sample(molecules, n_molecules) for _ in range(n_instances)
    ]
    return cls(prompt_molecules)

__iter__()

Iterate over molecules.

Returns:

Type Description

Iterator[MolecularSynthesisInstance]: Iterator over molecular synthesis instances.

Source code in genlm/eval/domains/molecular_synthesis.py
def __iter__(self):
    """Yield one instance per stored molecule group.

    Returns:
        Iterator[MolecularSynthesisInstance]: Instances in storage order; each
            ``instance_id`` is the group's position in ``self.prompt_molecules``.
    """
    yield from (
        MolecularSynthesisInstance(molecules=group, instance_id=position)
        for position, group in enumerate(self.prompt_molecules)
    )

schema property

Get the schema class for this dataset.

Returns:

Type Description

type[MolecularSynthesisInstance]: The Pydantic model class for molecular synthesis instances.

MolecularSynthesisEvaluator

Bases: Evaluator[MolecularSynthesisInstance]

Evaluator for molecular synthesis.

Source code in genlm/eval/domains/molecular_synthesis.py
class MolecularSynthesisEvaluator(Evaluator[MolecularSynthesisInstance]):
    """Evaluator for molecular synthesis."""

    def evaluate_sample(self, instance, response):
        """Score a single model response with ``cached_eval``.

        Args:
            instance (MolecularSynthesisInstance): The molecular synthesis instance
                being evaluated. It is not read by this method.
            response (str): The model's response text; surrounding whitespace is
                stripped before evaluation.

        Returns:
            (EvaluationResult): ``score`` is the second value returned by
                ``cached_eval`` and ``desc`` is ``"valid"`` or ``"invalid"``
                according to the first.
        """
        is_valid, score = cached_eval(response.strip())
        if is_valid:
            label = "valid"
        else:
            label = "invalid"
        return EvaluationResult(score=score, desc=label)

evaluate_sample(instance, response)

Evaluate a model response for a molecular synthesis instance and return its score.

Parameters:

Name Type Description Default
instance MolecularSynthesisInstance

The molecular synthesis instance being evaluated.

required
response str

The model's response text.

required

Returns:

Type Description
EvaluationResult

The score computed for the response, with a description of "valid" or "invalid".

Source code in genlm/eval/domains/molecular_synthesis.py
def evaluate_sample(self, instance, response):
    """Score a single model response with ``cached_eval``.

    Args:
        instance (MolecularSynthesisInstance): The molecular synthesis instance
            being evaluated. It is not read by this method.
        response (str): The model's response text; surrounding whitespace is
            stripped before evaluation.

    Returns:
        (EvaluationResult): ``score`` is the second value returned by
            ``cached_eval`` and ``desc`` is ``"valid"`` or ``"invalid"``
            according to the first.
    """
    is_valid, score = cached_eval(response.strip())
    if is_valid:
        label = "valid"
    else:
        label = "invalid"
    return EvaluationResult(score=score, desc=label)

default_prompt_formatter(tokenizer, instance, use_chat_format=False, system_prompt=SYSTEM_PROMPT)

Default prompt formatter for molecular synthesis.

Parameters:

Name Type Description Default
tokenizer Tokenizer

The tokenizer to use.

required
instance MolecularSynthesisInstance

The instance to format.

required
use_chat_format bool

Whether to use chat format.

False
system_prompt str

The system prompt to use.

SYSTEM_PROMPT

Returns:

Type Description
list[int]

The prompt ids.

Source code in genlm/eval/domains/molecular_synthesis.py
def default_prompt_formatter(
    tokenizer,
    instance,
    use_chat_format=False,
    system_prompt=SYSTEM_PROMPT,
):
    """Build and tokenize the plain-text prompt for one instance.

    The prompt is the system prompt, then one ``Molecule: <molecule>`` line per
    molecule of the instance, then a trailing ``Molecule:`` for the model to
    complete.

    Args:
        tokenizer (Tokenizer): The tokenizer to use; its ``encode`` method is called on the prompt text.
        instance (MolecularSynthesisInstance): The instance to format.
        use_chat_format (bool): Whether to use chat format. Not supported for this domain.
        system_prompt (str): The system prompt to use.

    Returns:
        (list[int]): The prompt ids.

    Raises:
        NotImplementedError: If ``use_chat_format`` is true.
    """
    # Guard clause: only the plain (non-chat) format exists for this domain.
    if use_chat_format:
        raise NotImplementedError(
            "Chat format not implemented for molecular synthesis."
        )

    example_lines = "\n".join(
        "Molecule: " + molecule for molecule in instance.molecules
    )
    prompt_text = system_prompt + "\n" + example_lines + "\nMolecule:"
    return tokenizer.encode(prompt_text)