molecular_synthesis

`genlm.eval.domains.molecular_synthesis`

`MolecularSynthesisInstance`

Bases: Instance

Schema for molecular synthesis instance.

Source code in genlm/eval/domains/molecular_synthesis.py

class MolecularSynthesisInstance(Instance):
    """Schema for molecular synthesis instance.

    Extends ``Instance`` with the list of molecule strings belonging to one
    instance.
    """

    # Molecule strings that make up this instance.
    molecules: list[str]

`MolecularSynthesisDataset`

Bases: Dataset[MolecularSynthesisInstance]

Dataset for molecular synthesis evaluation.

Source code in genlm/eval/domains/molecular_synthesis.py

class MolecularSynthesisDataset(Dataset[MolecularSynthesisInstance]):
    """Dataset for molecular synthesis evaluation.

    The dataset stores a list of molecule lists; each inner list is exposed as
    one ``MolecularSynthesisInstance`` when the dataset is iterated.
    """

    def __init__(self, prompt_molecules):
        """Initialize the dataset with lists of prompt molecules.

        Args:
            prompt_molecules: List of lists of molecules which will be used to generate prompts.
                Each inner list becomes one instance.
        """
        self.prompt_molecules = prompt_molecules

    def __len__(self):
        """Return the number of instances (inner molecule lists) in the dataset."""
        return len(self.prompt_molecules)

    @classmethod
    def from_smiles(cls, smiles_path, n_molecules=20, n_instances=100, seed=1234):
        """Build a dataset by sampling molecules from a SMILES file.

        Every line of the file is treated as one molecule. Lines are kept
        exactly as read, including their trailing newline character.

        Note:
            This reseeds the module-level ``random`` generator with ``seed``.

        Args:
            smiles_path (str): Path to the .smi file containing SMILES strings.
            n_molecules (int): Number of molecules to sample (without
                replacement) for each instance.
            n_instances (int): Number of instances to sample.
            seed (int): Seed for the random number generator.

        Returns:
            MolecularSynthesisDataset: Dataset initialized with molecules from the SMILES.

        Raises:
            OSError: If the file cannot be opened.
            ValueError: If ``n_molecules`` exceeds the number of lines in the file.
        """
        # Read inside a context manager so the file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(smiles_path) as smiles_file:
            molecules = smiles_file.readlines()

        prompt_molecules = []
        random.seed(seed)
        for _ in range(n_instances):
            # Sample indices (not lines) so the draw sequence for a given seed
            # stays the same as before.
            molecule_ids = random.sample(range(len(molecules)), n_molecules)
            prompt_molecules.append([molecules[i] for i in molecule_ids])
        return cls(prompt_molecules)

    def __iter__(self):
        """Iterate over molecular synthesis instances.

        Instances are numbered by their position in ``prompt_molecules``.

        Returns:
            Iterator[MolecularSynthesisInstance]: Iterator over molecular synthesis instances.
        """
        for i, molecules in enumerate(self.prompt_molecules):
            yield MolecularSynthesisInstance(molecules=molecules, instance_id=i)

    @property
    def schema(self):
        """Get the schema class for this dataset.

        Returns:
            type[MolecularSynthesisInstance]: The Pydantic model class for molecular synthesis instances.
        """
        return MolecularSynthesisInstance

`__init__(prompt_molecules)`

Initialize the dataset with a list of molecules.

Parameters:

Name	Type	Description	Default
`prompt_molecules`		List of lists of molecules which will be used to generate prompts.	required

Source code in genlm/eval/domains/molecular_synthesis.py

def __init__(self, prompt_molecules):
    """Store the molecule lists that back this dataset.

    Args:
        prompt_molecules: List of lists of molecules; each inner list is used
            to generate the prompt for one instance.
    """
    self.prompt_molecules = prompt_molecules

`from_smiles(smiles_path, n_molecules=20, n_instances=100, seed=1234)` `classmethod`

Load molecules from a SMILES file.

Parameters:

Name	Type	Description	Default
`smiles_path`	`str`	Path to the .smi file containing SMILES strings.	required
`n_molecules`	`int`	Number of molecules to sample for each instance.	`20`
`n_instances`	`int`	Number of instances to sample.	`100`
`seed`	`int`	Seed for the random number generator.	`1234`

Returns:

Name	Type	Description
`MolecularSynthesisDataset`		Dataset initialized with molecules from the SMILES.

Source code in genlm/eval/domains/molecular_synthesis.py

@classmethod
def from_smiles(cls, smiles_path, n_molecules=20, n_instances=100, seed=1234):
    """Build a dataset by sampling molecules from a SMILES file.

    Every line of the file is treated as one molecule. Lines are kept exactly
    as read, including their trailing newline character.

    Note:
        This reseeds the module-level ``random`` generator with ``seed``.

    Args:
        smiles_path (str): Path to the .smi file containing SMILES strings.
        n_molecules (int): Number of molecules to sample (without
            replacement) for each instance.
        n_instances (int): Number of instances to sample.
        seed (int): Seed for the random number generator.

    Returns:
        MolecularSynthesisDataset: Dataset initialized with molecules from the SMILES.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If ``n_molecules`` exceeds the number of lines in the file.
    """
    # Read inside a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(smiles_path) as smiles_file:
        molecules = smiles_file.readlines()

    prompt_molecules = []
    random.seed(seed)
    for _ in range(n_instances):
        # Sample indices (not lines) so the draw sequence for a given seed
        # stays the same as before.
        molecule_ids = random.sample(range(len(molecules)), n_molecules)
        prompt_molecules.append([molecules[i] for i in molecule_ids])
    return cls(prompt_molecules)

`__iter__()`

Iterate over molecules.

Returns:

Type	Description
`Iterator[MolecularSynthesisInstance]`	Iterator over molecular synthesis instances.

Source code in genlm/eval/domains/molecular_synthesis.py

def __iter__(self):
    """Yield one instance per stored list of prompt molecules.

    Each instance receives its position in ``self.prompt_molecules`` as its
    ``instance_id``.

    Returns:
        Iterator[MolecularSynthesisInstance]: Iterator over molecular synthesis instances.
    """
    yield from (
        MolecularSynthesisInstance(molecules=group, instance_id=index)
        for index, group in enumerate(self.prompt_molecules)
    )

`schema` `property`

Get the schema class for this dataset.

Returns:

Type	Description
`type[MolecularSynthesisInstance]`	The Pydantic model class for molecular synthesis instances.

`MolecularSynthesisEvaluator`

Bases: Evaluator[MolecularSynthesisInstance]

Evaluator for molecular synthesis.

Source code in genlm/eval/domains/molecular_synthesis.py

class MolecularSynthesisEvaluator(Evaluator[MolecularSynthesisInstance]):
    """Evaluator for molecular synthesis."""

    def evaluate_sample(self, instance, response):
        """Score a generated molecule response.

        The response is stripped of surrounding whitespace and passed to
        ``cached_eval``, which returns a validity flag and a score.

        Args:
            instance (MolecularSynthesisInstance): The instance being evaluated.
                It is not used when computing the score.
            response (str): The model's response text.

        Returns:
            (EvaluationResult): Result whose ``score`` is the score returned by
                ``cached_eval`` and whose ``desc`` is ``"valid"`` or
                ``"invalid"`` according to the validity flag.
        """
        valid, acc = cached_eval(response.strip())
        desc = "valid" if valid else "invalid"
        return EvaluationResult(score=acc, desc=desc)

`evaluate_sample(instance, response)`

Score a generated molecule response using `cached_eval`.

Parameters:

Name	Type	Description	Default
`instance`	`MolecularSynthesisInstance`	The instance being evaluated (not used when computing the score).	required
`response`	`str`	The model's response text.	required

Returns:

Type	Description
`EvaluationResult`	Result whose score comes from `cached_eval` and whose description is "valid" or "invalid".

Source code in genlm/eval/domains/molecular_synthesis.py

def evaluate_sample(self, instance, response):
    """Score a generated molecule response.

    The response is stripped of surrounding whitespace and passed to
    ``cached_eval``, which returns a validity flag and a score.

    Args:
        instance (MolecularSynthesisInstance): The instance being evaluated.
            It is not used when computing the score.
        response (str): The model's response text.

    Returns:
        (EvaluationResult): Result whose ``score`` is the score returned by
            ``cached_eval`` and whose ``desc`` is ``"valid"`` or ``"invalid"``
            according to the validity flag.
    """
    valid, acc = cached_eval(response.strip())
    desc = "valid" if valid else "invalid"
    return EvaluationResult(score=acc, desc=desc)

`default_prompt_formatter(tokenizer, instance, use_chat_format=False, system_prompt=SYSTEM_PROMPT)`

Default prompt formatter for molecular synthesis.

Parameters:

Name	Type	Description	Default
`tokenizer`	`Tokenizer`	The tokenizer to use.	required
`instance`	`MolecularSynthesisInstance`	The instance to format.	required
`use_chat_format`	`bool`	Whether to use chat format.	`False`
`system_prompt`	`str`	The system prompt to use.	`SYSTEM_PROMPT`

Returns:

Type	Description
`list[int]`	The prompt ids.

Source code in genlm/eval/domains/molecular_synthesis.py

def default_prompt_formatter(
    tokenizer,
    instance,
    use_chat_format=False,
    system_prompt=SYSTEM_PROMPT,
):
    """Build and tokenize the default molecular synthesis prompt.

    The prompt is the system prompt, followed by one ``Molecule: ...`` line
    per molecule in the instance, followed by a final open ``Molecule:`` line.

    Args:
        tokenizer (Tokenizer): The tokenizer to use.
        instance (MolecularSynthesisInstance): The instance to format.
        use_chat_format (bool): Whether to use chat format.
        system_prompt (str): The system prompt to use.

    Returns:
        (list[int]): The prompt ids.

    Raises:
        NotImplementedError: If ``use_chat_format`` is true.
    """
    # Chat formatting is not supported for this domain; bail out early.
    if use_chat_format:
        raise NotImplementedError(
            "Chat format not implemented for molecular synthesis."
        )

    example_lines = "\n".join(
        "Molecule: " + molecule for molecule in instance.molecules
    )
    prompt_text = system_prompt + "\n" + example_lines + "\nMolecule:"
    return tokenizer.encode(prompt_text)

molecular_synthesis

genlm.eval.domains.molecular_synthesis

MolecularSynthesisInstance

MolecularSynthesisDataset

__init__(prompt_molecules)

from_smiles(smiles_path, n_molecules=20, n_instances=100, seed=1234) classmethod

__iter__()

schema property

MolecularSynthesisEvaluator

evaluate_sample(instance, response)

default_prompt_formatter(tokenizer, instance, use_chat_format=False, system_prompt=SYSTEM_PROMPT)

`genlm.eval.domains.molecular_synthesis`

`MolecularSynthesisInstance`

`MolecularSynthesisDataset`

`__init__(prompt_molecules)`

`from_smiles(smiles_path, n_molecules=20, n_instances=100, seed=1234)` `classmethod`

`__iter__()`

`schema` `property`

`MolecularSynthesisEvaluator`

`evaluate_sample(instance, response)`

`default_prompt_formatter(tokenizer, instance, use_chat_format=False, system_prompt=SYSTEM_PROMPT)`