genlm.eval

Instance

Bases: BaseModel

Base class for dataset instances that conform to a Pydantic schema.

Source code in genlm/eval/core/dataset.py
class Instance(BaseModel):
    """Base class for dataset instances that conform to a Pydantic schema."""

    instance_id: Union[int, str]
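
A task-specific instance is defined by subclassing `Instance` and adding the task's fields as ordinary Pydantic attributes. The sketch below uses a hypothetical arithmetic task; it assumes these classes are importable from `genlm.eval`, as this page's module path suggests:

```python
from genlm.eval import Instance  # assumed import path, per this page's module name

# Hypothetical instance type for an arithmetic task (illustration only).
class ArithmeticInstance(Instance):
    question: str
    answer: int

inst = ArithmeticInstance(instance_id=0, question="2 + 2", answer=4)
print(inst.instance_id, inst.question, inst.answer)
```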

Dataset

Bases: Generic[T], ABC

Base class for datasets that yield instances conforming to a Pydantic schema.

Source code in genlm/eval/core/dataset.py
class Dataset(Generic[T], ABC):
    """Base class for datasets that yield instances conforming to a Pydantic schema."""

    @abstractmethod
    def __iter__(self) -> Iterator[T]:
        """Iterate over dataset instances.

        Returns:
            Iterator[T]: An iterator over instances conforming to schema T.
        """
        pass  # pragma: no cover

    @property
    @abstractmethod
    def schema(self) -> type[T]:
        """Get the Pydantic schema class for this dataset.

        Returns:
            type[T]: The Pydantic model class defining the schema.
        """
        pass  # pragma: no cover

__iter__() abstractmethod

Iterate over dataset instances.

Returns:

| Type | Description |
| --- | --- |
| `Iterator[T]` | An iterator over instances conforming to schema T. |

Source code in genlm/eval/core/dataset.py
@abstractmethod
def __iter__(self) -> Iterator[T]:
    """Iterate over dataset instances.

    Returns:
        Iterator[T]: An iterator over instances conforming to schema T.
    """
    pass  # pragma: no cover

schema abstractmethod property

Get the Pydantic schema class for this dataset.

Returns:

| Type | Description |
| --- | --- |
| `type[T]` | The Pydantic model class defining the schema. |
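
Putting the two abstract members together, a minimal in-memory dataset might look like the following sketch. It reuses the hypothetical `ArithmeticInstance` from above and again assumes the `genlm.eval` import path:

```python
from typing import Iterator, List, Tuple

from genlm.eval import Dataset, Instance  # assumed import path

class ArithmeticInstance(Instance):  # hypothetical, as above
    question: str
    answer: int

class ArithmeticDataset(Dataset[ArithmeticInstance]):
    """Minimal in-memory dataset over (question, answer) pairs."""

    def __init__(self, problems: List[Tuple[str, int]]):
        self.problems = problems

    def __iter__(self) -> Iterator[ArithmeticInstance]:
        # Yield one schema-conforming instance per problem.
        for i, (question, answer) in enumerate(self.problems):
            yield ArithmeticInstance(instance_id=i, question=question, answer=answer)

    @property
    def schema(self) -> type[ArithmeticInstance]:
        return ArithmeticInstance

dataset = ArithmeticDataset([("2 + 2", 4), ("3 * 5", 15)])
print([inst.question for inst in dataset])
```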

Evaluator

Bases: Generic[T], ABC

Base class for evaluators that handle response evaluation.

Source code in genlm/eval/core/evaluator.py
class Evaluator(Generic[T], ABC):
    """Base class for evaluators that handle response evaluation."""

    @abstractmethod
    def evaluate_sample(self, instance, response):
        """Evaluate a single response for correctness.

        Args:
            instance (T): The dataset instance being evaluated.
            response (Any): The model's response, which is given by the response attribute of a `ModelOutput` object.

        Returns:
            (EvaluationResult): The evaluation result.
        """
        pass  # pragma: no cover

    def evaluate_ensemble(self, instance: T, output: ModelOutput) -> Dict[str, Any]:
        """Evaluate the complete ensemble of weighted samples using weighted accuracy.

        Args:
            instance (T): The dataset instance being evaluated.
            output (ModelOutput): The complete model output including ensemble responses.

        Returns:
            (Dict[str, Any]): Dictionary containing evaluation metrics.
        """
        weighted_accuracy = 0.0
        results = []
        for response in output.responses:
            result = self.evaluate_sample(instance, response.response)
            weighted_accuracy += result.score * response.weight
            results.append(
                {
                    "score": result.score,
                    "desc": result.desc,
                    "metadata": result.metadata,
                }
            )

        return {
            "weighted_accuracy": weighted_accuracy,
            "runtime_seconds": output.runtime_seconds,
            "results": results,
        }

evaluate_sample(instance, response) abstractmethod

Evaluate a single response for correctness.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `instance` | `T` | The dataset instance being evaluated. | *required* |
| `response` | `Any` | The model's response, which is given by the `response` attribute of a `ModelOutput` object. | *required* |

Returns:

| Type | Description |
| --- | --- |
| `EvaluationResult` | The evaluation result. |

Source code in genlm/eval/core/evaluator.py
@abstractmethod
def evaluate_sample(self, instance, response):
    """Evaluate a single response for correctness.

    Args:
        instance (T): The dataset instance being evaluated.
        response (Any): The model's response, which is given by the response attribute of a `ModelOutput` object.

    Returns:
        (EvaluationResult): The evaluation result.
    """
    pass  # pragma: no cover

evaluate_ensemble(instance, output)

Evaluate the complete ensemble of weighted samples using weighted accuracy.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `instance` | `T` | The dataset instance being evaluated. | *required* |
| `output` | `ModelOutput` | The complete model output including ensemble responses. | *required* |

Returns:

| Type | Description |
| --- | --- |
| `Dict[str, Any]` | Dictionary containing evaluation metrics. |

Source code in genlm/eval/core/evaluator.py
def evaluate_ensemble(self, instance: T, output: ModelOutput) -> Dict[str, Any]:
    """Evaluate the complete ensemble of weighted samples using weighted accuracy.

    Args:
        instance (T): The dataset instance being evaluated.
        output (ModelOutput): The complete model output including ensemble responses.

    Returns:
        (Dict[str, Any]): Dictionary containing evaluation metrics.
    """
    weighted_accuracy = 0.0
    results = []
    for response in output.responses:
        result = self.evaluate_sample(instance, response.response)
        weighted_accuracy += result.score * response.weight
        results.append(
            {
                "score": result.score,
                "desc": result.desc,
                "metadata": result.metadata,
            }
        )

    return {
        "weighted_accuracy": weighted_accuracy,
        "runtime_seconds": output.runtime_seconds,
        "results": results,
    }
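
Only `evaluate_sample` needs to be implemented in a subclass; `evaluate_ensemble` then sums `score * weight` over `output.responses` to produce the `weighted_accuracy` shown above. A minimal exact-match evaluator might look like this sketch (the `answer` field and the import path are assumptions carried over from the hypothetical examples above):

```python
from genlm.eval import EvaluationResult, Evaluator  # assumed import path

class ExactMatchEvaluator(Evaluator):
    """Hypothetical evaluator: scores 1.0 when the response matches the expected answer."""

    def evaluate_sample(self, instance, response):
        correct = str(response).strip() == str(instance.answer)
        return EvaluationResult(
            score=1.0 if correct else 0.0,
            desc="exact match" if correct else "mismatch",
            metadata={"expected": instance.answer},
        )

evaluator = ExactMatchEvaluator()
```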

EvaluationResult

Bases: BaseModel

Class for storing evaluation results.

Source code in genlm/eval/core/evaluator.py
class EvaluationResult(BaseModel):
    """Class for storing evaluation results."""

    score: float
    desc: str
    metadata: Dict[str, Any] = {}

ModelOutput

Bases: BaseModel

Collection of model responses with execution metadata.

Source code in genlm/eval/core/model.py
class ModelOutput(BaseModel):
    """Collection of model responses with execution metadata."""

    responses: List[ModelResponse]
    runtime_seconds: Optional[float] = None
    metadata: Optional[Dict[str, Any]] = None

ModelResponse

Bases: BaseModel

Single model response containing generated text, probability, and optional metadata.

Source code in genlm/eval/core/model.py
class ModelResponse(BaseModel):
    """Single model response containing generated text, probability, and optional metadata."""

    response: Any
    weight: float
    metadata: Optional[Dict[str, Any]] = None
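
A `ModelOutput` is simply a list of weighted `ModelResponse` objects plus optional timing metadata. In the sketch below the weights are chosen to sum to 1 so that `evaluate_ensemble`'s `weighted_accuracy` behaves like a weighted average (the import path is again an assumption):

```python
from genlm.eval import ModelOutput, ModelResponse  # assumed import path

output = ModelOutput(
    responses=[
        ModelResponse(response="4", weight=0.7, metadata={"note": "illustrative sample"}),
        ModelResponse(response="5", weight=0.3),
    ],
    runtime_seconds=1.3,
)
```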

run_evaluation(dataset, model, evaluator, output_dir=None, n_replicates=1, overwrite_results=False, overwrite_outputs=False, max_instances=float('inf'), verbosity=0) async

Run evaluation on a dataset using the provided model and evaluator.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `dataset` | `Dataset` | The dataset to evaluate on. | *required* |
| `model` | `ModelAdaptor` | The model adaptor to use for generation. | *required* |
| `evaluator` | `Evaluator` | The evaluator to use for prompt generation and evaluation. | *required* |
| `output_dir` | `str` | The directory to save the results. Defaults to None, in which case results are not saved. | `None` |
| `n_replicates` | `int` | Number of times to replicate the evaluation. Defaults to 1. | `1` |
| `overwrite_results` | `bool` | Whether to overwrite existing evaluation results. Defaults to False. | `False` |
| `overwrite_outputs` | `bool` | Whether to overwrite existing output. Defaults to False. | `False` |
| `max_instances` | `int` | The maximum number of instances to evaluate. Defaults to float("inf"). | `float('inf')` |
| `verbosity` | `int` | The verbosity of the evaluation. Defaults to 0, which is silent. | `0` |

Returns:

| Type | Description |
| --- | --- |
| `Dict[str, Any]` | Aggregated evaluation results. |

Source code in genlm/eval/core/runner.py
async def run_evaluation(
    dataset,
    model,
    evaluator,
    output_dir=None,
    n_replicates=1,
    overwrite_results=False,
    overwrite_outputs=False,
    max_instances=float("inf"),
    verbosity=0,
):
    """Run evaluation on a dataset using the provided model and evaluator.

    Args:
        dataset (Dataset): The dataset to evaluate on.
        model (ModelAdaptor): The model adaptor to use for generation.
        evaluator (Evaluator): The evaluator to use for prompt generation and evaluation.
        output_dir (str, optional): The directory to save the results. Defaults to None, in which case results are not saved.
        n_replicates (int, optional): Number of times to replicate the evaluation. Defaults to 1.
        overwrite_results (bool, optional): Whether to overwrite existing evaluation results. Defaults to False.
        overwrite_outputs (bool, optional): Whether to overwrite existing output. Defaults to False.
        max_instances (int, optional): The maximum number of instances to evaluate. Defaults to float("inf").
        verbosity (int, optional): The verbosity of the evaluation. Defaults to 0, which is silent.

    Returns:
        (Dict[str, Any]): Aggregated evaluation results.
    """
    all_results = []
    all_instance_results = []
    all_instance_outputs = []

    if overwrite_outputs and not overwrite_results:
        raise ValueError(
            "Cannot overwrite outputs without overwriting results. (Hint: set overwrite_results=True)"
        )

    if output_dir is not None and not os.path.exists(output_dir):
        os.makedirs(output_dir)  # pragma: no cover

    n_instances = 0
    for instance in dataset:
        n_instances += 1

        instance_results = []
        instance_outputs = []
        instance_id = instance.instance_id

        for i in range(n_replicates):
            output = None
            result = None
            if output_dir is not None:
                instance_output_path = os.path.join(
                    output_dir, f"{instance_id}-{i}-output.json"
                )
                instance_results_path = os.path.join(
                    output_dir, f"{instance_id}-{i}-results.json"
                )

                # Try loading cached files if not overwriting
                if not overwrite_outputs:
                    output = _load_cached_output(instance_output_path)
                if not overwrite_results:
                    result = _load_cached_results(instance_results_path)
            else:
                instance_output_path = None
                instance_results_path = None

            # Generate new output if needed
            wrote_output = False
            if output is None:
                output = await model(instance, output_dir, replicate=i)
                if instance_output_path is not None:
                    wrote_output = True
                    _save_output(output, instance_output_path)

            # Evaluate if we need new results (no results, overwriting results, or wrote new output)
            if result is None or overwrite_results or wrote_output:
                result = evaluator.evaluate_ensemble(instance, output)
                if instance_results_path is not None:
                    _save_results(result, instance_results_path)

            instance_results.append(result)
            instance_outputs.append(output)

        avg_instance_result = {
            "weighted_accuracy": sum(r["weighted_accuracy"] for r in instance_results)
            / n_replicates,
        }
        all_results.append(avg_instance_result)
        all_instance_results.append(instance_results)
        all_instance_outputs.append(instance_outputs)

        if verbosity > 0:
            print(f"Instance {instance}")
            print(
                f"Mean weighted accuracy (instance): {avg_instance_result['weighted_accuracy']}"
            )
            print(
                f"Mean weighted accuracy (total): {sum(r['weighted_accuracy'] for r in all_results) / len(all_results)}"
            )
            print()

        if n_instances >= max_instances:
            break

    return {
        "average_weighted_accuracy": sum(r["weighted_accuracy"] for r in all_results)
        / len(all_results),
        "n_instances": len(all_results),
        "all_instance_results": all_instance_results,
        "all_instance_outputs": all_instance_outputs,
    }
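
As the source above shows, the model argument is awaited as `model(instance, output_dir, replicate=i)` and is expected to return a `ModelOutput`. A minimal end-to-end sketch, reusing the hypothetical `dataset` and `evaluator` from the earlier examples and assuming the `genlm.eval` import path:

```python
import asyncio

from genlm.eval import ModelOutput, ModelResponse, run_evaluation  # assumed import path

# Hypothetical adaptor: in practice this would call a real generation pipeline.
async def constant_model(instance, output_dir, replicate=0):
    return ModelOutput(
        responses=[ModelResponse(response="4", weight=1.0)],
        runtime_seconds=0.0,
    )

results = asyncio.run(
    run_evaluation(
        dataset=dataset,      # e.g. the ArithmeticDataset sketched above
        model=constant_model,
        evaluator=evaluator,  # e.g. the ExactMatchEvaluator sketched above
        output_dir=None,      # set a directory to cache per-instance outputs/results as JSON
        n_replicates=1,
        verbosity=1,
    )
)
print(results["average_weighted_accuracy"], results["n_instances"])
```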