
evaluator

genlm.eval.core.evaluator

EvaluationResult

Bases: BaseModel

Class for storing evaluation results.

Source code in genlm/eval/core/evaluator.py
class EvaluationResult(BaseModel):
    """Class for storing evaluation results."""

    score: float
    desc: str
    metadata: Dict[str, Any] = {}
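For illustration, an `EvaluationResult` is constructed like any pydantic model. This is a minimal sketch assuming the module path shown above; the field values and metadata keys are hypothetical:

from genlm.eval.core.evaluator import EvaluationResult

# A perfect score with free-form diagnostic metadata.
result = EvaluationResult(
    score=1.0,
    desc="exact match",
    metadata={"expected": "42", "got": "42"},  # hypothetical keys
)
print(result.score)  # 1.0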

Evaluator

Bases: Generic[T], ABC

Base class for evaluators that handle response evaluation.

Source code in genlm/eval/core/evaluator.py
class Evaluator(Generic[T], ABC):
    """Base class for evaluators that handle response evaluation."""

    @abstractmethod
    def evaluate_sample(self, instance, response):
        """Evaluate a single response for correctness.

        Args:
            instance (T): The dataset instance being evaluated.
            response (Any): The model's response, which is given by the response attribute of a `ModelOutput` object.

        Returns:
            (EvaluationResult): The evaluation result.
        """
        pass  # pragma: no cover

    def evaluate_ensemble(self, instance: T, output: ModelOutput) -> Dict[str, Any]:
        """Evaluate the complete ensemble of weighted samples using weighted accuracy.

        Args:
            instance (T): The dataset instance being evaluated.
            output (ModelOutput): The complete model output including ensemble responses.

        Returns:
            (Dict[str, Any]): Dictionary containing evaluation metrics.
        """
        weighted_accuracy = 0.0
        results = []
        for response in output.responses:
            result = self.evaluate_sample(instance, response.response)
            weighted_accuracy += result.score * response.weight
            results.append(
                {
                    "score": result.score,
                    "desc": result.desc,
                    "metadata": result.metadata,
                }
            )

        return {
            "weighted_accuracy": weighted_accuracy,
            "runtime_seconds": output.runtime_seconds,
            "results": results,
        }
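Concrete evaluators subclass `Evaluator` and implement `evaluate_sample`. The sketch below shows one way to do this under stated assumptions: `ExactMatchEvaluator` and the `MyInstance` dataset type are hypothetical, not part of this module.

from dataclasses import dataclass

from genlm.eval.core.evaluator import EvaluationResult, Evaluator

@dataclass
class MyInstance:
    # Hypothetical dataset instance type; only `answer` is used below.
    question: str
    answer: str

class ExactMatchEvaluator(Evaluator[MyInstance]):
    """Scores a response 1.0 if it exactly matches the reference answer."""

    def evaluate_sample(self, instance, response):
        correct = str(response).strip() == str(instance.answer).strip()
        return EvaluationResult(
            score=1.0 if correct else 0.0,
            desc="exact match" if correct else "mismatch",
            metadata={"expected": instance.answer, "got": str(response)},
        )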

evaluate_sample(instance, response) abstractmethod

Evaluate a single response for correctness.

Parameters:

    instance (T): The dataset instance being evaluated. [required]
    response (Any): The model's response, i.e. the `response` attribute of one of the weighted responses in a `ModelOutput`. [required]

Returns:

    (EvaluationResult): The evaluation result.

Source code in genlm/eval/core/evaluator.py
@abstractmethod
def evaluate_sample(self, instance, response):
    """Evaluate a single response for correctness.

    Args:
        instance (T): The dataset instance being evaluated.
        response (Any): The model's response, which is given by the response attribute of a `ModelOutput` object.

    Returns:
        (EvaluationResult): The evaluation result.
    """
    pass  # pragma: no cover

evaluate_ensemble(instance, output)

Evaluate the complete ensemble of weighted samples using weighted accuracy: each response's score is multiplied by its weight and the products are summed (weighted_accuracy = Σᵢ wᵢ · sᵢ). When the weights sum to one, this equals the expected score under the ensemble's weight distribution.

Parameters:

    instance (T): The dataset instance being evaluated. [required]
    output (ModelOutput): The complete model output, including the ensemble of weighted responses. [required]

Returns:

    (Dict[str, Any]): Dictionary of evaluation metrics: `weighted_accuracy`, `runtime_seconds`, and a per-sample `results` list.

Source code in genlm/eval/core/evaluator.py
def evaluate_ensemble(self, instance: T, output: ModelOutput) -> Dict[str, Any]:
    """Evaluate the complete ensemble of weighted samples using weighted accuracy.

    Args:
        instance (T): The dataset instance being evaluated.
        output (ModelOutput): The complete model output including ensemble responses.

    Returns:
        (Dict[str, Any]): Dictionary containing evaluation metrics.
    """
    weighted_accuracy = 0.0
    results = []
    for response in output.responses:
        result = self.evaluate_sample(instance, response.response)
        weighted_accuracy += result.score * response.weight
        results.append(
            {
                "score": result.score,
                "desc": result.desc,
                "metadata": result.metadata,
            }
        )

    return {
        "weighted_accuracy": weighted_accuracy,
        "runtime_seconds": output.runtime_seconds,
        "results": results,
    }
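Putting it together, the sketch below drives `evaluate_ensemble` with the hypothetical `ExactMatchEvaluator` and `MyInstance` from above. The real `ModelOutput` type is defined elsewhere in the package; `SimpleNamespace` stands in for it here, mimicking only the attributes the method reads (`responses`, each item carrying `response` and `weight`, plus `runtime_seconds`):

from types import SimpleNamespace

# Stand-in for a ModelOutput with two weighted responses.
output = SimpleNamespace(
    responses=[
        SimpleNamespace(response="42", weight=0.7),
        SimpleNamespace(response="41", weight=0.3),
    ],
    runtime_seconds=1.25,
)

evaluator = ExactMatchEvaluator()
instance = MyInstance(question="What is 6 * 7?", answer="42")
metrics = evaluator.evaluate_ensemble(instance, output)
print(metrics["weighted_accuracy"])  # 0.7: only the first response matches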