Evaluator

gfmrag.evaluation

BaseEvaluator

Bases: ABC

Base evaluator class for evaluation tasks.

This abstract base class provides a foundation for implementing evaluators that assess model predictions. It handles loading prediction data from a JSON lines file where each line contains a single JSON object.

Parameters:

prediction_file (str, required)
    Path to the JSON lines prediction file to evaluate. Each line should contain a valid JSON object.

Attributes:

data (List[dict])
    List of prediction data loaded from the JSON lines file.

Examples:

Python Console Session
>>> evaluator = MyEvaluator("predictions.jsonl")
>>> results = evaluator.evaluate()

Note

Subclasses must implement the evaluate() method to define evaluation logic.

Source code in gfmrag/evaluation/base_evaluator.py
Python
class BaseEvaluator(ABC):
    """Base evaluator class for evaluation tasks.

    This abstract base class provides a foundation for implementing evaluators
    that assess model predictions. It handles loading prediction data from a JSON
    lines file where each line contains a single JSON object.

    Args:
        prediction_file (str): Path to the JSON lines prediction file to evaluate.
            Each line should contain a valid JSON object.

    Attributes:
        data (List[dict]): List of prediction data loaded from the JSON lines file.

    Examples:
        >>> evaluator = MyEvaluator("predictions.jsonl")
        >>> results = evaluator.evaluate()

    Note:
        Subclasses must implement the `evaluate()` method to define evaluation logic.
    """

    def __init__(self, prediction_file: str) -> None:
        super().__init__()
        with open(prediction_file) as f:
            self.data = [json.loads(line) for line in f]

    @abstractmethod
    def evaluate(self) -> dict:
        pass
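
The abstract method above only defines the contract; the snippet below is a minimal sketch of a concrete subclass. The ExactMatchEvaluator name and the "response"/"answer" keys in each prediction record are illustrative assumptions, not part of the library.

Python
class ExactMatchEvaluator(BaseEvaluator):
    """Hypothetical subclass: counts exact string matches (sketch only)."""

    def evaluate(self) -> dict:
        # self.data is populated by BaseEvaluator.__init__ from the JSON lines file.
        correct = sum(
            1
            for pred in self.data
            if pred["response"].strip() == pred["answer"].strip()
        )
        return {"em": correct / len(self.data) if self.data else 0.0}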

HotpotQAEvaluator

Bases: BaseEvaluator

Evaluator for HotpotQA predictions. Extracts the answer following the "Answer:" marker in each response (falling back to the full response) and reports exact match, F1, precision, and recall averaged over all predictions.

Source code in gfmrag/evaluation/hotpot_qa_evaluator.py
Python
class HotpotQAEvaluator(BaseEvaluator):
    """
    HotpotQAEvaluator
    """

    def evaluate(self) -> dict:
        metrics = {"em": 0.0, "f1": 0.0, "precision": 0.0, "recall": 0.0}

        for pred in self.data:
            if "Answer: " in pred["response"]:
                pre_ans = pred["response"].split("Answer:")[1].strip()
            else:
                pre_ans = pred["response"]
            em, f1, prec, recall = update_answer(metrics, pre_ans, pred["answer"])

        n = len(self.data)
        for k in metrics.keys():
            metrics[k] /= n
        return metrics
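
A minimal usage sketch, assuming each line of the prediction file is a JSON object with at least the "response" and "answer" fields that evaluate() reads; the file name below is illustrative.

Python
# One prediction record per line, e.g.:
# {"response": "... Answer: yes", "answer": "yes"}
evaluator = HotpotQAEvaluator("hotpotqa_predictions.jsonl")
metrics = evaluator.evaluate()
print(metrics)  # averaged {"em", "f1", "precision", "recall"} over all records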

MusiqueEvaluator

Bases: BaseEvaluator

Evaluator for MuSiQue predictions. Extracts the answer following the "Answer:" marker in each response, scores it against the gold answer and its aliases (taking the best match), and reports exact match, F1, precision, and recall averaged over all predictions.

Source code in gfmrag/evaluation/musique_evaluator.py
Python
class MusiqueEvaluator(BaseEvaluator):
    """
    MusiqueEvaluator
    """

    def evaluate(self) -> dict:
        metrics = {"em": 0.0, "f1": 0.0, "precision": 0.0, "recall": 0.0}

        for pred in self.data:
            if "Answer: " in pred["response"]:
                pre_ans = pred["response"].split("Answer:")[1].strip()
            else:
                pre_ans = pred["response"]
            gold_answers = [pred["answer"]] + pred["answer_aliases"]
            em = metric_max_over_ground_truths(compute_exact, pre_ans, gold_answers)
            (
                f1,
                precision,
                recall,
            ) = metric_max_f1_over_ground_truths(compute_f1, pre_ans, gold_answers)
            metrics["em"] += float(em)
            metrics["f1"] += f1
            metrics["precision"] += precision
            metrics["recall"] += recall

        n = len(self.data)
        for k in metrics.keys():
            metrics[k] /= n
        return metrics
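
The metric_max_over_ground_truths and metric_max_f1_over_ground_truths helpers are imported by the module and their implementations are not shown here. The sketch below illustrates the max-over-ground-truths aggregation they are assumed to perform, with compute_f1 assumed to return an (f1, precision, recall) tuple, as the call site above suggests.

Python
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    # Assumed behaviour (sketch): score against every gold answer/alias, keep the best.
    return max(metric_fn(prediction, gt) for gt in ground_truths)

def metric_max_f1_over_ground_truths(metric_fn, prediction, ground_truths):
    # Assumed behaviour (sketch): metric_fn returns (f1, precision, recall); rank by F1.
    return max(
        (metric_fn(prediction, gt) for gt in ground_truths),
        key=lambda scores: scores[0],
    )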

TwoWikiQAEvaluator

Bases: BaseEvaluator

Evaluator for 2WikiMultiHopQA predictions. Extracts the answer following the "Answer:" marker in each response, scores it against the gold answer and its aliases, and reports exact match, F1, precision, and recall averaged over all predictions.

Source code in gfmrag/evaluation/two_wiki_qa_evaluator.py
Python
class TwoWikiQAEvaluator(BaseEvaluator):
    """
    TwoWikiQAEvaluator
    """

    def evaluate(self) -> dict:
        metrics = {"em": 0.0, "f1": 0.0, "precision": 0.0, "recall": 0.0}

        for pred in self.data:
            if "Answer: " in pred["response"]:
                pre_ans = pred["response"].split("Answer:")[1].strip()
            else:
                pre_ans = pred["response"]
            gold_answers = [pred["answer"]] + pred["answer_aliases"]
            em, f1, prec, recall = update_answer(metrics, pre_ans, gold_answers)

        n = len(self.data)
        for k in metrics.keys():
            metrics[k] /= n
        return metrics
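
Both HotpotQAEvaluator and TwoWikiQAEvaluator delegate per-example scoring to an update_answer helper that accumulates the running totals in the metrics dict in place and also returns the per-example scores. The sketch below shows the contract implied by those call sites; the whitespace tokenisation and minimal normalisation are assumptions, not the library's exact implementation.

Python
from collections import Counter

def update_answer(metrics: dict, prediction: str, gold) -> tuple:
    # Sketch of the contract implied by the call sites above. `gold` may be a
    # single string (HotpotQA) or a list of acceptable strings (answer + aliases).
    golds = gold if isinstance(gold, list) else [gold]
    best = (0.0, 0.0, 0.0, 0.0)  # (em, f1, precision, recall)
    for g in golds:
        em = float(prediction.strip().lower() == g.strip().lower())
        pred_tokens = prediction.lower().split()
        gold_tokens = g.lower().split()
        common = Counter(pred_tokens) & Counter(gold_tokens)
        overlap = sum(common.values())
        precision = overlap / len(pred_tokens) if pred_tokens else 0.0
        recall = overlap / len(gold_tokens) if gold_tokens else 0.0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
        best = max(best, (em, f1, precision, recall), key=lambda s: s[1])
    em, f1, precision, recall = best
    # Accumulate running totals in place, then return the per-example scores.
    metrics["em"] += em
    metrics["f1"] += f1
    metrics["precision"] += precision
    metrics["recall"] += recall
    return em, f1, precision, recall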