debeir.evaluation.evaluator

from collections import defaultdict
from typing import Dict, List, Union

import loguru
from analysis_tools_ir import evaluate, sigtests
from debeir.core.config import GenericConfig, MetricsConfig


class Evaluator:
    """
    Evaluation class for computing metrics from TREC-style files
    """

    def __init__(self, qrels: str, metrics: List[str]):
        self.qrels = qrels
        self.metrics = []
        self.depths = []

        try:
            self._validate_and_setup_metrics(metrics)
        except AssertionError:
            raise ValueError("Metrics must be of the form metric@depth")

    def _validate_and_setup_metrics(self, metrics):
        # Split each "metric@depth" string into a metric name and an integer depth.
        for metric in metrics:
            assert "@" in metric
            try:
                metric, depth = metric.split("@")
            except ValueError:
                raise RuntimeError(f"Unable to parse metric {metric}")

            assert metric.isalpha()
            assert depth.isdigit()

            self.metrics.append(metric)
            self.depths.append(int(depth))

    def evaluate_runs(self, res: Union[str, List[str]], **kwargs):
        """
        Evaluates the TREC-style results from an input result list or file

        :param res: Results file path or raw results list
        :param kwargs: Keyword arguments to pass to the underlying analysis_tools_ir.evaluate.parse_run function
        :return: Nested dictionary of parsed results, keyed by metric name and then depth
        """
        results = defaultdict(lambda: {})
        for metric, depth in zip(self.metrics, self.depths):
            results[metric][depth] = evaluate.parse_run(
                res, self.qrels,
                metric=metric, depth=depth,
                **kwargs
            )

        return results

    def average_all_metrics(self, runs: Dict, logger: loguru.logger):
        """
        Averages the per-topic metric scores into a single averaged score.

        :param runs: Parsed run dictionary: {metric_name: {depth: Run object}}
        :param logger: Logger to print metrics
        """
        for metric, depth in zip(self.metrics, self.depths):
            run = runs[metric][depth].run
            logger.info(f"{metric}@{depth} Average: {sum(run.values()) / len(run):.4}")

    def sigtests(self, results_a, results_b):
        """
        Run a paired significance test on two result files

        :param results_a: First results file to compare
        :param results_b: Second results file to compare
        :return: Paired t-test results against the stored qrels
        """
        return sigtests.paired.paired_t_test(results_a, results_b, self.qrels)

    @classmethod
    def build_from_config(cls, config: GenericConfig, metrics_config: MetricsConfig):
        # Build an Evaluator from the qrels path and metric list held in the configuration objects.
        return cls(config.qrels, metrics_config.metrics)
class Evaluator:

Evaluation class for computing metrics from TREC-style files

Evaluator(qrels: str, metrics: List[str])
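A minimal usage sketch: the constructor stores the path to a TREC-style qrels file and validates each metric string, which must have the form metric@depth; anything else raises a ValueError. The qrels path below is a hypothetical placeholder.

from debeir.evaluation.evaluator import Evaluator

# Each metric must look like "name@depth", e.g. "ndcg@10"; the name must be
# alphabetic and the depth an integer, otherwise construction fails.
evaluator = Evaluator(
    qrels="data/qrels.txt",            # hypothetical path to a TREC-style qrels file
    metrics=["ndcg@10", "bpref@100"],
)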
def evaluate_runs(self, res: Union[str, List[str]], **kwargs):

Evaluates the TREC-style results from an input result list or file

Parameters
  • res: Results file path or raw results list
  • kwargs: Keyword arguments to pass to the underlying analysis_tools_ir.evaluate.parse_run function
Returns
  A nested dictionary of parsed results, keyed by metric name and then depth
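A short sketch of a typical call, assuming the hypothetical file paths below; any extra keyword arguments are forwarded unchanged to analysis_tools_ir.

from debeir.evaluation.evaluator import Evaluator

evaluator = Evaluator("data/qrels.txt", ["ndcg@10", "bpref@100"])   # hypothetical qrels path

# res may be a path to a TREC-style run file or a raw results list.
results = evaluator.evaluate_runs("runs/bm25_run.txt")              # hypothetical run file

# The returned dictionary is keyed by metric name and then by depth.
ndcg_result = results["ndcg"][10]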
def average_all_metrics(self, runs: Dict, logger: loguru.logger):

Averages the per-topic metric scores into a single averaged score.

Parameters
  • runs: Parsed run dictionary: {metric_name: {depth: Run object}}
  • logger: Logger to print metrics
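A sketch of logging the averaged scores with loguru's default logger, reusing the hypothetical paths from the previous example; one line is logged per metric@depth pair.

from loguru import logger

from debeir.evaluation.evaluator import Evaluator

evaluator = Evaluator("data/qrels.txt", ["ndcg@10", "bpref@100"])   # hypothetical qrels path
results = evaluator.evaluate_runs("runs/bm25_run.txt")              # hypothetical run file

# Logs one averaged score per metric, e.g. "ndcg@10 Average: ..."
evaluator.average_all_metrics(results, logger)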
def sigtests(self, results_a, results_b):

Run a paired significance test on two result files

Parameters
  • results_a: First results file to compare
  • results_b: Second results file to compare
Returns
  The paired t-test result computed against the evaluator's qrels
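A sketch of comparing a baseline run against a second run; both run-file paths are hypothetical. The call delegates to analysis_tools_ir's paired t-test using the evaluator's qrels.

from debeir.evaluation.evaluator import Evaluator

evaluator = Evaluator("data/qrels.txt", ["ndcg@10"])   # hypothetical qrels path

# Paired t-test between two result files judged against the same qrels.
stats = evaluator.sigtests("runs/bm25_run.txt", "runs/reranked_run.txt")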
@classmethod
def build_from_config(cls, config: debeir.core.config.GenericConfig, metrics_config: debeir.core.config.MetricsConfig):
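build_from_config forwards config.qrels and metrics_config.metrics to the constructor. A minimal sketch, assuming GenericConfig and MetricsConfig instances have already been constructed elsewhere in the application; how they are built is outside the scope of this page.

from debeir.evaluation.evaluator import Evaluator

# config (GenericConfig) and metrics_config (MetricsConfig) are assumed to exist already;
# only their qrels and metrics attributes are read here.
evaluator = Evaluator.build_from_config(config, metrics_config)

# Equivalent to calling the constructor directly:
# evaluator = Evaluator(config.qrels, metrics_config.metrics)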