debeir.core.document

  1import abc
  2import dataclasses
  3from collections import defaultdict
  4from typing import Dict, List, Union
  5
  6from debeir.utils.utils import flatten
  7
  8
  9@dataclasses.dataclass
 10class Document:
 11    """
 12    Generic Document class.
 13    Used as an interface for interacting across multiple indexes with different mappings.
 14    """
 15    doc_id: Union[int, float, str]
 16    topic_num: Union[int, str, float] = None
 17    facets: Dict = None
 18    score: Union[float, int] = 0.0  # Primay Score
 19    scores: Dict[str, Union[float, int]] = dataclasses.field(
 20        default_factory=lambda: {})  # Include other scores if needed
 21
 22    @classmethod
 23    @abc.abstractmethod
 24    def from_results(cls, results, *args, **kwargs) -> Dict[Union[int, float], 'Document']:
 25        """
 26        Produces a list of Document objects from raw results returned from the index
 27
 28        In the format {topic_num: [Document, ..., Document]}
 29        """
 30        pass
 31
 32    def get_document_id(self):
 33        """
 34        :return:
 35            self.doc_id
 36        """
 37        return self.doc_id
 38
 39    def flatten_facets(self, *args, **kwargs):
 40        """
 41        Flattens multi-level internal document facets into a single level
 42            e.g. Doc['Upper']['Lower'] -> Doc['Upper_Lower']
 43        :param args:
 44        :param kwargs:
 45        """
 46        self.facets = flatten(self.facets, *args, **kwargs)
 47
 48    @classmethod
 49    def _get_document_facet(cls, intermediate_repr, key):
 50        return intermediate_repr[key]
 51
 52    def get_document_facet(self, key, sep="_"):
 53        """
 54        Retrieve a document facet
 55        Works for multidimensional keys or single
 56        :param key: Facet to retrieve
 57        :param sep: The seperator for multidimensional key
 58        :return:
 59            Returns the document facet given the key (field)
 60        """
 61        if sep in key:
 62            keys = key.split(sep)
 63
 64            intermediate_repr = self.facets
 65            for k in keys:
 66                intermediate_repr = self._get_document_facet(intermediate_repr, k)
 67
 68            return intermediate_repr
 69
 70        return self.facets[key]
 71
 72    def set(self, doc_id=None, facets=None, score=None, facet=None, facet_value=None) -> 'Document':
 73        """
 74        Set attributes of the object. Use keyword arguments to do so. Works as a builder class.
 75        doc.set(doc_id="123").set(facets={"title": "my title"})
 76        :param doc_id:
 77        :param facets:
 78        :param score:
 79        :param facet:
 80        :param facet_value:
 81
 82        :return:
 83            Returns document object
 84        """
 85        if doc_id is not None:
 86            self.doc_id = doc_id
 87
 88        if facets is not None:
 89            self.facets = facets
 90
 91        if score is not None:
 92            self.score = score
 93
 94        if facet is not None and facet_value is not None:
 95            self.facets[facet] = facet_value
 96
 97        return self
 98
 99    def to_trec_format(self, rank, run_name) -> str:
100        """
101        Returns TREC format for the document
102        :return:
103            A trec formatted string
104        """
105
106        return f"{self.topic_num}\t" \
107               f"Q0\t" \
108               f"{self.doc_id}\t" \
109               f"{rank}\t" \
110               f"{self.score}\t" \
111               f"{run_name}\n"
112
113    @classmethod
114    def get_trec_format(cls, ranked_list: List['Document'], run_name="NO_RUN_NAME", sort=True, sorting_func=None):
115        """
116        Get the trec format of a list of ranked documents. This function is a generator.
117
118        :param ranked_list: A list of Document-type objects
119        :param run_name: Run name to print in the TREC formatted string
120        :param sort: Whether to sort the input list in descending order of score.
121        :param sorting_func: Custom sorting function will be used if provided
122        """
123
124        if sort:
125            if sorting_func:
126                ranked_list = sorting_func(ranked_list)
127            else:
128                ranked_list.sort(key=lambda doc: doc.score, reverse=True)
129
130        for rank, document in enumerate(ranked_list, start=1):
131            yield document.to_trec_format(rank, run_name)
132
133
class ElasticsearchDocument(Document):
    """
    Document built from raw Elasticsearch search responses.
    """

    @classmethod
    def from_results(cls, results, query_cls, ignore_facets=True,
                     *args, **kwargs) -> Dict[Union[int, float], List['Document']]:
        """
        Produces Document objects from raw results returned from the index

        In the format {topic_num: [Document, ..., Document]}

        :param results: Iterable of (topic_num, response) pairs, where each response
                        exposes its hits under response["hits"]["hits"]
        :param query_cls: Object providing get_id_mapping(source) to extract the document id
        :param ignore_facets: When False, every "_source" field whose name does not start
                              with "_" is copied into the document facets
        :return:
            A dictionary of topic number to the documents of that topic, in hit order.
            Each document stores its 1-based position under scores['rank'].
        """
        documents = defaultdict(list)

        for topic_num, res in results:
            for rank, result in enumerate(res["hits"]["hits"], start=1):
                source = result["_source"]
                doc_id = query_cls.get_id_mapping(source)
                facets = {}

                if not ignore_facets:
                    facets = {k: v for (k, v) in source.items() if not k.startswith("_")}

                # Build through cls so that subclasses get instances of their own type.
                document = cls(doc_id,
                               topic_num,
                               facets=facets,
                               score=float(result['_score']))
                document.scores['rank'] = rank
                documents[topic_num].append(document)

        return dict(documents)
157
158
# Maps a backend name to the Document implementation used for it.
document_factory = dict(elasticsearch=ElasticsearchDocument)
@dataclasses.dataclass
class Document:
 10@dataclasses.dataclass
 11class Document:
 12    """
 13    Generic Document class.
 14    Used as an interface for interacting across multiple indexes with different mappings.
 15    """
 16    doc_id: Union[int, float, str]
 17    topic_num: Union[int, str, float] = None
 18    facets: Dict = None
 19    score: Union[float, int] = 0.0  # Primay Score
 20    scores: Dict[str, Union[float, int]] = dataclasses.field(
 21        default_factory=lambda: {})  # Include other scores if needed
 22
 23    @classmethod
 24    @abc.abstractmethod
 25    def from_results(cls, results, *args, **kwargs) -> Dict[Union[int, float], 'Document']:
 26        """
 27        Produces a list of Document objects from raw results returned from the index
 28
 29        In the format {topic_num: [Document, ..., Document]}
 30        """
 31        pass
 32
 33    def get_document_id(self):
 34        """
 35        :return:
 36            self.doc_id
 37        """
 38        return self.doc_id
 39
 40    def flatten_facets(self, *args, **kwargs):
 41        """
 42        Flattens multi-level internal document facets into a single level
 43            e.g. Doc['Upper']['Lower'] -> Doc['Upper_Lower']
 44        :param args:
 45        :param kwargs:
 46        """
 47        self.facets = flatten(self.facets, *args, **kwargs)
 48
 49    @classmethod
 50    def _get_document_facet(cls, intermediate_repr, key):
 51        return intermediate_repr[key]
 52
 53    def get_document_facet(self, key, sep="_"):
 54        """
 55        Retrieve a document facet
 56        Works for multidimensional keys or single
 57        :param key: Facet to retrieve
 58        :param sep: The seperator for multidimensional key
 59        :return:
 60            Returns the document facet given the key (field)
 61        """
 62        if sep in key:
 63            keys = key.split(sep)
 64
 65            intermediate_repr = self.facets
 66            for k in keys:
 67                intermediate_repr = self._get_document_facet(intermediate_repr, k)
 68
 69            return intermediate_repr
 70
 71        return self.facets[key]
 72
 73    def set(self, doc_id=None, facets=None, score=None, facet=None, facet_value=None) -> 'Document':
 74        """
 75        Set attributes of the object. Use keyword arguments to do so. Works as a builder class.
 76        doc.set(doc_id="123").set(facets={"title": "my title"})
 77        :param doc_id:
 78        :param facets:
 79        :param score:
 80        :param facet:
 81        :param facet_value:
 82
 83        :return:
 84            Returns document object
 85        """
 86        if doc_id is not None:
 87            self.doc_id = doc_id
 88
 89        if facets is not None:
 90            self.facets = facets
 91
 92        if score is not None:
 93            self.score = score
 94
 95        if facet is not None and facet_value is not None:
 96            self.facets[facet] = facet_value
 97
 98        return self
 99
100    def to_trec_format(self, rank, run_name) -> str:
101        """
102        Returns TREC format for the document
103        :return:
104            A trec formatted string
105        """
106
107        return f"{self.topic_num}\t" \
108               f"Q0\t" \
109               f"{self.doc_id}\t" \
110               f"{rank}\t" \
111               f"{self.score}\t" \
112               f"{run_name}\n"
113
114    @classmethod
115    def get_trec_format(cls, ranked_list: List['Document'], run_name="NO_RUN_NAME", sort=True, sorting_func=None):
116        """
117        Get the trec format of a list of ranked documents. This function is a generator.
118
119        :param ranked_list: A list of Document-type objects
120        :param run_name: Run name to print in the TREC formatted string
121        :param sort: Whether to sort the input list in descending order of score.
122        :param sorting_func: Custom sorting function will be used if provided
123        """
124
125        if sort:
126            if sorting_func:
127                ranked_list = sorting_func(ranked_list)
128            else:
129                ranked_list.sort(key=lambda doc: doc.score, reverse=True)
130
131        for rank, document in enumerate(ranked_list, start=1):
132            yield document.to_trec_format(rank, run_name)

Generic Document class. Used as an interface for interacting across multiple indexes with different mappings.

Document( doc_id: Union[int, float, str], topic_num: Union[int, str, float] = None, facets: Dict = None, score: Union[float, int] = 0.0, scores: Dict[str, Union[float, int]] = <factory>)
@classmethod
@abc.abstractmethod
def from_results( cls, results, *args, **kwargs) -> Dict[Union[int, float], debeir.core.document.Document]:
23    @classmethod
24    @abc.abstractmethod
25    def from_results(cls, results, *args, **kwargs) -> Dict[Union[int, float], 'Document']:
26        """
27        Produces a list of Document objects from raw results returned from the index
28
29        In the format {topic_num: [Document, ..., Document]}
30        """
31        pass

Produces a mapping of topic number to a list of Document objects from raw results returned from the index

In the format {topic_num: [Document, ..., Document]}

def get_document_id(self):
33    def get_document_id(self):
34        """
35        :return:
36            self.doc_id
37        """
38        return self.doc_id
Returns
self.doc_id
def flatten_facets(self, *args, **kwargs):
40    def flatten_facets(self, *args, **kwargs):
41        """
42        Flattens multi-level internal document facets into a single level
43            e.g. Doc['Upper']['Lower'] -> Doc['Upper_Lower']
44        :param args:
45        :param kwargs:
46        """
47        self.facets = flatten(self.facets, *args, **kwargs)

Flattens multi-level internal document facets into a single level, e.g. Doc['Upper']['Lower'] -> Doc['Upper_Lower']

Parameters
  • args:
  • kwargs:
def get_document_facet(self, key, sep='_'):
53    def get_document_facet(self, key, sep="_"):
54        """
55        Retrieve a document facet
56        Works for multidimensional keys or single
57        :param key: Facet to retrieve
58        :param sep: The seperator for multidimensional key
59        :return:
60            Returns the document facet given the key (field)
61        """
62        if sep in key:
63            keys = key.split(sep)
64
65            intermediate_repr = self.facets
66            for k in keys:
67                intermediate_repr = self._get_document_facet(intermediate_repr, k)
68
69            return intermediate_repr
70
71        return self.facets[key]

Retrieve a document facet. Works for both single and multidimensional keys.

Parameters
  • key: Facet to retrieve
  • sep: The separator for multidimensional keys
Returns
Returns the document facet given the key (field)
def set( self, doc_id=None, facets=None, score=None, facet=None, facet_value=None) -> debeir.core.document.Document:
73    def set(self, doc_id=None, facets=None, score=None, facet=None, facet_value=None) -> 'Document':
74        """
75        Set attributes of the object. Use keyword arguments to do so. Works as a builder class.
76        doc.set(doc_id="123").set(facets={"title": "my title"})
77        :param doc_id:
78        :param facets:
79        :param score:
80        :param facet:
81        :param facet_value:
82
83        :return:
84            Returns document object
85        """
86        if doc_id is not None:
87            self.doc_id = doc_id
88
89        if facets is not None:
90            self.facets = facets
91
92        if score is not None:
93            self.score = score
94
95        if facet is not None and facet_value is not None:
96            self.facets[facet] = facet_value
97
98        return self

Set attributes of the object. Use keyword arguments to do so. Works as a builder class, e.g. doc.set(doc_id="123").set(facets={"title": "my title"})

Parameters
  • doc_id:
  • facets:
  • score:
  • facet:
  • facet_value:
Returns
Returns document object
def to_trec_format(self, rank, run_name) -> str:
100    def to_trec_format(self, rank, run_name) -> str:
101        """
102        Returns TREC format for the document
103        :return:
104            A trec formatted string
105        """
106
107        return f"{self.topic_num}\t" \
108               f"Q0\t" \
109               f"{self.doc_id}\t" \
110               f"{rank}\t" \
111               f"{self.score}\t" \
112               f"{run_name}\n"

Returns TREC format for the document

Returns
A trec formatted string
@classmethod
def get_trec_format( cls, ranked_list: List[debeir.core.document.Document], run_name='NO_RUN_NAME', sort=True, sorting_func=None):
114    @classmethod
115    def get_trec_format(cls, ranked_list: List['Document'], run_name="NO_RUN_NAME", sort=True, sorting_func=None):
116        """
117        Get the trec format of a list of ranked documents. This function is a generator.
118
119        :param ranked_list: A list of Document-type objects
120        :param run_name: Run name to print in the TREC formatted string
121        :param sort: Whether to sort the input list in descending order of score.
122        :param sorting_func: Custom sorting function will be used if provided
123        """
124
125        if sort:
126            if sorting_func:
127                ranked_list = sorting_func(ranked_list)
128            else:
129                ranked_list.sort(key=lambda doc: doc.score, reverse=True)
130
131        for rank, document in enumerate(ranked_list, start=1):
132            yield document.to_trec_format(rank, run_name)

Get the trec format of a list of ranked documents. This function is a generator.

Parameters
  • ranked_list: A list of Document-type objects
  • run_name: Run name to print in the TREC formatted string
  • sort: Whether to sort the input list in descending order of score.
  • sorting_func: Custom sorting function will be used if provided
class ElasticsearchDocument(Document):
class ElasticsearchDocument(Document):
    """
    Document built from raw Elasticsearch search responses.
    """

    @classmethod
    def from_results(cls, results, query_cls, ignore_facets=True,
                     *args, **kwargs) -> Dict[Union[int, float], List['Document']]:
        """
        Produces Document objects from raw results returned from the index

        In the format {topic_num: [Document, ..., Document]}

        :param results: Iterable of (topic_num, response) pairs, where each response
                        exposes its hits under response["hits"]["hits"]
        :param query_cls: Object providing get_id_mapping(source) to extract the document id
        :param ignore_facets: When False, every "_source" field whose name does not start
                              with "_" is copied into the document facets
        :return:
            A dictionary of topic number to the documents of that topic, in hit order.
            Each document stores its 1-based position under scores['rank'].
        """
        documents = defaultdict(list)

        for topic_num, res in results:
            for rank, result in enumerate(res["hits"]["hits"], start=1):
                source = result["_source"]
                doc_id = query_cls.get_id_mapping(source)
                facets = {}

                if not ignore_facets:
                    facets = {k: v for (k, v) in source.items() if not k.startswith("_")}

                # Build through cls so that subclasses get instances of their own type.
                document = cls(doc_id,
                               topic_num,
                               facets=facets,
                               score=float(result['_score']))
                document.scores['rank'] = rank
                documents[topic_num].append(document)

        return dict(documents)

Elasticsearch implementation of the generic Document class, which is used as an interface for interacting across multiple indexes with different mappings.

@classmethod
def from_results( cls, results, query_cls, ignore_facets=True, *args, **kwargs) -> Dict[Union[int, float], debeir.core.document.Document]:
136    @classmethod
137    def from_results(cls, results, query_cls, ignore_facets=True,
138                     *args, **kwargs) -> Dict[Union[int, float], 'Document']:
139
140        documents = defaultdict(lambda: [])
141
142        for (topic_num, res) in results:
143            for rank, result in enumerate(res["hits"]["hits"], start=1):
144                doc_id = query_cls.get_id_mapping(result["_source"])
145                facets = {}
146
147                if not ignore_facets:
148                    facets = {k: v for (k, v) in result['_source'].items() if not k.startswith("_")}
149
150                documents[topic_num].append(ElasticsearchDocument(doc_id,
151                                                                  topic_num,
152                                                                  facets=facets,
153                                                                  score=float(result['_score'])))
154
155                documents[topic_num][-1].scores['rank'] = rank
156
157        return dict(documents)

Produces a mapping of topic number to a list of Document objects from raw results returned from the index

In the format {topic_num: [Document, ..., Document]}