debeir.core.document
import abc
import dataclasses
from collections import defaultdict
from typing import Dict, List, Union

from debeir.utils.utils import flatten


@dataclasses.dataclass
class Document:
    """
    Generic Document class.
    Used as an interface for interacting across multiple indexes with different mappings.

    Note: the class does not derive from abc.ABC, so the abstract marker on
    from_results is not enforced and Document itself can be instantiated.
    """
    doc_id: Union[int, float, str]
    topic_num: Union[int, str, float, None] = None
    facets: Union[Dict, None] = None
    score: Union[float, int] = 0.0  # Primary score
    scores: Dict[str, Union[float, int]] = dataclasses.field(
        default_factory=dict)  # Include other scores if needed

    @classmethod
    @abc.abstractmethod
    def from_results(cls, results, *args, **kwargs) -> Dict[Union[int, float], List['Document']]:
        """
        Produces Document objects from raw results returned from the index

        In the format {topic_num: [Document, ..., Document]}
        """
        pass

    def get_document_id(self):
        """
        :return:
            self.doc_id
        """
        return self.doc_id

    def flatten_facets(self, *args, **kwargs):
        """
        Flattens multi-level internal document facets into a single level
            e.g. Doc['Upper']['Lower'] -> Doc['Upper_Lower']
        :param args: Positional arguments forwarded to flatten
        :param kwargs: Keyword arguments forwarded to flatten
        """
        self.facets = flatten(self.facets, *args, **kwargs)

    @classmethod
    def _get_document_facet(cls, intermediate_repr, key):
        return intermediate_repr[key]

    def get_document_facet(self, key, sep="_"):
        """
        Retrieve a document facet
        Works for multidimensional keys or single
        :param key: Facet to retrieve
        :param sep: The separator for a multidimensional key
        :return:
            Returns the document facet given the key (field)
        :raises KeyError: If the key cannot be resolved either as a nested path or as a flat key
        """
        if sep in key:
            try:
                intermediate_repr = self.facets
                for k in key.split(sep):
                    intermediate_repr = self._get_document_facet(intermediate_repr, k)
                return intermediate_repr
            except (KeyError, TypeError, IndexError):
                # The key may be a flat facet name that merely contains the
                # separator (e.g. "doc_title"); only then fall back to it.
                if not (isinstance(self.facets, dict) and key in self.facets):
                    raise

        return self.facets[key]

    def set(self, doc_id=None, facets=None, score=None, facet=None, facet_value=None) -> 'Document':
        """
        Set attributes of the object. Use keyword arguments to do so. Works as a builder class.
        doc.set(doc_id="123").set(facets={"title": "my title"})
        :param doc_id: New document id (ignored when None)
        :param facets: New facets mapping, replaces the current one (ignored when None)
        :param score: New primary score (ignored when None)
        :param facet: Name of a single facet to set; needs facet_value as well
        :param facet_value: Value stored under facet; needs facet as well

        :return:
            Returns document object
        """
        if doc_id is not None:
            self.doc_id = doc_id

        if facets is not None:
            self.facets = facets

        if score is not None:
            self.score = score

        if facet is not None and facet_value is not None:
            # facets defaults to None, so create the mapping on first use
            if self.facets is None:
                self.facets = {}
            self.facets[facet] = facet_value

        return self

    def to_trec_format(self, rank, run_name) -> str:
        """
        Returns TREC format for the document
        :param rank: Rank written into the line
        :param run_name: Run name written into the line
        :return:
            A trec formatted string
        """

        return f"{self.topic_num}\t" \
               f"Q0\t" \
               f"{self.doc_id}\t" \
               f"{rank}\t" \
               f"{self.score}\t" \
               f"{run_name}\n"

    @classmethod
    def get_trec_format(cls, ranked_list: List['Document'], run_name="NO_RUN_NAME", sort=True, sorting_func=None):
        """
        Get the trec format of a list of ranked documents. This function is a generator.

        :param ranked_list: A list of Document-type objects
        :param run_name: Run name to print in the TREC formatted string
        :param sort: Whether to sort the documents in descending order of score.
        :param sorting_func: Custom sorting function will be used if provided
        """

        if sort:
            if sorting_func:
                ranked_list = sorting_func(ranked_list)
            else:
                # sorted() builds a new list, leaving the caller's list untouched
                ranked_list = sorted(ranked_list, key=lambda doc: doc.score, reverse=True)

        for rank, document in enumerate(ranked_list, start=1):
            yield document.to_trec_format(rank, run_name)


class ElasticsearchDocument(Document):
    """Document built from search responses shaped like {"hits": {"hits": [...]}}."""

    @classmethod
    def from_results(cls, results, query_cls, ignore_facets=True,
                     *args, **kwargs) -> Dict[Union[int, float], List['Document']]:
        """
        Produces Document objects from raw results returned from the index

        In the format {topic_num: [Document, ..., Document]}

        :param results: Iterable of (topic_num, response) pairs; each response is read at ["hits"]["hits"]
        :param query_cls: Object whose get_id_mapping(source) returns the document id
        :param ignore_facets: When False, the "_source" fields not starting with "_" are kept as facets
        :return:
            Mapping of topic number to documents; the 1-based hit position is stored in scores['rank']
        """
        documents = defaultdict(list)

        for topic_num, res in results:
            for rank, result in enumerate(res["hits"]["hits"], start=1):
                source = result["_source"]
                doc_id = query_cls.get_id_mapping(source)
                facets = {}

                if not ignore_facets:
                    facets = {k: v for (k, v) in source.items() if not k.startswith("_")}

                # cls (not a hard-coded class) so subclasses produce their own type
                document = cls(doc_id,
                               topic_num,
                               facets=facets,
                               score=float(result['_score']))
                document.scores['rank'] = rank
                documents[topic_num].append(document)

        return dict(documents)


# Maps a backend name to the Document subclass used for it
document_factory = {
    "elasticsearch": ElasticsearchDocument
}
@dataclasses.dataclass
class Document:
    """
    Generic Document class.
    Used as an interface for interacting across multiple indexes with different mappings.

    Note: the class does not derive from abc.ABC, so the abstract marker on
    from_results is not enforced and Document itself can be instantiated.
    """
    doc_id: Union[int, float, str]
    topic_num: Union[int, str, float, None] = None
    facets: Union[Dict, None] = None
    score: Union[float, int] = 0.0  # Primary score
    scores: Dict[str, Union[float, int]] = dataclasses.field(
        default_factory=dict)  # Include other scores if needed

    @classmethod
    @abc.abstractmethod
    def from_results(cls, results, *args, **kwargs) -> Dict[Union[int, float], List['Document']]:
        """
        Produces Document objects from raw results returned from the index

        In the format {topic_num: [Document, ..., Document]}
        """
        pass

    def get_document_id(self):
        """
        :return:
            self.doc_id
        """
        return self.doc_id

    def flatten_facets(self, *args, **kwargs):
        """
        Flattens multi-level internal document facets into a single level
            e.g. Doc['Upper']['Lower'] -> Doc['Upper_Lower']
        :param args: Positional arguments forwarded to flatten
        :param kwargs: Keyword arguments forwarded to flatten
        """
        self.facets = flatten(self.facets, *args, **kwargs)

    @classmethod
    def _get_document_facet(cls, intermediate_repr, key):
        return intermediate_repr[key]

    def get_document_facet(self, key, sep="_"):
        """
        Retrieve a document facet
        Works for multidimensional keys or single
        :param key: Facet to retrieve
        :param sep: The separator for a multidimensional key
        :return:
            Returns the document facet given the key (field)
        :raises KeyError: If the key cannot be resolved either as a nested path or as a flat key
        """
        if sep in key:
            try:
                intermediate_repr = self.facets
                for k in key.split(sep):
                    intermediate_repr = self._get_document_facet(intermediate_repr, k)
                return intermediate_repr
            except (KeyError, TypeError, IndexError):
                # The key may be a flat facet name that merely contains the
                # separator (e.g. "doc_title"); only then fall back to it.
                if not (isinstance(self.facets, dict) and key in self.facets):
                    raise

        return self.facets[key]

    def set(self, doc_id=None, facets=None, score=None, facet=None, facet_value=None) -> 'Document':
        """
        Set attributes of the object. Use keyword arguments to do so. Works as a builder class.
        doc.set(doc_id="123").set(facets={"title": "my title"})
        :param doc_id: New document id (ignored when None)
        :param facets: New facets mapping, replaces the current one (ignored when None)
        :param score: New primary score (ignored when None)
        :param facet: Name of a single facet to set; needs facet_value as well
        :param facet_value: Value stored under facet; needs facet as well

        :return:
            Returns document object
        """
        if doc_id is not None:
            self.doc_id = doc_id

        if facets is not None:
            self.facets = facets

        if score is not None:
            self.score = score

        if facet is not None and facet_value is not None:
            # facets defaults to None, so create the mapping on first use
            if self.facets is None:
                self.facets = {}
            self.facets[facet] = facet_value

        return self

    def to_trec_format(self, rank, run_name) -> str:
        """
        Returns TREC format for the document
        :param rank: Rank written into the line
        :param run_name: Run name written into the line
        :return:
            A trec formatted string
        """

        return f"{self.topic_num}\t" \
               f"Q0\t" \
               f"{self.doc_id}\t" \
               f"{rank}\t" \
               f"{self.score}\t" \
               f"{run_name}\n"

    @classmethod
    def get_trec_format(cls, ranked_list: List['Document'], run_name="NO_RUN_NAME", sort=True, sorting_func=None):
        """
        Get the trec format of a list of ranked documents. This function is a generator.

        :param ranked_list: A list of Document-type objects
        :param run_name: Run name to print in the TREC formatted string
        :param sort: Whether to sort the documents in descending order of score.
        :param sorting_func: Custom sorting function will be used if provided
        """

        if sort:
            if sorting_func:
                ranked_list = sorting_func(ranked_list)
            else:
                # sorted() builds a new list, leaving the caller's list untouched
                ranked_list = sorted(ranked_list, key=lambda doc: doc.score, reverse=True)

        for rank, document in enumerate(ranked_list, start=1):
            yield document.to_trec_format(rank, run_name)
Generic Document class. Used as an interface for interacting across multiple indexes with different mappings.
23 @classmethod 24 @abc.abstractmethod 25 def from_results(cls, results, *args, **kwargs) -> Dict[Union[int, float], 'Document']: 26 """ 27 Produces a list of Document objects from raw results returned from the index 28 29 In the format {topic_num: [Document, ..., Document]} 30 """ 31 pass
Produces Document objects from the raw results returned from the index, grouped by topic number.
In the format {topic_num: [Document, ..., Document]}
def flatten_facets(self, *args, **kwargs):
    """
    Collapse multi-level document facets into a single level,
    e.g. Doc['Upper']['Lower'] -> Doc['Upper_Lower'].

    The current facets are handed to flatten and replaced by its result.
    :param args: Positional arguments forwarded to flatten
    :param kwargs: Keyword arguments forwarded to flatten
    """
    flattened = flatten(self.facets, *args, **kwargs)
    self.facets = flattened
Flattens multi-level internal document facets into a single level, e.g. Doc['Upper']['Lower'] -> Doc['Upper_Lower'].
Parameters
- args:
- kwargs:
53 def get_document_facet(self, key, sep="_"): 54 """ 55 Retrieve a document facet 56 Works for multidimensional keys or single 57 :param key: Facet to retrieve 58 :param sep: The seperator for multidimensional key 59 :return: 60 Returns the document facet given the key (field) 61 """ 62 if sep in key: 63 keys = key.split(sep) 64 65 intermediate_repr = self.facets 66 for k in keys: 67 intermediate_repr = self._get_document_facet(intermediate_repr, k) 68 69 return intermediate_repr 70 71 return self.facets[key]
Retrieve a document facet. Works for multidimensional or single keys.
Parameters
- key: Facet to retrieve
- sep: The separator for a multidimensional key
Returns
Returns the document facet given the key (field)
73 def set(self, doc_id=None, facets=None, score=None, facet=None, facet_value=None) -> 'Document': 74 """ 75 Set attributes of the object. Use keyword arguments to do so. Works as a builder class. 76 doc.set(doc_id="123").set(facets={"title": "my title"}) 77 :param doc_id: 78 :param facets: 79 :param score: 80 :param facet: 81 :param facet_value: 82 83 :return: 84 Returns document object 85 """ 86 if doc_id is not None: 87 self.doc_id = doc_id 88 89 if facets is not None: 90 self.facets = facets 91 92 if score is not None: 93 self.score = score 94 95 if facet is not None and facet_value is not None: 96 self.facets[facet] = facet_value 97 98 return self
Set attributes of the object. Use keyword arguments to do so. Works as a builder class, e.g. doc.set(doc_id="123").set(facets={"title": "my title"})
Parameters
- doc_id:
- facets:
- score:
- facet:
- facet_value:
Returns
Returns document object
100 def to_trec_format(self, rank, run_name) -> str: 101 """ 102 Returns TREC format for the document 103 :return: 104 A trec formatted string 105 """ 106 107 return f"{self.topic_num}\t" \ 108 f"Q0\t" \ 109 f"{self.doc_id}\t" \ 110 f"{rank}\t" \ 111 f"{self.score}\t" \ 112 f"{run_name}\n"
Returns TREC format for the document
Returns
A TREC-formatted string
114 @classmethod 115 def get_trec_format(cls, ranked_list: List['Document'], run_name="NO_RUN_NAME", sort=True, sorting_func=None): 116 """ 117 Get the trec format of a list of ranked documents. This function is a generator. 118 119 :param ranked_list: A list of Document-type objects 120 :param run_name: Run name to print in the TREC formatted string 121 :param sort: Whether to sort the input list in descending order of score. 122 :param sorting_func: Custom sorting function will be used if provided 123 """ 124 125 if sort: 126 if sorting_func: 127 ranked_list = sorting_func(ranked_list) 128 else: 129 ranked_list.sort(key=lambda doc: doc.score, reverse=True) 130 131 for rank, document in enumerate(ranked_list, start=1): 132 yield document.to_trec_format(rank, run_name)
Get the trec format of a list of ranked documents. This function is a generator.
Parameters
- ranked_list: A list of Document-type objects
- run_name: Run name to print in the TREC formatted string
- sort: Whether to sort the input list in descending order of score.
- sorting_func: Custom sorting function will be used if provided
class ElasticsearchDocument(Document):
    """Document built from search responses shaped like {"hits": {"hits": [...]}}."""

    @classmethod
    def from_results(cls, results, query_cls, ignore_facets=True,
                     *args, **kwargs) -> Dict[Union[int, float], List['Document']]:
        """
        Produces Document objects from raw results returned from the index

        In the format {topic_num: [Document, ..., Document]}

        :param results: Iterable of (topic_num, response) pairs; each response is read at ["hits"]["hits"]
        :param query_cls: Object whose get_id_mapping(source) returns the document id
        :param ignore_facets: When False, the "_source" fields not starting with "_" are kept as facets
        :return:
            Mapping of topic number to documents; the 1-based hit position is stored in scores['rank']
        """
        documents = defaultdict(list)

        for topic_num, res in results:
            for rank, result in enumerate(res["hits"]["hits"], start=1):
                source = result["_source"]
                doc_id = query_cls.get_id_mapping(source)
                facets = {}

                if not ignore_facets:
                    facets = {k: v for (k, v) in source.items() if not k.startswith("_")}

                # cls (not a hard-coded class) so subclasses produce their own type
                document = cls(doc_id,
                               topic_num,
                               facets=facets,
                               score=float(result['_score']))
                document.scores['rank'] = rank
                documents[topic_num].append(document)

        return dict(documents)
Generic Document class. Used as an interface for interacting across multiple indexes with different mappings.
@classmethod
def from_results(cls, results, query_cls, ignore_facets=True,
                 *args, **kwargs) -> Dict[Union[int, float], List['Document']]:
    """
    Produces Document objects from raw results returned from the index

    In the format {topic_num: [Document, ..., Document]}

    :param results: Iterable of (topic_num, response) pairs; each response is read at ["hits"]["hits"]
    :param query_cls: Object whose get_id_mapping(source) returns the document id
    :param ignore_facets: When False, the "_source" fields not starting with "_" are kept as facets
    :return:
        Mapping of topic number to documents; the 1-based hit position is stored in scores['rank']
    """
    documents = defaultdict(list)

    for topic_num, res in results:
        for rank, result in enumerate(res["hits"]["hits"], start=1):
            source = result["_source"]
            doc_id = query_cls.get_id_mapping(source)
            facets = {}

            if not ignore_facets:
                facets = {k: v for (k, v) in source.items() if not k.startswith("_")}

            # cls (not a hard-coded class) so subclasses produce their own type
            document = cls(doc_id,
                           topic_num,
                           facets=facets,
                           score=float(result['_score']))
            document.scores['rank'] = rank
            documents[topic_num].append(document)

    return dict(documents)
Produces Document objects from the raw results returned from the index, grouped by topic number.
In the format {topic_num: [Document, ..., Document]}