debeir.core.parser

  1import abc
  2import csv
  3import dataclasses
  4import json
  5from collections import defaultdict
  6from dataclasses import dataclass
  7from typing import Dict, List
  8from xml.etree import ElementTree as ET
  9
 10import dill
 11import pandas as pd
 12
 13
 14# TODO: Parse fields can come from a config or ID_fields
 15# TODO: move _get_topics to private cls method with arguments, and expose get_topics as an instance method.
 16
 17
 18@dataclass(init=True)
 19class Parser:
 20    """
 21    Parser interface
 22    """
 23
 24    id_field: object
 25    parse_fields: List[str]
 26
 27    @classmethod
 28    def normalize(cls, input_dict) -> Dict:
 29        """
 30        Flatten the dictionary, i.e. from Dict[int, Dict] -> Dict[str, str_or_int]
 31
 32        :param input_dict:
 33        :return:
 34        """
 35        return pd.io.json.json_normalize(input_dict,
 36                                         sep=".").to_dict(orient='records')[0]
 37
 38    def get_topics(self, path, *args, **kwargs):
 39        """
 40        Instance method for getting topics, forwards instance self parameters to the _get_topics class method.
 41        """
 42
 43        self_kwargs = vars(self)
 44        kwargs.update(self_kwargs)
 45
 46        return self._get_topics(path, *args, **kwargs)
 47
 48    @classmethod
 49    @abc.abstractmethod
 50    def _get_topics(cls, path, *args, **kwargs) -> Dict[int, Dict[str, str]]:
 51        raise NotImplementedError
 52
 53
 54@dataclasses.dataclass(init=True)
 55class PickleParser(Parser):
 56    """
 57    Load topics from a pickle file
 58    """
 59
 60    @classmethod
 61    def _get_topics(cls, path, *args, **kwargs) -> Dict[int, Dict[str, str]]:
 62        return dill.load(path)
 63
 64
 65@dataclasses.dataclass(init=True)
 66class XMLParser(Parser):
 67    """
 68    Load topics from an XML file
 69    """
 70    topic_field_name: str
 71    id_field: str
 72    parse_fields: List[str]
 73
 74    @classmethod
 75    def _recurse_to_child_node(cls, node: ET.Element, track: List):
 76        """
 77        Helper method to get all children nodes for text extraction in an xml.
 78
 79        :param node: Current node
 80        :param track: List to track nodes
 81        :return:
 82        """
 83        if len(node.getchildren()) > 0:
 84            for child in node.getchildren():
 85                track.append(cls._recurse_to_child_node(child, track))
 86
 87        return node
 88
 89    @classmethod
 90    def unwrap(cls, doc_dict, key):
 91        """
 92        Converts defaultdict to dict and list of size 1 to just the element
 93
 94        :param doc_dict:
 95        :param key:
 96        """
 97        if isinstance(doc_dict[key], defaultdict):
 98            doc_dict[key] = dict(doc_dict[key])
 99
100            for e_key in doc_dict[key]:
101                cls.unwrap(doc_dict[key], e_key)
102
103        if isinstance(doc_dict[key], list):
104            if len(doc_dict[key]) == 1:
105                doc_dict[key] = doc_dict[key][0]
106
107    def _get_topics(self, path, *args, **kwargs) -> Dict[int, Dict[str, str]]:
108        all_topics = ET.parse(path).getroot()
109        qtopics = {}
110
111        for topic in all_topics.findall(self.topic_field_name):
112            _id = topic.attrib[self.id_field]
113            if _id.isnumeric():
114                _id = int(_id)
115
116            if self.parse_fields:
117                temp = {}
118                for field in self.parse_fields:
119                    try:
120                        temp[field] = topic.find(field).text.strip()
121                    except:
122                        continue
123
124                qtopics[_id] = temp
125            else:
126                #  The topic contains the text
127                qtopics[_id] = {"query": topic.text.strip()}
128
129        return qtopics
130
131
@dataclasses.dataclass
class CSVParser(Parser):
    """
    Loads topics from a CSV file
    """
    # Class-level fallbacks, used by _get_topics when it is called without
    # explicit id_field / parse_fields.
    # NOTE(review): these differ from the defaults applied in __init__
    # (["id", "text"]) -- confirm which column names are intended.
    id_field = "id"
    parse_fields = ["Text"]

    def __init__(self, id_field=None, parse_fields=None):
        """
        :param id_field: Column holding the topic id, defaults to "id"
        :param parse_fields: Columns to extract per row, defaults to ["id", "text"]
        """
        if parse_fields is None:
            parse_fields = ["id", "text"]

        if id_field is None:
            id_field = "id"

        super().__init__(id_field, parse_fields)

    @classmethod
    def _get_topics(cls, csvfile, dialect="excel",
                    id_field: str = None,
                    parse_fields: List[str] = None,
                    *args, **kwargs) -> Dict[int, Dict[str, str]]:
        """
        Read the topics from a CSV file with a header row.

        :param csvfile: Path of the CSV file, or an iterable of lines (e.g. an open file)
        :param dialect: csv dialect used for parsing
        :param id_field: Column holding the topic id, defaults to ``cls.id_field``
        :param parse_fields: Columns to extract per row, defaults to ``cls.parse_fields``
        :return: Dictionary of topic id -> dictionary of column name -> value
        :raises KeyError: If a requested column is missing from a row
        """
        if id_field is None:
            id_field = cls.id_field

        if parse_fields is None:
            parse_fields = cls.parse_fields

        if isinstance(csvfile, str):
            # We opened the file, so we close it. newline="" is what the csv
            # module asks for so that newlines inside quoted fields survive.
            with open(csvfile, 'rt', newline="") as csv_f:
                return cls._read_topics(csv_f, dialect, id_field, parse_fields)

        # Caller-provided file objects / iterables stay under caller control.
        return cls._read_topics(csvfile, dialect, id_field, parse_fields)

    @staticmethod
    def _read_topics(lines, dialect, id_field, parse_fields) -> Dict[str, Dict[str, str]]:
        """Build the topic id -> selected columns mapping from an iterable of CSV lines."""
        topics = {}

        for row in csv.DictReader(lines, dialect=dialect):
            topics[row[id_field]] = {field: row[field] for field in parse_fields}

        return topics
175
176
@dataclasses.dataclass(init=True)
class TSVParser(CSVParser):
    """
    Loads topics from a TSV (tab-separated values) file
    """

    @classmethod
    def _get_topics(cls, tsvfile, *args, **kwargs) -> Dict[int, Dict[str, str]]:
        """
        Read the topics from a tab-separated file by reusing the CSV reader
        with the "excel-tab" dialect.

        :param tsvfile: Path of the TSV file, or an iterable of lines
        :param args: Passed through to the CSV reader; note that the dialect is
                     fixed here, so it must not be supplied again
        :param kwargs: Passed through to the CSV reader (e.g. id_field, parse_fields)
        :return: Dictionary of topic id -> dictionary of column name -> value
        """
        # super() keeps ``cls`` bound to the actual (sub)class, so class-level
        # defaults overridden on a subclass are honoured.
        return super()._get_topics(tsvfile, *args, dialect='excel-tab', **kwargs)
183
184
@dataclasses.dataclass(init=True)
class JsonLinesParser(Parser):
    """
    Loads topics from a jsonl file,
    a JSON per line

    Provide parse_fields, id_field and whether to ignore full matches on json keys.
    secondary_id appends to the primary id as jsonlines are flattened structure and may contain duplicate ids.
    """
    # Keys to keep from every JSON object.
    parse_fields: List[str]
    # Key holding the topic id; it is removed from the stored topic.
    id_field: str
    # True: keep a key when it contains, or is contained in, a parse field.
    # False: keep a key only when it equals a parse field.
    ignore_full_match: bool = True
    # Optional key whose value is appended to the id as "<id>_<value>".
    secondary_id: str = None

    @classmethod
    def _get_topics(cls, jsonlfile, id_field, parse_fields,
                    ignore_full_match=True, secondary_id=None, *args, **kwargs) -> Dict[str, Dict]:
        """
        Read the topics from a JSON Lines file.

        :param jsonlfile: Path of the jsonl file
        :param id_field: Key holding the topic id
        :param parse_fields: Keys to keep from every JSON object
        :param ignore_full_match: Match keys by substring (True) or by equality (False)
        :param secondary_id: Optional key whose value is appended to the id
        :return: Dictionary of topic id -> dictionary of the kept keys
        :raises KeyError: If ``id_field`` (or ``secondary_id``, when given) is missing from a line
        """
        def _wanted(key):
            # Substring match in either direction, or exact match only.
            if ignore_full_match:
                return any(key in _key or _key in key for _key in parse_fields)
            return any(_key == key for _key in parse_fields)

        topics = {}

        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(jsonlfile, "r", encoding="utf-8") as jsonl_f:
            for jsonl in jsonl_f:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not jsonl.strip():
                    continue

                json_dict = json.loads(jsonl)
                _id = json_dict.pop(id_field)

                if secondary_id:
                    _id = str(_id) + "_" + str(json_dict[secondary_id])

                topics[_id] = {key: value for key, value in json_dict.items() if _wanted(key)}

        return topics
@dataclass(init=True)
class Parser:
19@dataclass(init=True)
20class Parser:
21    """
22    Parser interface
23    """
24
25    id_field: object
26    parse_fields: List[str]
27
28    @classmethod
29    def normalize(cls, input_dict) -> Dict:
30        """
31        Flatten the dictionary, i.e. from Dict[int, Dict] -> Dict[str, str_or_int]
32
33        :param input_dict:
34        :return:
35        """
36        return pd.io.json.json_normalize(input_dict,
37                                         sep=".").to_dict(orient='records')[0]
38
39    def get_topics(self, path, *args, **kwargs):
40        """
41        Instance method for getting topics, forwards instance self parameters to the _get_topics class method.
42        """
43
44        self_kwargs = vars(self)
45        kwargs.update(self_kwargs)
46
47        return self._get_topics(path, *args, **kwargs)
48
49    @classmethod
50    @abc.abstractmethod
51    def _get_topics(cls, path, *args, **kwargs) -> Dict[int, Dict[str, str]]:
52        raise NotImplementedError

Parser interface

Parser(id_field: object, parse_fields: List[str])
@classmethod
def normalize(cls, input_dict) -> Dict:
28    @classmethod
29    def normalize(cls, input_dict) -> Dict:
30        """
31        Flatten the dictionary, i.e. from Dict[int, Dict] -> Dict[str, str_or_int]
32
33        :param input_dict:
34        :return:
35        """
36        return pd.io.json.json_normalize(input_dict,
37                                         sep=".").to_dict(orient='records')[0]

Flatten the dictionary, i.e. from Dict[int, Dict] -> Dict[str, str_or_int]

Parameters
  • input_dict:
Returns
def get_topics(self, path, *args, **kwargs):
39    def get_topics(self, path, *args, **kwargs):
40        """
41        Instance method for getting topics, forwards instance self parameters to the _get_topics class method.
42        """
43
44        self_kwargs = vars(self)
45        kwargs.update(self_kwargs)
46
47        return self._get_topics(path, *args, **kwargs)

Instance method for getting topics, forwards instance self parameters to the _get_topics class method.

@dataclasses.dataclass(init=True)
class PickleParser(Parser):
55@dataclasses.dataclass(init=True)
56class PickleParser(Parser):
57    """
58    Load topics from a pickle file
59    """
60
61    @classmethod
62    def _get_topics(cls, path, *args, **kwargs) -> Dict[int, Dict[str, str]]:
63        return dill.load(path)

Load topics from a pickle file

PickleParser(id_field: object, parse_fields: List[str])
Inherited Members
Parser
normalize
get_topics
@dataclasses.dataclass(init=True)
class XMLParser(Parser):
 66@dataclasses.dataclass(init=True)
 67class XMLParser(Parser):
 68    """
 69    Load topics from an XML file
 70    """
 71    topic_field_name: str
 72    id_field: str
 73    parse_fields: List[str]
 74
 75    @classmethod
 76    def _recurse_to_child_node(cls, node: ET.Element, track: List):
 77        """
 78        Helper method to get all children nodes for text extraction in an xml.
 79
 80        :param node: Current node
 81        :param track: List to track nodes
 82        :return:
 83        """
 84        if len(node.getchildren()) > 0:
 85            for child in node.getchildren():
 86                track.append(cls._recurse_to_child_node(child, track))
 87
 88        return node
 89
 90    @classmethod
 91    def unwrap(cls, doc_dict, key):
 92        """
 93        Converts defaultdict to dict and list of size 1 to just the element
 94
 95        :param doc_dict:
 96        :param key:
 97        """
 98        if isinstance(doc_dict[key], defaultdict):
 99            doc_dict[key] = dict(doc_dict[key])
100
101            for e_key in doc_dict[key]:
102                cls.unwrap(doc_dict[key], e_key)
103
104        if isinstance(doc_dict[key], list):
105            if len(doc_dict[key]) == 1:
106                doc_dict[key] = doc_dict[key][0]
107
108    def _get_topics(self, path, *args, **kwargs) -> Dict[int, Dict[str, str]]:
109        all_topics = ET.parse(path).getroot()
110        qtopics = {}
111
112        for topic in all_topics.findall(self.topic_field_name):
113            _id = topic.attrib[self.id_field]
114            if _id.isnumeric():
115                _id = int(_id)
116
117            if self.parse_fields:
118                temp = {}
119                for field in self.parse_fields:
120                    try:
121                        temp[field] = topic.find(field).text.strip()
122                    except:
123                        continue
124
125                qtopics[_id] = temp
126            else:
127                #  The topic contains the text
128                qtopics[_id] = {"query": topic.text.strip()}
129
130        return qtopics

Load topics from an XML file

XMLParser(id_field: str, parse_fields: List[str], topic_field_name: str)
@classmethod
def unwrap(cls, doc_dict, key):
 90    @classmethod
 91    def unwrap(cls, doc_dict, key):
 92        """
 93        Converts defaultdict to dict and list of size 1 to just the element
 94
 95        :param doc_dict:
 96        :param key:
 97        """
 98        if isinstance(doc_dict[key], defaultdict):
 99            doc_dict[key] = dict(doc_dict[key])
100
101            for e_key in doc_dict[key]:
102                cls.unwrap(doc_dict[key], e_key)
103
104        if isinstance(doc_dict[key], list):
105            if len(doc_dict[key]) == 1:
106                doc_dict[key] = doc_dict[key][0]

Converts defaultdict to dict and list of size 1 to just the element

Parameters
  • doc_dict:
  • key:
Inherited Members
Parser
normalize
get_topics
@dataclasses.dataclass
class CSVParser(Parser):
133@dataclasses.dataclass
134class CSVParser(Parser):
135    """
136    Loads topics from a CSV file
137    """
138    id_field = "id"
139    parse_fields = ["Text"]
140
141    def __init__(self, id_field=None, parse_fields=None):
142        if parse_fields is None:
143            parse_fields = ["id", "text"]
144
145        if id_field is None:
146            id_field = "id"
147
148        super().__init__(id_field, parse_fields)
149
150    @classmethod
151    def _get_topics(cls, csvfile, dialect="excel",
152                    id_field: str = None,
153                    parse_fields: List[str] = None,
154                    *args, **kwargs) -> Dict[int, Dict[str, str]]:
155        topics = {}
156
157        if isinstance(csvfile, str):
158            csvfile = open(csvfile, 'rt')
159
160        if id_field is None:
161            id_field = cls.id_field
162
163        if parse_fields is None:
164            parse_fields = cls.parse_fields
165
166        reader = csv.DictReader(csvfile, dialect=dialect)
167        for row in reader:
168            temp = {}
169
170            for field in parse_fields:
171                temp[field] = row[field]
172
173            topics[row[id_field]] = temp
174
175        return topics

Loads topics from a CSV file

CSVParser(id_field=None, parse_fields=None)
141    def __init__(self, id_field=None, parse_fields=None):
142        if parse_fields is None:
143            parse_fields = ["id", "text"]
144
145        if id_field is None:
146            id_field = "id"
147
148        super().__init__(id_field, parse_fields)
Inherited Members
Parser
normalize
get_topics
@dataclasses.dataclass(init=True)
class TSVParser(CSVParser):
178@dataclasses.dataclass(init=True)
179class TSVParser(CSVParser):
180
181    @classmethod
182    def _get_topics(cls, tsvfile, *args, **kwargs) -> Dict[int, Dict[str, str]]:
183        return CSVParser._get_topics(tsvfile, *args, dialect='excel-tab', **kwargs)
TSVParser(id_field: object, parse_fields: List[str])
Inherited Members
Parser
normalize
get_topics
@dataclasses.dataclass(init=True)
class JsonLinesParser(Parser):
186@dataclasses.dataclass(init=True)
187class JsonLinesParser(Parser):
188    """
189    Loads topics from a jsonl file,
190    a JSON per line
191
192    Provide parse_fields, id_field and whether to ignore full matches on json keys
193    secondary_id appends to the primary id as jsonlines are flattened structure and may contain duplicate ids.
194    """
195    parse_fields: List[str]
196    id_field: str
197    ignore_full_match: bool = True
198    secondary_id: str = None
199
200    @classmethod
201    def _get_topics(cls, jsonlfile, id_field, parse_fields,
202                    ignore_full_match=True, secondary_id=None, *args, **kwargs) -> Dict[str, Dict]:
203        with open(jsonlfile, "r") as jsonl_f:
204            topics = {}
205
206            for jsonl in jsonl_f:
207                json_dict = json.loads(jsonl)
208                _id = json_dict.pop(id_field)
209
210                if secondary_id:
211                    _id = str(_id) + "_" + str(json_dict[secondary_id])
212
213                for key in list(json_dict.keys()):
214                    found = False
215                    for _key in parse_fields:
216                        if ignore_full_match:
217                            if key in _key or key == _key or _key in key:
218                                found = True
219                        else:
220                            if _key == key:
221                                found = True
222                    if not found:
223                        json_dict.pop(key)
224
225                topics[_id] = json_dict
226
227        return topics

Loads topics from a jsonl file, i.e. a file containing one JSON object per line.

Provide parse_fields, id_field and whether to ignore full matches on json keys. When secondary_id is given, its value is appended to the primary id, because jsonlines files are a flattened structure and may contain duplicate ids.

JsonLinesParser( id_field: str, parse_fields: List[str], ignore_full_match: bool = True, secondary_id: str = None)
Inherited Members
Parser
normalize
get_topics