debeir.core.converters

 1from collections import defaultdict
 2from typing import Dict, Union
 3
 4from debeir.core.parser import Parser
 5
 6import datasets
 7
 8
 9class ParsedTopicsToDataset:
10    """
11    Converts a parser's output to a huggingface dataset object.
12    """
13
14    @classmethod
15    def convert(cls, parser: Parser, output: Dict[Union[str, int], Dict]):
16        """
17        Flatten a Dict of shape (traditional parser output)
18        {topic_id: {
19                "Facet_1": ...
20                "Facet_2": ...
21            }
22        }
23
24        ->
25
26        To a flattened arrow-like dataset.
27        {
28        topic_ids: [],
29        Facet_1s: [],
30        Facet_2s: [],
31        }
32
33        :param output: Topics output from the parser object
34        :return:
35        """
36        flattened_topics = defaultdict(lambda: [])
37
38        for topic_id, topic in output.items():
39            flattened_topics["topic_id"].append(topic_id)
40
41            for field in parser.parse_fields:
42                if field in topic:
43                    flattened_topics[field].append(topic[field])
44                else:
45                    flattened_topics[field].append(None)
46
47        return datasets.Dataset.from_dict(flattened_topics)
class ParsedTopicsToDataset:
class ParsedTopicsToDataset:
    """
    Converts a parser's output to a huggingface dataset object.
    """

    @classmethod
    def convert(cls, parser: Parser, output: Dict[Union[str, int], Dict]):
        """
        Flatten a Dict of shape (traditional parser output)
        {topic_id: {
                "Facet_1": ...
                "Facet_2": ...
            }
        }

        ->

        To a flattened, column-oriented (arrow-like) dataset.
        {
        "topic_id": [...],
        "Facet_1": [...],
        "Facet_2": [...],
        }

        One column is produced for "topic_id" and one for every name in
        ``parser.parse_fields``. A topic that lacks a field contributes
        ``None`` to that column, so all columns keep the same length.

        :param parser: Object whose ``parse_fields`` attribute lists the field
            names to extract from each topic
        :param output: Topics output from the parser object
        :return: The result of ``datasets.Dataset.from_dict`` applied to the
            flattened columns
        """
        # Materialise once so the same field order is used for seeding the
        # columns and for every topic.
        fields = list(parser.parse_fields)

        # Seed every column up front so that an empty ``output`` still yields
        # a dataset exposing the expected columns (with zero rows) instead of
        # a dataset with no columns at all.
        # NOTE(review): if parse_fields itself contains "topic_id", that column
        # receives two values per topic (as in the original code) -- confirm
        # that parsers never declare such a field.
        columns = {"topic_id": []}
        for field in fields:
            columns.setdefault(field, [])

        for topic_id, topic in output.items():
            columns["topic_id"].append(topic_id)

            for field in fields:
                # dict.get pads missing facets with None.
                columns[field].append(topic.get(field))

        return datasets.Dataset.from_dict(columns)

Converts a parser's output to a huggingface dataset object.

ParsedTopicsToDataset()
@classmethod
def convert( cls, parser: debeir.core.parser.Parser, output: Dict[Union[str, int], Dict]):
15    @classmethod
16    def convert(cls, parser: Parser, output: Dict[Union[str, int], Dict]):
17        """
18        Flatten a Dict of shape (traditional parser output)
19        {topic_id: {
20                "Facet_1": ...
21                "Facet_2": ...
22            }
23        }
24
25        ->
26
27        To a flattened arrow-like dataset.
28        {
29        topic_ids: [],
30        Facet_1s: [],
31        Facet_2s: [],
32        }
33
34        :param output: Topics output from the parser object
35        :return:
36        """
37        flattened_topics = defaultdict(lambda: [])
38
39        for topic_id, topic in output.items():
40            flattened_topics["topic_id"].append(topic_id)
41
42            for field in parser.parse_fields:
43                if field in topic:
44                    flattened_topics[field].append(topic[field])
45                else:
46                    flattened_topics[field].append(None)
47
48        return datasets.Dataset.from_dict(flattened_topics)

Flatten a Dict of shape (traditional parser output) {topic_id: { "Facet_1": ... "Facet_2": ... } }

->

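To a flattened arrow-like dataset. { "topic_id": [...], "Facet_1": [...], "Facet_2": [...], } (one list per field, keyed by the field name itself; a topic missing a field contributes None)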

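Parameters
  • parser: Parser whose parse_fields attribute lists the fields to extract from each topic
  • output: Topics output from the parser object
Returns
  • A datasets.Dataset built from the flattened columns via datasets.Dataset.from_dict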