debeir.datasets.bioreddit

 1from typing import Dict
 2
 3from debeir.core.parser import CSVParser
 4from debeir.core.query import GenericElasticsearchQuery
 5
 6
 7class BioRedditSubmissionParser(CSVParser):
 8    """
 9    Parser for the BioReddit Submission Dataset
10    """
11    parse_fields = ["id", "body"]
12
13    @classmethod
14    def get_topics(cls, csvfile) -> Dict[int, Dict[str, str]]:
15        return super().get_topics(csvfile)
16
17
class BioRedditCommentParser(CSVParser):
    """
    Topic parser for the BioReddit Comment dataset.

    ``parse_fields`` names the "id", "parent_id", "selftext" and "title"
    fields (presumably the CSV columns read by the parent CSVParser --
    confirm against that class).
    """
    parse_fields = ["id", "parent_id", "selftext", "title"]

    @classmethod
    def get_topics(cls, csvfile) -> Dict[str, Dict[str, str]]:
        """
        Parse ``csvfile`` with the parent parser and re-key the topics by id.

        Each topic has its "selftext" entry moved to "text" and its "title"
        entry moved to "text2". The topic dicts are modified in place.

        :param csvfile: passed through unchanged to ``CSVParser.get_topics``
        :return: mapping of each topic's "id" value to the renamed topic dict
        """
        # (old key, new key) pairs, applied in this order to every topic.
        renames = (("selftext", "text"), ("title", "text2"))

        parsed = super().get_topics(csvfile)
        by_id = {}

        # The keys of the parent's mapping are discarded; only the records matter.
        for record in parsed.values():
            for old_key, new_key in renames:
                record[new_key] = record.pop(old_key)
            by_id[record["id"]] = record

        return by_id
35
36
class BioRedditElasticsearchQuery(GenericElasticsearchQuery):
    """
    Elasticsearch query object for BioReddit.

    Sets the text / embedding mapping names and registers the query
    generators by query type on top of GenericElasticsearchQuery.
    """

    def __init__(self, topics, config, *args, **kwargs):
        """
        Initialise the parent query object, then set the BioReddit attributes.

        :param topics: stored on the instance as ``self.topics``
        :param config: stored as ``self.config``; its ``query_type`` attribute
            is copied to ``self.query_type``
        :param args: extra positional arguments forwarded to the parent
        :param kwargs: extra keyword arguments forwarded to the parent
        """
        super().__init__(topics, config, *args, **kwargs)

        # Re-bind the inputs on the instance after the parent initialiser has run.
        self.topics, self.config = topics, config
        self.query_type = self.config.query_type

        # Mapping names for plain-text and embedding queries
        # (presumably index field names -- confirm against the index schema).
        self.mappings = ["Text"]
        self.embed_mappings = ["Text_Embedding"]

        # Dispatch table from query type to generator method; neither method
        # is defined in this class, so both are expected from the parent.
        generators = {
            "query": self.generate_query,
            "embedding": self.generate_query_embedding,
        }
        self.query_funcs = generators
class BioRedditSubmissionParser(debeir.core.parser.CSVParser):
 8class BioRedditSubmissionParser(CSVParser):
 9    """
10    Parser for the BioReddit Submission Dataset
11    """
12    parse_fields = ["id", "body"]
13
14    @classmethod
15    def get_topics(cls, csvfile) -> Dict[int, Dict[str, str]]:
16        return super().get_topics(csvfile)

Parser for the BioReddit Submission Dataset

@classmethod
def get_topics(cls, csvfile) -> Dict[int, Dict[str, str]]:
14    @classmethod
15    def get_topics(cls, csvfile) -> Dict[int, Dict[str, str]]:
16        return super().get_topics(csvfile)

Class method for getting topics; forwards csvfile unchanged to the parent CSVParser.get_topics and returns its result.

class BioRedditCommentParser(debeir.core.parser.CSVParser):
class BioRedditCommentParser(CSVParser):
    """
    Topic parser for the BioReddit Comment dataset.

    ``parse_fields`` names the "id", "parent_id", "selftext" and "title"
    fields (presumably the CSV columns read by the parent CSVParser --
    confirm against that class).
    """
    parse_fields = ["id", "parent_id", "selftext", "title"]

    @classmethod
    def get_topics(cls, csvfile) -> Dict[str, Dict[str, str]]:
        """
        Parse ``csvfile`` with the parent parser and re-key the topics by id.

        Each topic has its "selftext" entry moved to "text" and its "title"
        entry moved to "text2". The topic dicts are modified in place.

        :param csvfile: passed through unchanged to ``CSVParser.get_topics``
        :return: mapping of each topic's "id" value to the renamed topic dict
        """
        # (old key, new key) pairs, applied in this order to every topic.
        renames = (("selftext", "text"), ("title", "text2"))

        parsed = super().get_topics(csvfile)
        by_id = {}

        # The keys of the parent's mapping are discarded; only the records matter.
        for record in parsed.values():
            for old_key, new_key in renames:
                record[new_key] = record.pop(old_key)
            by_id[record["id"]] = record

        return by_id

Parser for the BioReddit Comment Dataset

@classmethod
def get_topics(cls, csvfile) -> Dict[str, Dict[str, str]]:
25    @classmethod
26    def get_topics(cls, csvfile) -> Dict[str, Dict[str, str]]:
27        topics = super().get_topics(csvfile)
28        temp = {}
29
30        for _, topic in topics.items():
31            topic["text"] = topic.pop("selftext")
32            topic["text2"] = topic.pop("title")
33            temp[topic["id"]] = topic
34
35        return temp

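Class method for getting topics; calls the parent CSVParser.get_topics, renames each topic's "selftext" field to "text" and its "title" field to "text2", and returns the topics keyed by their "id" field.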

class BioRedditElasticsearchQuery(debeir.core.query.GenericElasticsearchQuery):
class BioRedditElasticsearchQuery(GenericElasticsearchQuery):
    """
    Elasticsearch query object for BioReddit.

    Sets the text / embedding mapping names and registers the query
    generators by query type on top of GenericElasticsearchQuery.
    """

    def __init__(self, topics, config, *args, **kwargs):
        """
        Initialise the parent query object, then set the BioReddit attributes.

        :param topics: stored on the instance as ``self.topics``
        :param config: stored as ``self.config``; its ``query_type`` attribute
            is copied to ``self.query_type``
        :param args: extra positional arguments forwarded to the parent
        :param kwargs: extra keyword arguments forwarded to the parent
        """
        super().__init__(topics, config, *args, **kwargs)

        # Re-bind the inputs on the instance after the parent initialiser has run.
        self.topics, self.config = topics, config
        self.query_type = self.config.query_type

        # Mapping names for plain-text and embedding queries
        # (presumably index field names -- confirm against the index schema).
        self.mappings = ["Text"]
        self.embed_mappings = ["Text_Embedding"]

        # Dispatch table from query type to generator method; neither method
        # is defined in this class, so both are expected from the parent.
        generators = {
            "query": self.generate_query,
            "embedding": self.generate_query_embedding,
        }
        self.query_funcs = generators

Elasticsearch query object for the BioReddit dataset.

BioRedditElasticsearchQuery(topics, config, *args, **kwargs)
43    def __init__(self, topics, config, *args, **kwargs):
44        super().__init__(topics, config, *args, **kwargs)
45        self.mappings = ["Text"]
46
47        self.topics = topics
48        self.config = config
49        self.query_type = self.config.query_type
50
51        self.embed_mappings = ["Text_Embedding"]
52
53        self.query_funcs = {
54            "query": self.generate_query,
55            "embedding": self.generate_query_embedding,
56        }