debeir.datasets.trec_clinical_trials

  1import pathlib
  2import xml.etree.ElementTree as ET
  3from collections import defaultdict
  4from typing import Dict, List
  5
  6import pandas as pd
  7from debeir.core.parser import JsonLinesParser, XMLParser
  8from debeir.core.query import GenericElasticsearchQuery
  9
 10
 11class TREClinicalTrialDocumentParser(XMLParser):
 12    """
 13    Parser for Clinical Trials topics
 14    """
 15
 16    parse_fields: List[str] = ["brief_title", "official_title",
 17                               "brief_summary", "detailed_description",
 18                               "eligibility", "condition_browse",
 19                               "intervention_browse"]
 20    topic_field_name: str
 21    id_field: str
 22
 23    @classmethod
 24    def extract(cls, path) -> Dict:
 25        document = ET.parse(path).getroot()
 26        document_dict = defaultdict(lambda: defaultdict(lambda: []))
 27        document_dict['doc_id'] = pathlib.Path(path).parts[-1].strip(".xml")
 28
 29        for parse_field in cls.parse_fields:
 30            node = document.find(parse_field)
 31            nodes: List[ET.Element] = []
 32
 33            if node is not None:
 34                cls._recurse_to_child_node(node, nodes)
 35
 36            if len(nodes) == 0 and node is not None:
 37                document_dict[parse_field] = node.text
 38
 39            for node in nodes:
 40                text = node.text.strip()
 41
 42                if not text:
 43                    continue
 44
 45                if document_dict[parse_field][node.tag]:
 46                    document_dict[parse_field][node.tag].append(text)
 47                else:
 48                    document_dict[parse_field][node.tag] = [text]
 49
 50            cls.unwrap(document_dict, parse_field)
 51
 52        document_dict = pd.io.json.json_normalize(document_dict,
 53                                                  sep=".").to_dict(orient='records')[0]
 54
 55        return document_dict
 56
 57
 58TrecClinicalTrialTripletParser = JsonLinesParser(
 59    parse_fields=["q_text", "brief_title", "official_title",
 60                  "brief_summary", "detailed_description", "rel"],
 61    id_field="qid",
 62    secondary_id="doc_id",
 63    ignore_full_match=True
 64)
 65
 66TrecClinicalTrialsParser = XMLParser(
 67    parse_fields=None,
 68    id_field="number",
 69    topic_field_name="topic")
 70
 71
 72class TrecClincialElasticsearchQuery(GenericElasticsearchQuery):
 73    def __init__(self, topics, config, *args, **kwargs):
 74        super().__init__(topics, config, *args, **kwargs)
 75
 76        # self.mappings = ['BriefTitle_Text',
 77        #                 'BriefSummary_Text',
 78        #                 'DetailedDescription_Text']
 79
 80        self.mappings = [
 81            "BriefSummary_Text",
 82            "BriefTitle_Text",
 83            'DetailedDescription_Text',
 84            'Eligibility.Criteria.Textblock'
 85            'Eligibility.StudyPop.Textblock',
 86            'ConditionBrowse.MeshTerm',
 87            'InterventionBrowse.MeshTerm',
 88            'Condition',
 89            'Eligibility.Gender',
 90            "OfficialTitle"]
 91
 92        self.topics = topics
 93        self.config = config
 94        self.query_type = self.config.query_type
 95
 96        self.embed_mappings = ['BriefTitle_Embedding',
 97                               'BriefSummary_Embedding',
 98                               'DetailedDescription_Embedding']
 99
100        self.id_mapping = "docid"
101
102        self.query_funcs = {
103            "query": self.generate_query,
104            "embedding": self.generate_query_embedding,
105        }
class TREClinicalTrialDocumentParser(debeir.core.parser.XMLParser):
class TREClinicalTrialDocumentParser(XMLParser):
    """
    Parser for Clinical Trials documents (one XML file per trial).

    ``extract`` reads the elements named in ``parse_fields`` from the root of
    the XML document and returns them as a single flat dictionary.
    """

    # Direct children of the XML root element that are extracted.
    parse_fields: List[str] = ["brief_title", "official_title",
                               "brief_summary", "detailed_description",
                               "eligibility", "condition_browse",
                               "intervention_browse"]
    topic_field_name: str
    id_field: str

    @classmethod
    def extract(cls, path) -> Dict:
        """
        Parse a single XML document into a flat dictionary.

        :param path: Location of the XML file. The file name, without its
                     ``.xml`` suffix, becomes the ``doc_id`` of the result.
        :return: Dictionary holding ``doc_id`` plus the extracted fields.
                 Nested fields are flattened with ``.`` as the key separator
                 (e.g. ``parent.child``).
        """
        document = ET.parse(path).getroot()
        document_dict = defaultdict(lambda: defaultdict(list))

        # str.strip(".xml") removes any of the characters '.', 'x', 'm', 'l'
        # from BOTH ends of the name, so only the literal suffix is cut here.
        doc_id = pathlib.Path(path).name
        if doc_id.endswith(".xml"):
            doc_id = doc_id[:-len(".xml")]
        document_dict['doc_id'] = doc_id

        for parse_field in cls.parse_fields:
            node = document.find(parse_field)
            nodes: List[ET.Element] = []

            if node is not None:
                # Helper inherited from the parent class; it fills `nodes`
                # with the descendants of `node`.
                cls._recurse_to_child_node(node, nodes)

                if not nodes:
                    # No descendants collected: keep the element's own text.
                    document_dict[parse_field] = node.text

            for child in nodes:
                # Element.text is None for elements without text content.
                text = (child.text or "").strip()

                if not text:
                    continue

                # The inner defaultdict creates the list on first access.
                document_dict[parse_field][child.tag].append(text)

            # NOTE(review): `unwrap` is defined on the parent class and is
            # called for every field, including those missing from the file.
            cls.unwrap(document_dict, parse_field)

        # pd.io.json.json_normalize is the deprecated alias of this function.
        document_dict = pd.json_normalize(document_dict,
                                          sep=".").to_dict(orient='records')[0]

        return document_dict

Parser for Clinical Trials documents (one XML file per trial)

@classmethod
def extract(cls, path) -> Dict:
24    @classmethod
25    def extract(cls, path) -> Dict:
26        document = ET.parse(path).getroot()
27        document_dict = defaultdict(lambda: defaultdict(lambda: []))
28        document_dict['doc_id'] = pathlib.Path(path).parts[-1].strip(".xml")
29
30        for parse_field in cls.parse_fields:
31            node = document.find(parse_field)
32            nodes: List[ET.Element] = []
33
34            if node is not None:
35                cls._recurse_to_child_node(node, nodes)
36
37            if len(nodes) == 0 and node is not None:
38                document_dict[parse_field] = node.text
39
40            for node in nodes:
41                text = node.text.strip()
42
43                if not text:
44                    continue
45
46                if document_dict[parse_field][node.tag]:
47                    document_dict[parse_field][node.tag].append(text)
48                else:
49                    document_dict[parse_field][node.tag] = [text]
50
51            cls.unwrap(document_dict, parse_field)
52
53        document_dict = pd.io.json.json_normalize(document_dict,
54                                                  sep=".").to_dict(orient='records')[0]
55
56        return document_dict
class TrecClincialElasticsearchQuery(debeir.core.query.GenericElasticsearchQuery):
 73class TrecClincialElasticsearchQuery(GenericElasticsearchQuery):
 74    def __init__(self, topics, config, *args, **kwargs):
 75        super().__init__(topics, config, *args, **kwargs)
 76
 77        # self.mappings = ['BriefTitle_Text',
 78        #                 'BriefSummary_Text',
 79        #                 'DetailedDescription_Text']
 80
 81        self.mappings = [
 82            "BriefSummary_Text",
 83            "BriefTitle_Text",
 84            'DetailedDescription_Text',
 85            'Eligibility.Criteria.Textblock'
 86            'Eligibility.StudyPop.Textblock',
 87            'ConditionBrowse.MeshTerm',
 88            'InterventionBrowse.MeshTerm',
 89            'Condition',
 90            'Eligibility.Gender',
 91            "OfficialTitle"]
 92
 93        self.topics = topics
 94        self.config = config
 95        self.query_type = self.config.query_type
 96
 97        self.embed_mappings = ['BriefTitle_Embedding',
 98                               'BriefSummary_Embedding',
 99                               'DetailedDescription_Embedding']
100
101        self.id_mapping = "docid"
102
103        self.query_funcs = {
104            "query": self.generate_query,
105            "embedding": self.generate_query_embedding,
106        }

A generic Elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries. Requires topics and a config to be provided.

TrecClincialElasticsearchQuery(topics, config, *args, **kwargs)
 74    def __init__(self, topics, config, *args, **kwargs):
 75        super().__init__(topics, config, *args, **kwargs)
 76
 77        # self.mappings = ['BriefTitle_Text',
 78        #                 'BriefSummary_Text',
 79        #                 'DetailedDescription_Text']
 80
 81        self.mappings = [
 82            "BriefSummary_Text",
 83            "BriefTitle_Text",
 84            'DetailedDescription_Text',
 85            'Eligibility.Criteria.Textblock'
 86            'Eligibility.StudyPop.Textblock',
 87            'ConditionBrowse.MeshTerm',
 88            'InterventionBrowse.MeshTerm',
 89            'Condition',
 90            'Eligibility.Gender',
 91            "OfficialTitle"]
 92
 93        self.topics = topics
 94        self.config = config
 95        self.query_type = self.config.query_type
 96
 97        self.embed_mappings = ['BriefTitle_Embedding',
 98                               'BriefSummary_Embedding',
 99                               'DetailedDescription_Embedding']
100
101        self.id_mapping = "docid"
102
103        self.query_funcs = {
104            "query": self.generate_query,
105            "embedding": self.generate_query_embedding,
106        }