debeir.datasets.trec_clinical_trials
import pathlib
import xml.etree.ElementTree as ET
from collections import defaultdict
from typing import Dict, List

import pandas as pd
from debeir.core.parser import JsonLinesParser, XMLParser
from debeir.core.query import GenericElasticsearchQuery


class TREClinicalTrialDocumentParser(XMLParser):
    """
    Parser for Clinical Trials documents (one XML file per trial).
    """

    # Top-level XML elements that are pulled out of every document.
    parse_fields: List[str] = ["brief_title", "official_title",
                               "brief_summary", "detailed_description",
                               "eligibility", "condition_browse",
                               "intervention_browse"]
    topic_field_name: str
    id_field: str

    @classmethod
    def extract(cls, path) -> Dict:
        """
        Parse one clinical-trial XML file into a flat dictionary.

        :param path: Location of the XML document. The file name, minus a
            trailing ".xml", is stored under the "doc_id" key.
        :return: The first record produced by pandas' ``json_normalize``
            with "." as the separator for nested keys.
        """
        document = ET.parse(path).getroot()
        document_dict = defaultdict(lambda: defaultdict(lambda: []))

        # str.strip(".xml") removes any of the characters '.', 'x', 'm', 'l'
        # from BOTH ends, which mangles names such as "lymphoma.xml".
        # Only drop the literal suffix.
        file_name = pathlib.Path(path).parts[-1]
        if file_name.endswith(".xml"):
            file_name = file_name[:-len(".xml")]
        document_dict['doc_id'] = file_name

        for parse_field in cls.parse_fields:
            node = document.find(parse_field)
            nodes: List[ET.Element] = []

            if node is not None:
                # Inherited helper; presumably collects the descendants of
                # `node` into `nodes` -- confirm against XMLParser.
                cls._recurse_to_child_node(node, nodes)

                if not nodes:
                    # Nothing was collected: store the element's own text.
                    document_dict[parse_field] = node.text

            for child in nodes:
                # Element.text is None for empty elements such as <tag/>.
                text = (child.text or "").strip()

                if not text:
                    continue

                # The inner defaultdict supplies a fresh list on first access.
                document_dict[parse_field][child.tag].append(text)

            cls.unwrap(document_dict, parse_field)

        # pandas >= 1.0 exposes json_normalize at the top level and newer
        # releases no longer provide it under pd.io.json; fall back to the
        # old location only when the top-level name is missing.
        normalize = getattr(pd, "json_normalize", None) or pd.io.json.json_normalize

        return normalize(document_dict, sep=".").to_dict(orient='records')[0]


TrecClinicalTrialTripletParser = JsonLinesParser(
    parse_fields=["q_text", "brief_title", "official_title",
                  "brief_summary", "detailed_description", "rel"],
    id_field="qid",
    secondary_id="doc_id",
    ignore_full_match=True
)

TrecClinicalTrialsParser = XMLParser(
    parse_fields=None,
    id_field="number",
    topic_field_name="topic")


class TrecClincialElasticsearchQuery(GenericElasticsearchQuery):
    """
    Elasticsearch query configured with the clinical-trials index field names.
    """

    def __init__(self, topics, config, *args, **kwargs):
        """
        :param topics: Topics to query with; forwarded to the parent class.
        :param config: Configuration object; its ``query_type`` attribute is
            read here. Forwarded to the parent class.
        :param args: Extra positional arguments for the parent class.
        :param kwargs: Extra keyword arguments for the parent class.
        """
        super().__init__(topics, config, *args, **kwargs)

        # Text fields of the index. Every entry needs its own trailing comma:
        # adjacent string literals are silently concatenated by Python.
        self.mappings = [
            "BriefSummary_Text",
            "BriefTitle_Text",
            'DetailedDescription_Text',
            'Eligibility.Criteria.Textblock',
            'Eligibility.StudyPop.Textblock',
            'ConditionBrowse.MeshTerm',
            'InterventionBrowse.MeshTerm',
            'Condition',
            'Eligibility.Gender',
            "OfficialTitle",
        ]

        self.topics = topics
        self.config = config
        self.query_type = self.config.query_type

        # Embedding fields of the index.
        self.embed_mappings = ['BriefTitle_Embedding',
                               'BriefSummary_Embedding',
                               'DetailedDescription_Embedding']

        self.id_mapping = "docid"

        # Dispatch table from query type name to the generator method.
        self.query_funcs = {
            "query": self.generate_query,
            "embedding": self.generate_query_embedding,
        }
class TREClinicalTrialDocumentParser(XMLParser):
    """
    Parser for Clinical Trials documents (one XML file per trial).
    """

    # Top-level XML elements that are pulled out of every document.
    parse_fields: List[str] = ["brief_title", "official_title",
                               "brief_summary", "detailed_description",
                               "eligibility", "condition_browse",
                               "intervention_browse"]
    topic_field_name: str
    id_field: str

    @classmethod
    def extract(cls, path) -> Dict:
        """
        Parse one clinical-trial XML file into a flat dictionary.

        :param path: Location of the XML document. The file name, minus a
            trailing ".xml", is stored under the "doc_id" key.
        :return: The first record produced by pandas' ``json_normalize``
            with "." as the separator for nested keys.
        """
        document = ET.parse(path).getroot()
        document_dict = defaultdict(lambda: defaultdict(lambda: []))

        # str.strip(".xml") removes any of the characters '.', 'x', 'm', 'l'
        # from BOTH ends, which mangles names such as "lymphoma.xml".
        # Only drop the literal suffix.
        file_name = pathlib.Path(path).parts[-1]
        if file_name.endswith(".xml"):
            file_name = file_name[:-len(".xml")]
        document_dict['doc_id'] = file_name

        for parse_field in cls.parse_fields:
            node = document.find(parse_field)
            nodes: List[ET.Element] = []

            if node is not None:
                # Inherited helper; presumably collects the descendants of
                # `node` into `nodes` -- confirm against XMLParser.
                cls._recurse_to_child_node(node, nodes)

                if not nodes:
                    # Nothing was collected: store the element's own text.
                    document_dict[parse_field] = node.text

            for child in nodes:
                # Element.text is None for empty elements such as <tag/>.
                text = (child.text or "").strip()

                if not text:
                    continue

                # The inner defaultdict supplies a fresh list on first access.
                document_dict[parse_field][child.tag].append(text)

            cls.unwrap(document_dict, parse_field)

        # pandas >= 1.0 exposes json_normalize at the top level and newer
        # releases no longer provide it under pd.io.json; fall back to the
        # old location only when the top-level name is missing.
        normalize = getattr(pd, "json_normalize", None) or pd.io.json.json_normalize

        return normalize(document_dict, sep=".").to_dict(orient='records')[0]
Parser for Clinical Trials documents
@classmethod
def
extract(cls, path) -> Dict:
@classmethod
def extract(cls, path) -> Dict:
    """
    Parse one clinical-trial XML file into a flat dictionary.

    :param path: Location of the XML document. The file name, minus a
        trailing ".xml", is stored under the "doc_id" key.
    :return: The first record produced by pandas' ``json_normalize``
        with "." as the separator for nested keys.
    """
    document = ET.parse(path).getroot()
    document_dict = defaultdict(lambda: defaultdict(lambda: []))

    # str.strip(".xml") removes any of the characters '.', 'x', 'm', 'l'
    # from BOTH ends, which mangles names such as "lymphoma.xml".
    # Only drop the literal suffix.
    file_name = pathlib.Path(path).parts[-1]
    if file_name.endswith(".xml"):
        file_name = file_name[:-len(".xml")]
    document_dict['doc_id'] = file_name

    for parse_field in cls.parse_fields:
        node = document.find(parse_field)
        nodes: List[ET.Element] = []

        if node is not None:
            # Inherited helper; presumably collects the descendants of
            # `node` into `nodes` -- confirm against the parent parser.
            cls._recurse_to_child_node(node, nodes)

            if not nodes:
                # Nothing was collected: store the element's own text.
                document_dict[parse_field] = node.text

        for child in nodes:
            # Element.text is None for empty elements such as <tag/>.
            text = (child.text or "").strip()

            if not text:
                continue

            # The inner defaultdict supplies a fresh list on first access.
            document_dict[parse_field][child.tag].append(text)

        cls.unwrap(document_dict, parse_field)

    # pandas >= 1.0 exposes json_normalize at the top level and newer
    # releases no longer provide it under pd.io.json; fall back to the
    # old location only when the top-level name is missing.
    normalize = getattr(pd, "json_normalize", None) or pd.io.json.json_normalize

    return normalize(document_dict, sep=".").to_dict(orient='records')[0]
Inherited Members
class TrecClincialElasticsearchQuery(GenericElasticsearchQuery):
    """
    Elasticsearch query configured with the clinical-trials index field names.
    """

    def __init__(self, topics, config, *args, **kwargs):
        """
        :param topics: Topics to query with; forwarded to the parent class.
        :param config: Configuration object; its ``query_type`` attribute is
            read here. Forwarded to the parent class.
        :param args: Extra positional arguments for the parent class.
        :param kwargs: Extra keyword arguments for the parent class.
        """
        super().__init__(topics, config, *args, **kwargs)

        # Text fields of the index. Every entry needs its own trailing comma:
        # adjacent string literals are silently concatenated by Python.
        self.mappings = [
            "BriefSummary_Text",
            "BriefTitle_Text",
            'DetailedDescription_Text',
            'Eligibility.Criteria.Textblock',
            'Eligibility.StudyPop.Textblock',
            'ConditionBrowse.MeshTerm',
            'InterventionBrowse.MeshTerm',
            'Condition',
            'Eligibility.Gender',
            "OfficialTitle",
        ]

        self.topics = topics
        self.config = config
        self.query_type = self.config.query_type

        # Embedding fields of the index.
        self.embed_mappings = ['BriefTitle_Embedding',
                               'BriefSummary_Embedding',
                               'DetailedDescription_Embedding']

        self.id_mapping = "docid"

        # Dispatch table from query type name to the generator method.
        self.query_funcs = {
            "query": self.generate_query,
            "embedding": self.generate_query_embedding,
        }
A generic Elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries. Requires topics and a config to be provided.
TrecClincialElasticsearchQuery(topics, config, *args, **kwargs)
def __init__(self, topics, config, *args, **kwargs):
    """
    Set up the field mappings used for clinical-trials queries.

    :param topics: Topics to query with; forwarded to the parent class.
    :param config: Configuration object; its ``query_type`` attribute is
        read here. Forwarded to the parent class.
    :param args: Extra positional arguments for the parent class.
    :param kwargs: Extra keyword arguments for the parent class.
    """
    super().__init__(topics, config, *args, **kwargs)

    # Text fields of the index. Every entry needs its own trailing comma:
    # adjacent string literals are silently concatenated by Python.
    self.mappings = [
        "BriefSummary_Text",
        "BriefTitle_Text",
        'DetailedDescription_Text',
        'Eligibility.Criteria.Textblock',
        'Eligibility.StudyPop.Textblock',
        'ConditionBrowse.MeshTerm',
        'InterventionBrowse.MeshTerm',
        'Condition',
        'Eligibility.Gender',
        "OfficialTitle",
    ]

    self.topics = topics
    self.config = config
    self.query_type = self.config.query_type

    # Embedding fields of the index.
    self.embed_mappings = ['BriefTitle_Embedding',
                           'BriefSummary_Embedding',
                           'DetailedDescription_Embedding']

    self.id_mapping = "docid"

    # Dispatch table from query type name to the generator method.
    self.query_funcs = {
        "query": self.generate_query,
        "embedding": self.generate_query_embedding,
    }