Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1#!/usr/bin/env python 

2# encoding: utf-8 

3""" 

4*Tools to download, parse and ingest astronote content into a MySQL database* 

5 

6:Author: 

7 David Young 

8 

9:Date Created: 

10 January 15, 2021 

11""" 

12from builtins import object 

13import sys 

14import re 

15import os 

16os.environ['TERM'] = 'vt100' 

17from fundamentals import tools 

18import requests 

19import json 

20from pprint import pprint 

21from fundamentals.mysql import insert_list_of_dictionaries_into_database_tables 

22from fundamentals.mysql import convert_dictionary_to_mysql_table 

23from fundamentals.mysql import readquery 

24import time 

25import codecs 

26from bs4 import BeautifulSoup 

27 

28 

class astronotes(object):
    """
    *Tools to download, parse and ingest astronote content into a MySQL database*

    **Key Arguments:**
        - ``log`` -- logger
        - ``dbConn`` -- database connection. Default *False*
        - ``settings`` -- the settings dictionary

    **Usage:**

    To setup your logger, settings and database connections, please use the ``fundamentals`` package (`see tutorial here <http://fundamentals.readthedocs.io/en/latest/#tutorial>`_).

    To initiate an astronotes object, use the following:

    ```python
    from transientNamer import astronotes
    an = astronotes(
        log=log,
        dbConn=dbConn,
        settings=settings
    )
    ```
    """

    def __init__(
            self,
            log,
            dbConn=False,
            settings=False,
    ):
        self.log = log
        log.debug("instantiating a new 'astronotes' object")
        self.settings = settings
        self.dbConn = dbConn

        return None

    def download(
            self,
            cache_dir,
            inLastDays=False):
        """*Download astronotes reported in the last N days. Check the cache for notes already downloaded.*

        **Key Arguments:**
            - `cache_dir` -- the directory to cache the json notes to.
            - `inLastDays` -- download only notes reported in the last N days. Default *False*. (Download all)

        **Return:**
            - `downloadCount` -- number of new files cached

        **Usage:**

        ```python
        from transientNamer import astronotes
        an = astronotes(
            log=log,
            dbConn=dbConn,
            settings=settings
        )
        downloadCount = an.download(
            cache_dir=settings["astronote-cache"], inLastDays=30)
        print(f"{downloadCount} new astronotes downloaded and cached")
        ```
        """
        self.log.debug('starting the ``download`` method')

        paginationSets = 50
        page = 0
        allNotes = {}
        noteCount = paginationSets + 1
        if not inLastDays:
            # AN EFFECTIVELY UNBOUNDED WINDOW -- DOWNLOAD EVERYTHING
            inLastDays = 30000

        # PAGINATE THROUGH RESULTS UNTIL WE HIT THE END
        while noteCount >= paginationSets:
            try:
                response = requests.get(
                    url="https://www.wis-tns.org/astronotes",
                    params={
                        "posted_period_value": inLastDays,
                        "posted_period_units": "days",
                        "num_page": paginationSets,
                        "page": page,
                        "format": "json"
                    },
                    headers={
                        'User-Agent': self.settings["user-agent"]
                    }
                )
                searchPage = response.content.decode("utf-8")
            except requests.exceptions.RequestException:
                # FIX: previously a failed request fell through to
                # json.loads with `searchPage` unbound (or stale). Stop
                # paginating instead.
                print('HTTP Request failed')
                break
            page += 1
            data = json.loads(searchPage)
            noteCount = len(data)
            if noteCount:
                # MERGE THIS PAGE OF NOTES INTO THE MASTER DICTIONARY
                allNotes.update(data)

        # RECURSIVELY CREATE MISSING DIRECTORIES FOR CACHE
        # FIX: honour the `cache_dir` argument (it was previously ignored
        # in favour of self.settings["astronote-cache"])
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)

        # DOWNLOAD ONE NOTE PER FILE
        downloadCount = 0
        noteIds = []
        for k, v in allNotes.items():
            noteIds.append(k)
            filepath = cache_dir + f"/{k}.json"
            vJson = json.dumps(
                v,
                separators=(',', ': '),
                sort_keys=True,
                indent=4
            )
            if not os.path.exists(filepath):
                with open(filepath, 'w') as myFile:
                    myFile.write(vJson)
                downloadCount += 1

        # NOW DOWNLOAD REQUIRED HTML NOTES
        for n in noteIds:
            filepath = cache_dir + f"/{n}.html"
            if not os.path.exists(filepath):
                # BE POLITE TO THE TNS SERVER
                time.sleep(1)
                try:
                    response = requests.get(
                        url=f"https://www.wis-tns.org/astronotes/astronote/{n}",
                        headers={
                            'User-Agent': self.settings["user-agent"]
                        }
                    )
                    noteContent = response.content.decode("utf-8")
                except requests.exceptions.RequestException:
                    # FIX: skip this note rather than writing an unbound
                    # (or stale) `noteContent` to the cache file
                    print('HTTP Request failed')
                    continue
                with open(filepath, 'w') as myFile:
                    myFile.write(noteContent)

        self.log.debug('completed the ``download`` method')
        return downloadCount

    def get_all_noteids(
            self,
            inLastDays=False):
        """*get the noteids of those notes released in the last N days*

        **Key Arguments:**
            - ``inLastDays`` -- report only noteIds released in the last N days. Default *False*. (Report all)

        **Return:**
            - `noteIds` -- list of all reported noteIds

        **Usage:**

        ```python
        from transientNamer import astronotes
        an = astronotes(
            log=log,
            settings=settings
        )
        noteIds = an.get_all_noteids(inLastDays=3000)
        print(f"Astronote IDs: {noteIds}")
        ```
        """
        self.log.debug('starting the ``get_all_noteids`` method')

        paginationSets = 100
        page = 0
        noteIds = []
        noteCount = paginationSets + 1
        if not inLastDays:
            # AN EFFECTIVELY UNBOUNDED WINDOW -- REPORT EVERYTHING
            inLastDays = 30000

        # PAGINATE THROUGH RESULTS UNTIL WE HIT THE END
        while noteCount >= paginationSets:
            try:
                response = requests.get(
                    url="https://www.wis-tns.org/astronotes",
                    params={
                        "posted_period_value": inLastDays,
                        "posted_period_units": "days",
                        "num_page": paginationSets,
                        "page": page
                    },
                    headers={
                        'User-Agent': self.settings["user-agent"]
                    }
                )
                searchPage = response.content.decode("utf-8")
            except requests.exceptions.RequestException:
                # FIX: previously a failed request fell through to
                # BeautifulSoup with `searchPage` unbound (or stale).
                print('HTTP Request failed')
                break
            page += 1

            # GET NOTES WRAPPER
            getpage_soup = BeautifulSoup(searchPage, 'html.parser')
            noteswrapper = getpage_soup.find(
                'div', {'id': 'notes-wrapper'})

            # NOW GET INDIVIDUAL NOTES
            notelinks = noteswrapper.findAll('a', {'class': 'note-link'})
            noteCount = len(notelinks)

            for link in notelinks:
                # NOTE ID IS THE FINAL PATH COMPONENT OF THE HREF
                noteId = link.attrs['href'].split("/astronote/")[-1]
                noteIds.append(noteId)

        self.log.debug('completed the ``get_all_noteids`` method')
        return noteIds

    def notes_to_database(
            self):
        """*read the notes and import them into indexed MySQL database tables*

        **Usage:**

        ```python
        from transientNamer import astronotes
        an = astronotes(
            log=log,
            dbConn=dbConn,
            settings=settings
        )
        an.notes_to_database()
        ```
        """
        self.log.debug('starting the ``notes_to_database`` method')

        # CREATE THE DATABASE TABLES IF THEY DON'T EXIST
        self._create_db_tables()

        # WHICH ASTRONOTES ARE ALREADY IN THE DATABASE
        # FIX: dropped the no-op (and latently dangerous) `% locals()`
        sqlQuery = u"""
            select astronote FROM astronotes_content;
        """
        rows = readquery(
            log=self.log,
            sqlQuery=sqlQuery,
            dbConn=self.dbConn
        )
        astronoteIds = [row["astronote"] for row in rows]

        self._parse_json_to_database(skipAstronoteIds=astronoteIds)
        self._parse_html_to_database(skipAstronoteIds=astronoteIds)

        self.log.debug('completed the ``notes_to_database`` method')
        return None

    def _create_db_tables(
            self):
        """*create the astronote database tables if they don't yet exist*

        **Usage:**

        ```python
        from transientNamer import astronotes
        an = astronotes(
            log=log,
            dbConn=dbConn,
            settings=settings
        )
        an._create_db_tables()
        ```
        """
        self.log.debug('starting the ``_create_db_tables`` method')

        from fundamentals.mysql import writequery
        sqlQueries = []
        sqlQueries.append("""CREATE TABLE IF NOT EXISTS `astronotes_content` (
            `primaryId` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'An internal counter',
            `dateCreated` datetime DEFAULT CURRENT_TIMESTAMP,
            `dateLastModified` datetime DEFAULT CURRENT_TIMESTAMP,
            `updated` tinyint(4) DEFAULT '0',
            `abstract` text,
            `astronote` varchar(30) NOT NULL,
            `authors` text,
            `public_timestamp` datetime DEFAULT NULL,
            `source_group` varchar(100) DEFAULT NULL,
            `title` text,
            `type` varchar(100) DEFAULT NULL,
            `html_parsed_flag` TINYINT NULL DEFAULT 0,
            PRIMARY KEY (`primaryId`),
            UNIQUE KEY `astronote` (`astronote`)
        ) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8mb4;
        """)

        sqlQueries.append("""CREATE TABLE IF NOT EXISTS `astronotes_keywords` (
            `primaryId` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'An internal counter',
            `dateCreated` datetime DEFAULT CURRENT_TIMESTAMP,
            `dateLastModified` datetime DEFAULT CURRENT_TIMESTAMP,
            `updated` tinyint(4) DEFAULT '0',
            `astronote` varchar(30) NOT NULL,
            `keyword` varchar(30) NOT NULL,
            PRIMARY KEY (`primaryId`),
            UNIQUE KEY `astronote_keyword` (`astronote`,`keyword`)
        ) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8mb4;
        """)

        sqlQueries.append("""CREATE TABLE IF NOT EXISTS `astronotes_transients` (
            `primaryId` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'An internal counter',
            `dateCreated` datetime DEFAULT CURRENT_TIMESTAMP,
            `dateLastModified` datetime DEFAULT CURRENT_TIMESTAMP,
            `updated` tinyint(4) DEFAULT '0',
            `alt_name` varchar(100) DEFAULT NULL,
            `astronote` varchar(30) NOT NULL,
            `decdeg` double DEFAULT NULL,
            `iauname` varchar(30) DEFAULT NULL,
            `iauname_prefix` varchar(10) DEFAULT NULL,
            `objtype` varchar(30) DEFAULT NULL,
            `radeg` double DEFAULT NULL,
            `redshift` double DEFAULT NULL,
            `catalog` varchar(100) DEFAULT NULL,
            `host_name` varchar(100) DEFAULT NULL,
            `host_redshift` double DEFAULT NULL,
            `phase` varchar(100) DEFAULT NULL,
            `remarks` varchar(100) DEFAULT NULL,
            `source` varchar(100) DEFAULT NULL,
            `date_observed` varchar(100) DEFAULT NULL,
            `fluxdensity_mu_jy` double DEFAULT NULL,
            `mean_obs_freq_ghz` double DEFAULT NULL,
            `time_observed_ut` varchar(100) DEFAULT NULL,
            `uncertainty_mu_jy` double DEFAULT NULL,
            PRIMARY KEY (`primaryId`),
            UNIQUE KEY `astronote_iauname` (`astronote`,`iauname`),
            UNIQUE KEY `astronote_alt_name` (`alt_name`,`astronote`)
        ) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8mb4;
        """)

        for sqlQuery in sqlQueries:
            writequery(
                log=self.log,
                sqlQuery=sqlQuery,
                dbConn=self.dbConn
            )

        self.log.debug('completed the ``_create_db_tables`` method')
        return None

    def _parse_json_to_database(
            self,
            skipAstronoteIds):
        """*parse the cached json files and add content to mysql database tables*

        **Key Arguments:**
            - `skipAstronoteIds` -- the astronote IDs already present in the database - do not reparse
        """
        self.log.debug('starting the ``_parse_json_to_database`` method')

        # GENERATE A LIST OF FILE PATHS
        jsonNotes = []
        cache = self.settings["astronote-cache"]
        for d in os.listdir(cache):
            if d.split(".")[0] in skipAstronoteIds:
                continue
            filepath = os.path.join(cache, d)
            if os.path.isfile(filepath) and os.path.splitext(filepath)[1] == ".json":
                jsonNotes.append(filepath)

        # NOW READ THE JSON FILES AND WRITE DICTIONARIES NEEDED FOR MYSQL
        # TABLES
        astronotes_content = []
        astronotes_keywords = []
        astronotes_transients = []
        for jsonFile in jsonNotes:
            with open(jsonFile) as data_file:
                data = json.load(data_file)
            # EMPTY STRINGS/LISTS BECOME NULLS IN THE DATABASE
            for k, v in data.items():
                if not len(v):
                    data[k] = None
            for k in data['keywords']:
                astronotes_keywords.append(
                    {"astronote": data['astronote'], "keyword": k})
            del data['keywords']
            if 'related_objects' in data and data['related_objects']:
                # FIX: loop variable renamed -- it previously shadowed the
                # builtin `dict`
                for related in data['related_objects']:
                    try:
                        # SEXAGESIMAL COORDINATES DROPPED; decdeg/radeg KEPT
                        del related['ra']
                        del related['dec']
                    except KeyError:
                        pass
                    related['astronote'] = data['astronote']
                    for k, v in related.items():
                        if not len(v):
                            related[k] = None

                    astronotes_transients.append(related)
            del data['related_objects']
            del data['related_astronotes']

            astronotes_content.append(data)

        print(f"{len(astronotes_transients)} transients")

        # INSERT LIST OF
        # DICTIONARIES INTO DATABASE
        insert_list_of_dictionaries_into_database_tables(
            dbConn=self.dbConn,
            log=self.log,
            dictList=astronotes_keywords,
            dbTableName="astronotes_keywords",
            uniqueKeyList=["astronote", "keyword"],
            dateModified=True,
            dateCreated=True,
            batchSize=2500,
            replace=True,
            dbSettings=self.settings["database settings"]
        )

        insert_list_of_dictionaries_into_database_tables(
            dbConn=self.dbConn,
            log=self.log,
            dictList=astronotes_transients,
            dbTableName="astronotes_transients",
            uniqueKeyList=["astronote", "iauname"],
            dateModified=True,
            dateCreated=True,
            batchSize=2500,
            replace=True,
            dbSettings=self.settings["database settings"]
        )

        insert_list_of_dictionaries_into_database_tables(
            dbConn=self.dbConn,
            log=self.log,
            dictList=astronotes_content,
            dbTableName="astronotes_content",
            uniqueKeyList=["astronote"],
            dateModified=True,
            dateCreated=True,
            batchSize=2500,
            replace=True,
            dbSettings=self.settings["database settings"]
        )

        self.log.debug('completed the ``_parse_json_to_database`` method')
        return None

    def _parse_html_to_database(
            self,
            skipAstronoteIds):
        """*parse the HTML versions of the astronotes to database tables*

        **Key Arguments:**
            - `skipAstronoteIds` -- the astronote IDs already present in the database - do not reparse
        """
        self.log.debug('starting the ``_parse_html_to_database`` method')

        # GENERATE A LIST OF FILE PATHS
        htmlNotes = []
        noteIds = []
        cache = self.settings["astronote-cache"]
        for d in os.listdir(cache):
            if d.split(".")[0] in skipAstronoteIds:
                continue
            filepath = os.path.join(cache, d)
            if os.path.isfile(filepath) and os.path.splitext(filepath)[1] == ".html":
                htmlNotes.append(filepath)
                noteIds.append(d.split(".")[0])

        for note, noteId in zip(htmlNotes, noteIds):
            with codecs.open(note, encoding='utf-8', mode='r') as readFile:
                content = readFile.read()

            getpage_soup = BeautifulSoup(content, 'html.parser')
            table = getpage_soup.find(
                'table', {'class': 'objects-table'})
            if not table:
                # NOT ALL NOTES CONTAIN AN OBJECTS TABLE
                continue
            thead = table.find('thead')

            # NORMALISE HEADERS TO DATABASE COLUMN NAMES
            remove = re.compile(r'[\(\)\.]')
            underscore = re.compile(r'[ &;-]+')
            headerKeys = [th.string.lower()
                          for th in thead.findAll('th')]
            headerKeys[:] = [remove.sub('', h) for h in headerKeys]
            headerKeys[:] = [underscore.sub('_', h) for h in headerKeys]
            headerKeys.append("alt_name")
            headerKeys[:] = [
                h if (h != "name") else "iauname" for h in headerKeys]
            headerKeys[:] = [
                h if (h != "phase_days") else "phase" for h in headerKeys]
            nameIndex = headerKeys.index("iauname")
            headerKeys.append("astronote")

            results = []
            for row in table.findAll('tr'):
                cells = row.findAll('td')
                cellValues = [cell.string for cell in cells]
                if len(cellValues):
                    # THE NAME CELL IS e.g. "2021A [AT2021a]" -- THE BRACKETED
                    # PART IS THE ALT NAME
                    try:
                        cellValues.append(cellValues[nameIndex].split(" ")[
                            1].replace("[", "").replace("]", ""))
                    except (AttributeError, IndexError):
                        # NO ALT NAME PRESENT
                        cellValues.append(None)
                    cellValues[nameIndex] = cellValues[nameIndex].split()[
                        0]
                    cellValues.append(noteId)
                    if len(headerKeys) == len(cellValues):
                        transient = dict(zip(headerKeys, cellValues))
                        # REPORTED/TNS COLUMNS ARE NOT IN THE DATABASE SCHEMA
                        for h in headerKeys:
                            if "reported_" in h or "tns_" in h:
                                try:
                                    del transient[h]
                                except KeyError:
                                    pass
                        results.append(transient)

            insert_list_of_dictionaries_into_database_tables(
                dbConn=self.dbConn,
                log=self.log,
                dictList=results,
                dbTableName="astronotes_transients",
                uniqueKeyList=["astronote", "iauname"],
                dateModified=True,
                dateCreated=True,
                batchSize=2500,
                replace=True,
                # dbSettings=self.settings["database settings"]
            )

        self.log.debug('completed the ``_parse_html_to_database`` method')
        return None

560 

561 # use the tab-trigger below for new method 

562 # xt-class-method