Coverage for transientNamer/astronotes.py : 11%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#!/usr/bin/env python
2# encoding: utf-8
3"""
4*Tools to download, parse and ingest astronote content into a MySQL database*
6:Author:
7 David Young
9:Date Created:
10 January 15, 2021
11"""
12from builtins import object
13import sys
14import re
15import os
16os.environ['TERM'] = 'vt100'
17from fundamentals import tools
18import requests
19import json
20from pprint import pprint
21from fundamentals.mysql import insert_list_of_dictionaries_into_database_tables
22from fundamentals.mysql import convert_dictionary_to_mysql_table
23from fundamentals.mysql import readquery
24import time
25import codecs
26from bs4 import BeautifulSoup
class astronotes(object):
    """
    *Tools to download, parse and ingest astronote content into a MySQL database*

    **Key Arguments:**

    - ``log`` -- logger
    - ``dbConn`` -- database connection. Default *False*
    - ``settings`` -- the settings dictionary

    **Usage:**

    To setup your logger, settings and database connections, please use the ``fundamentals`` package (`see tutorial here <http://fundamentals.readthedocs.io/en/latest/#tutorial>`_).

    To initiate a astronotes object, use the following:

    ```python
    from transientNamer import astronotes
    an = astronotes(
        log=log,
        dbConn=dbConn,
        settings=settings
    )
    ```
    """

    def __init__(
            self,
            log,
            dbConn=False,
            settings=False,
    ):
        self.log = log
        # FIX: corrected "instansiating" typo in the debug message
        log.debug("instantiating a new 'astronotes' object")
        self.settings = settings
        self.dbConn = dbConn

        return None
68 def download(
69 self,
70 cache_dir,
71 inLastDays=False):
72 """*Download astronotes reported in the lasy N days. Check cache for notes alreaedy downloaded.*
74 **Key Arguments:**
75 - `cache_dir` -- the directory to cache the json notes to.
76 - `inLastDays` -- download only notes reported in the last N days. Default *False*. (Download all)
78 **Return:**
79 - `downloadCount` -- number of new files cached
81 **Usage:**
83 ```python
84 from transientNamer import astronotes
85 an = astronotes(
86 log=log,
87 dbConn=dbConn,
88 settings=settings
89 )
90 downloadCount = an.download(
91 cache_dir=settings["astronote-cache"], inLastDays=30)
92 print(f"{downloadCount} new astronotes downloaded anc cached")
93 ```
94 """
95 self.log.debug('starting the ``download`` method')
97 paginationSets = 50
98 page = 0
99 allNotes = {}
100 noteCount = paginationSets + 1
101 if not inLastDays:
102 inLastDays = 30000
104 # PAGINATE THROUGH RESULTS UNTIL WE HIT THE END
105 while noteCount >= paginationSets:
106 try:
107 response = requests.get(
108 url="https://www.wis-tns.org/astronotes",
109 params={
110 "posted_period_value": inLastDays,
111 "posted_period_units": "days",
112 "num_page": paginationSets,
113 "page": page,
114 "format": "json"
115 },
116 headers={
117 'User-Agent': self.settings["user-agent"]
118 }
119 )
120 searchPage = response.content.decode("utf-8")
121 except requests.exceptions.RequestException:
122 print('HTTP Request failed')
123 page += 1
124 data = json.loads(searchPage)
125 noteCount = len(data)
126 if noteCount:
127 allNotes = dict(list(allNotes.items()) + list(data.items()))
129 # RECURSIVELY CREATE MISSING DIRECTORIES FOR CACHE
130 if not os.path.exists(self.settings["astronote-cache"]):
131 os.makedirs(self.settings["astronote-cache"])
133 # DOWNLOAD ONE NOTE PER FILE
134 downloadCount = 0
135 noteIds = []
136 for k, v in allNotes.items():
137 noteIds.append(k)
138 filepath = self.settings["astronote-cache"] + f"/{k}.json"
139 vJson = json.dumps(
140 v,
141 separators=(',', ': '),
142 sort_keys=True,
143 indent=4
144 )
145 if not os.path.exists(filepath):
146 myFile = open(filepath, 'w')
147 myFile.write(vJson)
148 myFile.close()
149 downloadCount += 1
151 # NOW DOWNLOAD REQUIRED HTML NOTES
152 for n in noteIds:
153 filepath = self.settings["astronote-cache"] + f"/{n}.html"
154 if not os.path.exists(filepath):
155 time.sleep(1)
156 try:
157 response = requests.get(
158 url=f"https://www.wis-tns.org/astronotes/astronote/{n}",
159 headers={
160 'User-Agent': self.settings["user-agent"]
161 }
163 )
164 noteContent = response.content.decode("utf-8")
165 except requests.exceptions.RequestException:
166 print('HTTP Request failed')
167 myFile = open(filepath, 'w')
168 myFile.write(noteContent)
169 myFile.close()
171 self.log.debug('completed the ``download`` method')
172 return downloadCount
174 def get_all_noteids(
175 self,
176 inLastDays=False):
177 """*get the noteids of those notes released in the last N days*
179 **Key Arguments:**
180 - ``inLastDays`` -- report only notesIds released in the last N days. Default *False*. (Report all)
182 **Return:**
183 - `noteIds` -- list of all reported noteIds
185 **Usage:**
187 ```python
188 from transientNamer import astronotes
189 an = astronotes(
190 log=log,
191 settings=settings
192 )
193 noteIds = an.get_all_noteid(inLastDays=3000)
194 print(f"Astronote IDs: {noteIds}")
195 ```
196 """
197 self.log.debug('starting the ``get_all_noteids`` method')
199 paginationSets = 100
200 page = 0
201 noteIds = []
202 noteCount = paginationSets + 1
203 if not inLastDays:
204 inLastDays = 30000
206 # PAGINATE THROUGH RESULTS UNTIL WE HIT THE END
207 while noteCount >= paginationSets:
208 try:
209 response = requests.get(
210 url="https://www.wis-tns.org/astronotes",
211 params={
212 "posted_period_value": inLastDays,
213 "posted_period_units": "days",
214 "num_page": paginationSets,
215 "page": page
216 },
217 headers={
218 'User-Agent': self.settings["user-agent"]
219 }
220 )
221 searchPage = response.content.decode("utf-8")
222 except requests.exceptions.RequestException:
223 print('HTTP Request failed')
224 page += 1
226 # GET NOTES WRAPPER
227 getpage_soup = BeautifulSoup(searchPage, 'html.parser')
228 noteswrapper = getpage_soup.find(
229 'div', {'id': 'notes-wrapper'})
231 # NOW GET INDIVIDUAL NOTES
232 notelinks = noteswrapper.findAll('a', {'class': 'note-link'})
233 noteCount = len(notelinks)
235 for link in notelinks:
236 noteId = link.attrs['href'].split("/astronote/")[-1]
237 noteIds.append(noteId)
239 self.log.debug('completed the ``get_all_noteids`` method')
240 return noteIds
242 def notes_to_database(
243 self):
244 """*read the notes and import them into indexed MySQL database tables*
246 **Usage:**
248 ```python
249 from transientNamer import astronotes
250 an = astronotes(
251 log=log,
252 dbConn=dbConn,
253 settings=settings
254 )
255 an.notes_to_database()
256 ```
257 """
258 self.log.debug('starting the ``notes_to_database`` method')
260 # CREATE THE DATABASE TABLES IF THEY DON'T EXIST
261 self._create_db_tables()
263 # WHICH ASTRONOTES ARE ALREADY IN THE DATABASE
264 sqlQuery = u"""
265 select astronote FROM astronotes_content;
266 """ % locals()
267 rows = readquery(
268 log=self.log,
269 sqlQuery=sqlQuery,
270 dbConn=self.dbConn
271 )
272 astronoteIds = []
273 astronoteIds[:] = [l["astronote"] for l in rows]
275 self._parse_json_to_database(skipAstronoteIds=astronoteIds)
276 self._parse_html_to_database(skipAstronoteIds=astronoteIds)
278 self.log.debug('completed the ``notes_to_database`` method')
279 return None
281 def _create_db_tables(
282 self):
283 """*create the astronote database tables if they don't yet exist*
285 **Usage:**
287 ```python
288 from transientNamer import astronotes
289 an = astronotes(
290 log=log,
291 dbConn=dbConn,
292 settings=settings
293 )
294 an._create_db_tables()
295 ```
296 """
297 self.log.debug('starting the ``_create_db_tables`` method')
299 from fundamentals.mysql import writequery
300 sqlQueries = []
301 sqlQueries.append(f"""CREATE TABLE IF NOT EXISTS `astronotes_content` (
302 `primaryId` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'An internal counter',
303 `dateCreated` datetime DEFAULT CURRENT_TIMESTAMP,
304 `dateLastModified` datetime DEFAULT CURRENT_TIMESTAMP,
305 `updated` tinyint(4) DEFAULT '0',
306 `abstract` text,
307 `astronote` varchar(30) NOT NULL,
308 `authors` text,
309 `public_timestamp` datetime DEFAULT NULL,
310 `source_group` varchar(100) DEFAULT NULL,
311 `title` text,
312 `type` varchar(100) DEFAULT NULL,
313 `html_parsed_flag` TINYINT NULL DEFAULT 0,
314 PRIMARY KEY (`primaryId`),
315 UNIQUE KEY `astronote` (`astronote`)
316 ) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8mb4;
317 """)
319 sqlQueries.append(f"""CREATE TABLE IF NOT EXISTS `astronotes_keywords` (
320 `primaryId` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'An internal counter',
321 `dateCreated` datetime DEFAULT CURRENT_TIMESTAMP,
322 `dateLastModified` datetime DEFAULT CURRENT_TIMESTAMP,
323 `updated` tinyint(4) DEFAULT '0',
324 `astronote` varchar(30) NOT NULL,
325 `keyword` varchar(30) NOT NULL,
326 PRIMARY KEY (`primaryId`),
327 UNIQUE KEY `astronote_keyword` (`astronote`,`keyword`)
328 ) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8mb4;
329 """)
331 sqlQueries.append(f"""CREATE TABLE IF NOT EXISTS `astronotes_transients` (
332 `primaryId` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'An internal counter',
333 `dateCreated` datetime DEFAULT CURRENT_TIMESTAMP,
334 `dateLastModified` datetime DEFAULT CURRENT_TIMESTAMP,
335 `updated` tinyint(4) DEFAULT '0',
336 `alt_name` varchar(100) DEFAULT NULL,
337 `astronote` varchar(30) NOT NULL,
338 `decdeg` double DEFAULT NULL,
339 `iauname` varchar(30) DEFAULT NULL,
340 `iauname_prefix` varchar(10) DEFAULT NULL,
341 `objtype` varchar(30) DEFAULT NULL,
342 `radeg` double DEFAULT NULL,
343 `redshift` double DEFAULT NULL,
344 `catalog` varchar(100) DEFAULT NULL,
345 `host_name` varchar(100) DEFAULT NULL,
346 `host_redshift` double DEFAULT NULL,
347 `phase` varchar(100) DEFAULT NULL,
348 `remarks` varchar(100) DEFAULT NULL,
349 `source` varchar(100) DEFAULT NULL,
350 `date_observed` varchar(100) DEFAULT NULL,
351 `fluxdensity_mu_jy` double DEFAULT NULL,
352 `mean_obs_freq_ghz` double DEFAULT NULL,
353 `time_observed_ut` varchar(100) DEFAULT NULL,
354 `uncertainty_mu_jy` double DEFAULT NULL,
355 PRIMARY KEY (`primaryId`),
356 UNIQUE KEY `astronote_iauname` (`astronote`,`iauname`),
357 UNIQUE KEY `astronote_alt_name` (`alt_name`,`astronote`)
358 ) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8mb4;
359 """)
361 for sqlQuery in sqlQueries:
362 writequery(
363 log=self.log,
364 sqlQuery=sqlQuery,
365 dbConn=self.dbConn
366 )
368 self.log.debug('completed the ``_create_db_tables`` method')
369 return None
371 def _parse_json_to_database(
372 self,
373 skipAstronoteIds):
374 """*parse the cached json files and add content to mysql database tables*
376 **Key Arguments:**
377 - `skipAstronoteIds` -- the astronote IDs already present in the database - do not reparse
378 """
379 self.log.debug('starting the ``_parse_json_to_database`` method')
381 # GENERATE A LIST OF FILE PATHS
382 jsonNotes = []
383 cache = self.settings["astronote-cache"]
384 for d in os.listdir(cache):
385 if d.split(".")[0] in skipAstronoteIds:
386 continue
387 filepath = os.path.join(cache, d)
388 if os.path.isfile(filepath) and os.path.splitext(filepath)[1] == ".json":
389 jsonNotes.append(filepath)
391 # NOW READ THE JSON FILES AND WRITE DICTIONARIES NEEDED FOR MYSQL
392 # TABLES
393 astronotes_content = []
394 astronotes_keywords = []
395 astronotes_transients = []
396 for file in jsonNotes:
397 with open(file) as data_file:
398 data = json.load(data_file)
399 for k, v in data.items():
400 if not len(v):
401 data[k] = None
402 for k in data['keywords']:
403 astronotes_keywords.append(
404 {"astronote": data['astronote'], "keyword": k})
405 del data['keywords']
406 if 'related_objects' in data and data['related_objects']:
407 for dict in data['related_objects']:
408 try:
409 del dict['ra']
410 del dict['dec']
411 except:
412 pass
413 dict['astronote'] = data['astronote']
414 for k, v in dict.items():
415 if not len(v):
416 dict[k] = None
418 astronotes_transients.append(dict)
419 del data['related_objects']
420 del data['related_astronotes']
422 astronotes_content.append(data)
424 print(f"{len(astronotes_transients)} transients")
426 # INSERT LIST OF
427 # DICTIONARIES INTO DATABASE
428 insert_list_of_dictionaries_into_database_tables(
429 dbConn=self.dbConn,
430 log=self.log,
431 dictList=astronotes_keywords,
432 dbTableName="astronotes_keywords",
433 uniqueKeyList=["astronote", "keyword"],
434 dateModified=True,
435 dateCreated=True,
436 batchSize=2500,
437 replace=True,
438 dbSettings=self.settings["database settings"]
439 )
441 insert_list_of_dictionaries_into_database_tables(
442 dbConn=self.dbConn,
443 log=self.log,
444 dictList=astronotes_transients,
445 dbTableName="astronotes_transients",
446 uniqueKeyList=["astronote", "iauname"],
447 dateModified=True,
448 dateCreated=True,
449 batchSize=2500,
450 replace=True,
451 dbSettings=self.settings["database settings"]
452 )
454 insert_list_of_dictionaries_into_database_tables(
455 dbConn=self.dbConn,
456 log=self.log,
457 dictList=astronotes_content,
458 dbTableName="astronotes_content",
459 uniqueKeyList=["astronote"],
460 dateModified=True,
461 dateCreated=True,
462 batchSize=2500,
463 replace=True,
464 dbSettings=self.settings["database settings"]
465 )
467 self.log.debug('completed the ``_parse_json_to_database`` method')
468 return None
470 def _parse_html_to_database(
471 self,
472 skipAstronoteIds):
473 """*parse the HTML versions of the astronotes to database tables*
475 **Key Arguments:**
476 - `skipAstronoteIds` -- the astronote IDs already present in the database - do not reparse
477 """
478 self.log.debug('starting the ``_parse_html_to_database`` method')
480 # GENERATE A LIST OF FILE PATHS
481 htmlNotes = []
482 noteIds = []
483 cache = self.settings["astronote-cache"]
484 for d in os.listdir(cache):
485 if d.split(".")[0] in skipAstronoteIds:
486 continue
487 filepath = os.path.join(cache, d)
488 if os.path.isfile(filepath) and os.path.splitext(filepath)[1] == ".html":
489 htmlNotes.append(filepath)
490 noteIds.append(d.split(".")[0])
492 for note, noteId in zip(htmlNotes, noteIds):
493 with codecs.open(note, encoding='utf-8', mode='r') as readFile:
494 content = readFile.read()
496 getpage_soup = BeautifulSoup(content, 'html.parser')
497 table = getpage_soup.find(
498 'table', {'class': 'objects-table'})
499 if not table:
500 continue
501 thead = table.find('thead')
503 # print(table)
504 remove = re.compile(r'[\(\)\.]')
505 underscore = re.compile(r'[ &;-]+')
506 headerKeys = []
507 headerKeys[:] = [th.string.lower()
508 for th in thead.findAll('th')]
509 # print(headerKeys)
510 headerKeys[:] = [remove.sub('', h) for h in headerKeys]
511 headerKeys[:] = [underscore.sub('_', h) for h in headerKeys]
512 headerKeys.append("alt_name")
513 headerKeys[:] = [
514 h if (h != "name") else "iauname" for h in headerKeys]
515 headerKeys[:] = [
516 h if (h != "phase_days") else "phase" for h in headerKeys]
517 nameIndex = headerKeys.index("iauname")
518 headerKeys.append("astronote")
520 results = []
521 for row in table.findAll('tr'):
522 # print(row)
523 cells = row.findAll('td')
524 cellValues = []
525 cellValues[:] = [l.string for l in cells]
526 if len(cellValues):
527 try:
528 cellValues.append(cellValues[nameIndex].split(" ")[
529 1].replace("[", "").replace("]", ""))
530 except:
531 cellValues.append(None)
532 cellValues[nameIndex] = cellValues[nameIndex].split()[
533 0]
534 cellValues.append(noteId)
535 if len(headerKeys) == len(cellValues):
536 transient = dict(zip(headerKeys, cellValues))
537 for h in headerKeys:
538 if "reported_" in h or "tns_" in h:
539 try:
540 del transient[h]
541 except:
542 pass
543 results.append(transient)
545 insert_list_of_dictionaries_into_database_tables(
546 dbConn=self.dbConn,
547 log=self.log,
548 dictList=results,
549 dbTableName="astronotes_transients",
550 uniqueKeyList=["astronote", "iauname"],
551 dateModified=True,
552 dateCreated=True,
553 batchSize=2500,
554 replace=True,
555 # dbSettings=self.settings["database settings"]
556 )
558 self.log.debug('completed the ``_parse_html_to_database`` method')
559 return None
561 # use the tab-trigger below for new method
562 # xt-class-method