Coverage for transientNamer/astronotes.py : 11%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#!/usr/bin/env python
2# encoding: utf-8
3"""
4*Tools to download, parse and ingest astronote content into a MySQL database*
6:Author:
7 David Young
9:Date Created:
10 January 15, 2021
11"""
12from builtins import object
13import sys
14import re
15import os
16os.environ['TERM'] = 'vt100'
17from fundamentals import tools
18import requests
19import json
20from pprint import pprint
21from fundamentals.mysql import insert_list_of_dictionaries_into_database_tables
22from fundamentals.mysql import convert_dictionary_to_mysql_table
23from fundamentals.mysql import readquery
24import time
25import codecs
26from bs4 import BeautifulSoup
class astronotes(object):
    """
    *Tools to download, parse and ingest astronote content into a MySQL database*

    **Key Arguments:**

    - ``log`` -- logger
    - ``dbConn`` -- database connection. Default *False*
    - ``settings`` -- the settings dictionary

    **Usage:**

    To setup your logger, settings and database connections, please use the ``fundamentals`` package (`see tutorial here <http://fundamentals.readthedocs.io/en/latest/#tutorial>`_).

    To initiate a astronotes object, use the following:

    ```python
    from transientNamer import astronotes
    an = astronotes(
        log=log,
        dbConn=dbConn,
        settings=settings
    )
    ```
    """

    def __init__(
            self,
            log,
            dbConn=False,
            settings=False,
    ):
        self.log = log
        # FIX: corrected "instansiating" typo in the debug message
        log.debug("instantiating a new 'astronotes' object")
        self.settings = settings
        self.dbConn = dbConn

        return None
68 def download(
69 self,
70 cache_dir,
71 inLastDays=False):
72 """*Download astronotes reported in the lasy N days. Check cache for notes alreaedy downloaded.*
74 **Key Arguments:**
75 - `cache_dir` -- the directory to cache the json notes to.
76 - `inLastDays` -- download only notes reported in the last N days. Default *False*. (Download all)
78 **Return:**
79 - `downloadCount` -- number of new files cached
81 **Usage:**
83 ```python
84 from transientNamer import astronotes
85 an = astronotes(
86 log=log,
87 dbConn=dbConn,
88 settings=settings
89 )
90 downloadCount = an.download(
91 cache_dir=settings["astronote-cache"], inLastDays=30)
92 print(f"{downloadCount} new astronotes downloaded anc cached")
93 ```
94 """
95 self.log.debug('starting the ``download`` method')
97 paginationSets = 50
98 page = 0
99 allNotes = {}
100 noteCount = paginationSets + 1
101 if not inLastDays:
102 inLastDays = 30000
104 # PAGINATE THROUGH RESULTS UNTIL WE HIT THE END
105 while noteCount >= paginationSets:
106 try:
107 response = requests.get(
108 url="https://www.wis-tns.org/astronotes",
109 params={
110 "posted_period_value": inLastDays,
111 "posted_period_units": "days",
112 "num_page": paginationSets,
113 "page": page,
114 "format": "json"
115 },
116 headers={
117 'User-Agent': self.settings["user-agent"]
118 }
119 )
120 searchPage = response.content.decode("utf-8")
121 except requests.exceptions.RequestException:
122 print('HTTP Request failed')
123 page += 1
124 data = json.loads(searchPage)
125 noteCount = len(data)
126 if noteCount:
127 allNotes = dict(list(allNotes.items()) + list(data.items()))
129 # RECURSIVELY CREATE MISSING DIRECTORIES FOR CACHE
130 if not os.path.exists(self.settings["astronote-cache"]):
131 os.makedirs(self.settings["astronote-cache"])
133 # DOWNLOAD ONE NOTE PER FILE
134 downloadCount = 0
135 noteIds = []
136 for k, v in allNotes.items():
137 noteIds.append(k)
138 filepath = self.settings["astronote-cache"] + f"/{k}.json"
139 vJson = json.dumps(
140 v,
141 separators=(',', ': '),
142 sort_keys=True,
143 indent=4
144 )
145 if not os.path.exists(filepath):
146 myFile = open(filepath, 'w')
147 myFile.write(vJson)
148 myFile.close()
149 downloadCount += 1
151 # NOW DOWNLOAD REQUIRED HTML NOTES
152 for n in noteIds:
153 filepath = self.settings["astronote-cache"] + f"/{n}.html"
154 if not os.path.exists(filepath):
155 time.sleep(1)
156 try:
157 response = requests.get(
158 url=f"https://www.wis-tns.org/astronotes/astronote/{n}",
159 headers={
160 'User-Agent': self.settings["user-agent"]
161 }
163 )
164 noteContent = response.content.decode("utf-8")
165 except requests.exceptions.RequestException:
166 print('HTTP Request failed')
167 myFile = open(filepath, 'w')
168 myFile.write(noteContent)
169 myFile.close()
171 self.log.debug('completed the ``download`` method')
172 return downloadCount
174 def get_all_noteids(
175 self,
176 inLastDays=False):
177 """*get the noteids of those notes released in the last N days*
179 **Key Arguments:**
180 - ``inLastDays`` -- report only notesIds released in the last N days. Default *False*. (Report all)
182 **Return:**
183 - `noteIds` -- list of all reported noteIds
185 **Usage:**
187 ```python
188 from transientNamer import astronotes
189 an = astronotes(
190 log=log,
191 settings=settings
192 )
193 noteIds = an.get_all_noteid(inLastDays=3000)
194 print(f"Astronote IDs: {noteIds}")
195 ```
196 """
197 self.log.debug('starting the ``get_all_noteids`` method')
199 paginationSets = 100
200 page = 0
201 noteIds = []
202 noteCount = paginationSets + 1
203 if not inLastDays:
204 inLastDays = 30000
206 # PAGINATE THROUGH RESULTS UNTIL WE HIT THE END
207 while noteCount >= paginationSets:
208 try:
209 response = requests.get(
210 url="https://www.wis-tns.org/astronotes",
211 params={
212 "posted_period_value": inLastDays,
213 "posted_period_units": "days",
214 "num_page": paginationSets,
215 "page": page
216 },
217 headers={
218 'User-Agent': self.settings["user-agent"]
219 }
220 )
221 searchPage = response.content.decode("utf-8")
222 except requests.exceptions.RequestException:
223 print('HTTP Request failed')
224 page += 1
226 # GET NOTES WRAPPER
227 getpage_soup = BeautifulSoup(searchPage, 'html.parser')
228 noteswrapper = getpage_soup.find(
229 'div', {'id': 'notes-wrapper'})
231 # NOW GET INDIVIDUAL NOTES
232 notelinks = noteswrapper.findAll('a', {'class': 'note-link'})
233 noteCount = len(notelinks)
235 for link in notelinks:
236 noteId = link.attrs['href'].split("/astronote/")[-1]
237 noteIds.append(noteId)
239 self.log.debug('completed the ``get_all_noteids`` method')
240 return noteIds
242 def notes_to_database(
243 self):
244 """*read the notes and import them into indexed MySQL database tables*
246 **Usage:**
248 ```python
249 from transientNamer import astronotes
250 an = astronotes(
251 log=log,
252 dbConn=dbConn,
253 settings=settings
254 )
255 an.notes_to_database()
256 ```
257 """
258 self.log.debug('starting the ``notes_to_database`` method')
260 # CREATE THE DATABASE TABLES IF THEY DON'T EXIST
261 self._create_db_tables()
263 # WHICH ASTRONOTES ARE ALREADY IN THE DATABASE
264 sqlQuery = u"""
265 select astronote FROM astronotes_content;
266 """ % locals()
267 rows = readquery(
268 log=self.log,
269 sqlQuery=sqlQuery,
270 dbConn=self.dbConn
271 )
272 astronoteIds = []
273 astronoteIds[:] = [l["astronote"] for l in rows]
275 self._parse_json_to_database(skipAstronoteIds=astronoteIds)
276 self._parse_html_to_database(skipAstronoteIds=astronoteIds)
278 self.log.debug('completed the ``notes_to_database`` method')
279 return None
281 def _create_db_tables(
282 self):
283 """*create the astronote database tables if they don't yet exist*
285 **Usage:**
287 ```python
288 from transientNamer import astronotes
289 an = astronotes(
290 log=log,
291 dbConn=dbConn,
292 settings=settings
293 )
294 an._create_db_tables()
295 ```
296 """
297 self.log.debug('starting the ``_create_db_tables`` method')
299 from fundamentals.mysql import writequery
300 sqlQueries = []
301 sqlQueries.append(f"""CREATE TABLE IF NOT EXISTS `astronotes_content` (
302 `primaryId` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'An internal counter',
303 `dateCreated` datetime DEFAULT CURRENT_TIMESTAMP,
304 `dateLastModified` datetime DEFAULT CURRENT_TIMESTAMP,
305 `updated` tinyint(4) DEFAULT '0',
306 `abstract` text,
307 `astronote` varchar(30) NOT NULL,
308 `authors` text,
309 `public_timestamp` datetime DEFAULT NULL,
310 `source_group` varchar(100) DEFAULT NULL,
311 `title` text,
312 `type` varchar(100) DEFAULT NULL,
313 `html_parsed_flag` TINYINT NULL DEFAULT 0,
314 PRIMARY KEY (`primaryId`),
315 UNIQUE KEY `astronote` (`astronote`)
316 ) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8mb4;
317 """)
319 sqlQueries.append(f"""CREATE TABLE IF NOT EXISTS `astronotes_keywords` (
320 `primaryId` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'An internal counter',
321 `dateCreated` datetime DEFAULT CURRENT_TIMESTAMP,
322 `dateLastModified` datetime DEFAULT CURRENT_TIMESTAMP,
323 `updated` tinyint(4) DEFAULT '0',
324 `astronote` varchar(30) NOT NULL,
325 `keyword` varchar(30) NOT NULL,
326 PRIMARY KEY (`primaryId`),
327 UNIQUE KEY `astronote_keyword` (`astronote`,`keyword`)
328 ) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8mb4;
329 """)
331 sqlQueries.append(f"""CREATE TABLE IF NOT EXISTS `astronotes_transients` (
332 `primaryId` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'An internal counter',
333 `dateCreated` datetime DEFAULT CURRENT_TIMESTAMP,
334 `dateLastModified` datetime DEFAULT CURRENT_TIMESTAMP,
335 `updated` tinyint(4) DEFAULT '0',
336 `alt_name` varchar(100) DEFAULT NULL,
337 `astronote` varchar(30) NOT NULL,
338 `decdeg` double DEFAULT NULL,
339 `iauname` varchar(30) DEFAULT NULL,
340 `iauname_prefix` varchar(10) DEFAULT NULL,
341 `objtype` varchar(30) DEFAULT NULL,
342 `radeg` double DEFAULT NULL,
343 `redshift` double DEFAULT NULL,
344 `catalog` varchar(100) DEFAULT NULL,
345 `host_name` varchar(100) DEFAULT NULL,
346 `host_redshift` double DEFAULT NULL,
347 `phase` varchar(100) DEFAULT NULL,
348 `remarks` varchar(100) DEFAULT NULL,
349 `source` varchar(100) DEFAULT NULL,
350 `date_observed` varchar(100) DEFAULT NULL,
351 `fluxdensity_mu_jy` double DEFAULT NULL,
352 `mean_obs_freq_ghz` double DEFAULT NULL,
353 `time_observed_ut` varchar(100) DEFAULT NULL,
354 `uncertainty_mu_jy` double DEFAULT NULL,
355 PRIMARY KEY (`primaryId`),
356 UNIQUE KEY `astronote_iauname` (`astronote`,`iauname`),
357 UNIQUE KEY `astronote_alt_name` (`alt_name`,`astronote`)
358 ) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8mb4;
359 """)
361 for sqlQuery in sqlQueries:
362 writequery(
363 log=self.log,
364 sqlQuery=sqlQuery,
365 dbConn=self.dbConn
366 )
368 self.log.debug('completed the ``_create_db_tables`` method')
369 return None
371 def _parse_json_to_database(
372 self,
373 skipAstronoteIds):
374 """*parse the cached json files and add content to mysql database tables*
376 **Key Arguments:**
377 - `skipAstronoteIds` -- the astronote IDs already present in the database - do not reparse
378 """
379 self.log.debug('starting the ``_parse_json_to_database`` method')
381 # GENERATE A LIST OF FILE PATHS
382 jsonNotes = []
383 cache = self.settings["astronote-cache"]
384 for d in os.listdir(cache):
385 if d.split(".")[0] in skipAstronoteIds:
386 continue
387 filepath = os.path.join(cache, d)
388 if os.path.isfile(filepath) and os.path.splitext(filepath)[1] == ".json":
389 jsonNotes.append(filepath)
391 # NOW READ THE JSON FILES AND WRITE DICTIONARIES NEEDED FOR MYSQL
392 # TABLES
393 astronotes_content = []
394 astronotes_keywords = []
395 astronotes_transients = []
396 for file in jsonNotes:
397 with open(file) as data_file:
398 data = json.load(data_file)
399 for k, v in data.items():
400 if not len(v):
401 data[k] = None
402 for k in data['keywords']:
403 astronotes_keywords.append(
404 {"astronote": data['astronote'], "keyword": k})
405 del data['keywords']
406 if 'related_objects' in data and data['related_objects']:
407 for dict in data['related_objects']:
408 try:
409 del dict['ra']
410 del dict['dec']
411 except:
412 pass
413 dict['astronote'] = data['astronote']
414 for k, v in dict.items():
415 if not len(v):
416 dict[k] = None
418 astronotes_transients.append(dict)
419 del data['related_objects']
420 del data['related_astronotes']
422 astronotes_content.append(data)
424 print(f"{len(astronotes_transients)} transients")
426 # INSERT LIST OF
427 # DICTIONARIES INTO DATABASE
428 insert_list_of_dictionaries_into_database_tables(
429 dbConn=self.dbConn,
430 log=self.log,
431 dictList=astronotes_keywords,
432 dbTableName="astronotes_keywords",
433 uniqueKeyList=["astronote", "keyword"],
434 dateModified=True,
435 dateCreated=True,
436 batchSize=2500,
437 replace=True,
438 dbSettings=self.settings["database settings"]
439 )
441 insert_list_of_dictionaries_into_database_tables(
442 dbConn=self.dbConn,
443 log=self.log,
444 dictList=astronotes_transients,
445 dbTableName="astronotes_transients",
446 uniqueKeyList=["astronote", "iauname"],
447 dateModified=True,
448 dateCreated=True,
449 batchSize=2500,
450 replace=True,
451 dbSettings=self.settings["database settings"]
452 )
454 insert_list_of_dictionaries_into_database_tables(
455 dbConn=self.dbConn,
456 log=self.log,
457 dictList=astronotes_content,
458 dbTableName="astronotes_content",
459 uniqueKeyList=["astronote"],
460 dateModified=True,
461 dateCreated=True,
462 batchSize=2500,
463 replace=True,
464 dbSettings=self.settings["database settings"]
465 )
467 self.log.debug('completed the ``_parse_json_to_database`` method')
468 return None
470 def _parse_html_to_database(
471 self,
472 skipAstronoteIds):
473 """*parse the HTML versions of the astronotes to database tables*
475 **Key Arguments:**
476 - `skipAstronoteIds` -- the astronote IDs already present in the database - do not reparse
477 """
478 self.log.debug('starting the ``_parse_html_to_database`` method')
480 # GENERATE A LIST OF FILE PATHS
481 htmlNotes = []
482 noteIds = []
483 cache = self.settings["astronote-cache"]
484 for d in os.listdir(cache):
485 if d.split(".")[0] in skipAstronoteIds:
486 continue
487 filepath = os.path.join(cache, d)
488 if os.path.isfile(filepath) and os.path.splitext(filepath)[1] == ".html":
489 htmlNotes.append(filepath)
490 noteIds.append(d.split(".")[0])
492 for note, noteId in zip(htmlNotes, noteIds):
493 with codecs.open(note, encoding='utf-8', mode='r') as readFile:
494 content = readFile.read()
496 getpage_soup = BeautifulSoup(content, 'html.parser')
497 table = getpage_soup.find(
498 'table', {'class': 'objects-table'})
499 if not table:
500 continue
501 thead = table.find('thead')
503 # print(table)
504 remove = re.compile(r'[\(\)\.]')
505 underscore = re.compile(r'[ &;-]+')
506 headerKeys = []
507 headerKeys[:] = [th.string.lower()
508 for th in thead.findAll('th')]
509 # print(headerKeys)
510 headerKeys[:] = [remove.sub('', h) for h in headerKeys]
511 headerKeys[:] = [underscore.sub('_', h) for h in headerKeys]
512 headerKeys.append("alt_name")
513 headerKeys[:] = [
514 h if (h != "name") else "iauname" for h in headerKeys]
515 headerKeys[:] = [
516 h if (h != "phase_days") else "phase" for h in headerKeys]
517 nameIndex = headerKeys.index("iauname")
518 headerKeys.append("astronote")
520 results = []
521 for row in table.findAll('tr'):
522 # print(row)
523 cells = row.findAll('td')
524 cellValues = []
525 cellValues[:] = [l.string for l in cells]
526 if len(cellValues):
527 try:
528 cellValues.append(cellValues[nameIndex].split(" ")[
529 1].replace("[", "").replace("]", ""))
530 except:
531 cellValues.append(None)
532 cellValues[nameIndex] = cellValues[nameIndex].split()[
533 0]
534 cellValues.append(noteId)
535 if len(headerKeys) == len(cellValues):
536 transient = dict(zip(headerKeys, cellValues))
537 for h in headerKeys:
538 if "reported_" in h or "tns_" in h:
539 try:
540 del transient[h]
541 except:
542 pass
543 results.append(transient)
545 insert_list_of_dictionaries_into_database_tables(
546 dbConn=self.dbConn,
547 log=self.log,
548 dictList=results,
549 dbTableName="astronotes_transients",
550 uniqueKeyList=["astronote", "iauname"],
551 dateModified=True,
552 dateCreated=True,
553 batchSize=2500,
554 replace=True,
555 # dbSettings=self.settings["database settings"]
556 )
558 self.log.debug('completed the ``_parse_html_to_database`` method')
559 return None
561 # use the tab-trigger below for new method
562 # xt-class-method