Coverage for sherlock/imports/ned.py: 90%
155 statements
« prev ^ index » next coverage.py v7.2.2, created at 2023-10-10 13:58 +0000
1#!/usr/local/bin/python
2# encoding: utf-8
3"""
4*Import ned stream into sherlock-catalogues database*
6:Author:
7 David Young
8"""
9from __future__ import print_function, division
10from ._base_importer import _base_importer
11from fundamentals.mysql import directory_script_runner, readquery, writequery
12from fundamentals.renderer import list_of_dictionaries
13from astrocalc.coords import unit_conversion
14from fundamentals.mysql import insert_list_of_dictionaries_into_database_tables
15from HMpTy.mysql import add_htm_ids_to_mysql_database_table
16from neddy import namesearch, conesearch
17from docopt import docopt
18from datetime import datetime, date, time
19import re
20import string
21import codecs
22import pickle
23import glob
24import readline
26from past.utils import old_div
27import sys
28import os
29os.environ['TERM'] = 'vt100'
class ned(_base_importer):
    """
    *Using a list of coordinates, query the online* `NED <https://ned.ipac.caltech.edu/>`_ *database and import sources found within a given search radius of each of the locations into the sherlock-catalogues database*

    The code:

        1. Uses the list of transient coordinates and queries NED (conesearch) for the results within the given search radius
        2. Creates the `tcs_cat_ned_stream` table if it doesn't exist
        3. Adds the resulting matched NED IDs/Names to the `tcs_cat_ned_stream` table
        4. Updates the NED query history table
        5. Queries NED via NED IDs (object search) for the remaining source metadata to be added to the `tcs_cat_ned_stream` table

    Note it's up to the user to filter the input coordinate list by checking whether or not the same area of the sky has been imported into the `tcs_cat_ned_stream` table recently (by checking the `tcs_helper_ned_query_history` table)

    **Key Arguments**

    - ``dbConn`` -- mysql database connection
    - ``log`` -- logger
    - ``settings`` -- the settings dictionary
    - ``coordinateList`` -- list of coordinates (a list of strings with RA and DEC space separated)
    - ``radiusArcsec`` -- the radius in arcsec with which to perform the initial NED conesearch. Default *False*

    **Usage**

    To import the ned catalogue stream, run the following:

    ```python
    from sherlock.imports import ned
    stream = ned(
        log=log,
        settings=settings,
        coordinateList=["23.12323 -12.34343","345.43234 45.26789"],
        radiusArcsec=180
    )
    stream.ingest()
    ```

    .. todo ::

        - test this code is still working after changes
        - add option to filter coordinate list via the `tcs_helper_ned_query_history` table
        - check sublime snippet exists
        - clip any useful text to docs mindmap
    """
    # INITIALISATION
81 def ingest(self):
82 """*Perform conesearches of the online NED database and import the results into a the sherlock-database*
84 The code:
86 1. uses the list of transient coordinates and queries NED for the results within the given search radius
87 2. Creates the `tcs_cat_ned_stream` table if it doesn't exist
88 3. Adds the resulting NED IDs/Names to the `tcs_cat_ned_stream` table
89 4. Updates the NED query history table
90 5. Queris NED via NED IDs for the remaining source metadata to be added to the `tcs_cat_ned_stream` table
92 **Usage**
94 Having setup the NED object with a coordinate list and cone-search radius, run the `ingest()` method
96 ```python
97 stream.ingest()
98 ```
101 .. todo ::
103 - check sublime snippet exists
104 - clip any useful text to docs mindmap
105 - regenerate the docs and check redendering of this docstring
106 """
107 self.log.debug('starting the ``ingest`` method')
109 if not self.radiusArcsec:
110 self.log.error(
111 'please give a radius in arcsec with which to preform the initial NED conesearch' % locals())
112 sys.exit(0)
114 # VARIABLES
115 # SIZE OF NUMBER OF ROWS TO INSERT INTO DATABASE TABLE AT ANY ONE GO
116 self.databaseInsertbatchSize = 10000
118 # THE DATABASE TABLE TO STREAM THE NED DATA INTO
119 self.dbTableName = "tcs_cat_ned_stream"
121 dictList = self._create_dictionary_of_ned()
123 tableName = self.dbTableName
125 createStatement = """CREATE TABLE IF NOT EXISTS `%(tableName)s` (
126 `primaryId` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'An internal counter',
127 `ned_name` varchar(150) NOT NULL,
128 `redshift` double DEFAULT NULL,
129 `dateCreated` datetime DEFAULT CURRENT_TIMESTAMP,
130 `dateLastModified` datetime DEFAULT CURRENT_TIMESTAMP,
131 `updated` varchar(45) DEFAULT '0',
132 `major_diameter_arcmin` double DEFAULT NULL,
133 `ned_notes` varchar(700) DEFAULT NULL,
134 `object_type` varchar(100) DEFAULT NULL,
135 `redshift_err` double DEFAULT NULL,
136 `redshift_quality` varchar(100) DEFAULT NULL,
137 `magnitude_filter` varchar(10) DEFAULT NULL,
138 `minor_diameter_arcmin` double DEFAULT NULL,
139 `morphology` varchar(50) DEFAULT NULL,
140 `hierarchy` varchar(50) DEFAULT NULL,
141 `galaxy_morphology` varchar(50) DEFAULT NULL,
142 `radio_morphology` varchar(50) DEFAULT NULL,
143 `activity_type` varchar(50) DEFAULT NULL,
144 `raDeg` double DEFAULT NULL,
145 `decDeg` double DEFAULT NULL,
146 `eb_v` double DEFAULT NULL,
147 `htm16ID` bigint(20) DEFAULT NULL,
148 `download_error` tinyint(1) DEFAULT '0',
149 `htm10ID` bigint(20) DEFAULT NULL,
150 `htm13ID` bigint(20) DEFAULT NULL,
151 PRIMARY KEY (`primaryId`),
152 UNIQUE KEY `ned_name` (`ned_name`),
153 KEY `idx_htm16ID` (`htm16ID`),
154 KEY `raDeg` (`raDeg`),
155 KEY `downloadError` (`download_error`),
156 KEY `idx_htm10ID` (`htm10ID`),
157 KEY `idx_htm13ID` (`htm13ID`)
158) ENGINE=MyISAM AUTO_INCREMENT=0 DEFAULT CHARSET=latin1;
159""" % locals()
161 self.add_data_to_database_table(
162 dictList=dictList,
163 createStatement=createStatement
164 )
166 self._update_ned_query_history()
167 self._download_ned_source_metadata()
169 self.log.debug('completed the ``ingest`` method')
170 return None
172 def _create_dictionary_of_ned(
173 self):
174 """*Create a list of dictionaries containing all the object ids (NED names) in the ned stream*
176 **Return**
178 - ``dictList`` - a list of dictionaries containing all the object ids (NED names) in the ned stream
181 **Usage**
183 ```python
184 dictList = stream._create_dictionary_of_ned()
185 ```
187 """
188 self.log.debug(
189 'starting the ``_create_dictionary_of_ned`` method')
191 # GET THE NAMES (UNIQUE IDS) OF THE SOURCES WITHIN THE CONESEARCH FROM
192 # NED
193 names, searchParams = conesearch(
194 log=self.log,
195 radiusArcsec=self.radiusArcsec,
196 nearestOnly=False,
197 unclassified=True,
198 quiet=False,
199 listOfCoordinates=self.coordinateList,
200 outputFilePath=False,
201 verbose=False
202 ).get_crossmatch_names()
204 dictList = []
205 dictList[:] = [{"ned_name": n} for n in names]
207 self.log.debug(
208 'completed the ``_create_dictionary_of_ned`` method')
209 return dictList
211 def _update_ned_query_history(
212 self):
213 """*Update the database helper table to give details of the ned cone searches performed*
215 *Usage:*
217 ```python
218 stream._update_ned_query_history()
219 ```
220 """
221 self.log.debug('starting the ``_update_ned_query_history`` method')
223 myPid = self.myPid
225 # ASTROCALC UNIT CONVERTER OBJECT
226 converter = unit_conversion(
227 log=self.log
228 )
230 # UPDATE THE DATABASE HELPER TABLE TO GIVE DETAILS OF THE NED CONE
231 # SEARCHES PERFORMED
232 dataList = []
233 for i, coord in enumerate(self.coordinateList):
234 if isinstance(coord, ("".__class__, u"".__class__)):
235 ra = coord.split(" ")[0]
236 dec = coord.split(" ")[1]
237 elif isinstance(coord, tuple) or isinstance(coord, list):
238 ra = coord[0]
239 dec = coord[1]
241 dataList.append(
242 {"raDeg": ra,
243 "decDeg": dec,
244 "arcsecRadius": self.radiusArcsec}
245 )
247 if len(dataList) == 0:
248 return None
250 # CREATE TABLE IF NOT EXIST
251 createStatement = """CREATE TABLE IF NOT EXISTS `tcs_helper_ned_query_history` (
252 `primaryId` bigint(20) NOT NULL AUTO_INCREMENT,
253 `raDeg` double DEFAULT NULL,
254 `decDeg` double DEFAULT NULL,
255 `dateCreated` datetime DEFAULT CURRENT_TIMESTAMP,
256 `dateLastModified` datetime DEFAULT CURRENT_TIMESTAMP,
257 `updated` varchar(45) DEFAULT '0',
258 `arcsecRadius` int(11) DEFAULT NULL,
259 `dateQueried` datetime DEFAULT CURRENT_TIMESTAMP,
260 `htm16ID` bigint(20) DEFAULT NULL,
261 `htm13ID` int(11) DEFAULT NULL,
262 `htm10ID` int(11) DEFAULT NULL,
263 PRIMARY KEY (`primaryId`),
264 KEY `idx_htm16ID` (`htm16ID`),
265 KEY `dateQueried` (`dateQueried`),
266 KEY `dateHtm16` (`dateQueried`,`htm16ID`),
267 KEY `idx_htm10ID` (`htm10ID`),
268 KEY `idx_htm13ID` (`htm13ID`)
269) ENGINE=MyISAM AUTO_INCREMENT=0 DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
270 """
271 writequery(
272 log=self.log,
273 sqlQuery=createStatement,
274 dbConn=self.cataloguesDbConn
275 )
277 # USE dbSettings TO ACTIVATE MULTIPROCESSING
278 insert_list_of_dictionaries_into_database_tables(
279 dbConn=self.cataloguesDbConn,
280 log=self.log,
281 dictList=dataList,
282 dbTableName="tcs_helper_ned_query_history",
283 uniqueKeyList=[],
284 dateModified=True,
285 batchSize=10000,
286 replace=True,
287 dbSettings=self.settings["database settings"][
288 "static catalogues"]
289 )
291 # INDEX THE TABLE FOR LATER SEARCHES
292 add_htm_ids_to_mysql_database_table(
293 raColName="raDeg",
294 declColName="decDeg",
295 tableName="tcs_helper_ned_query_history",
296 dbConn=self.cataloguesDbConn,
297 log=self.log,
298 primaryIdColumnName="primaryId",
299 dbSettings=self.settings["database settings"]["static catalogues"]
300 )
302 self.log.debug('completed the ``_update_ned_query_history`` method')
303 return None
305 def _download_ned_source_metadata(
306 self):
307 """*Query NED using the names of the NED sources in our local database to retrieve extra metadata*
309 *Usage:*
311 ```python
312 stream._download_ned_source_metadata()
313 ```
314 """
315 self.log.debug('starting the ``_download_ned_source_metadata`` method')
317 self.dbTableName = "tcs_cat_ned_stream"
319 total, batches = self._count_ned_sources_in_database_requiring_metadata()
321 print(
322 "%(total)s galaxies require metadata. Need to send %(batches)s batch requests to NED." % locals())
324 self.log.info(
325 "%(total)s galaxies require metadata. Need to send %(batches)s batch requests to NED." % locals())
327 totalBatches = self.batches
328 thisCount = 0
330 # FOR EACH BATCH, GET THE GALAXY IDs, QUERY NED AND UPDATE THE DATABASE
331 # THEN RECOUNT TO DETERMINE IF THERE ARE REMAINING SOURCES TO GRAB
332 # METADATA FOR
333 while self.total:
334 thisCount += 1
335 self._get_ned_sources_needing_metadata()
336 self._do_ned_namesearch_queries_and_add_resulting_metadata_to_database(
337 thisCount)
338 self._count_ned_sources_in_database_requiring_metadata()
340 self.log.debug(
341 'completed the ``_download_ned_source_metadata`` method')
342 return None
344 def _get_ned_sources_needing_metadata(
345 self):
346 """*Get the names of 50000 or less NED sources that still require metabase in the database*
348 **Return**
350 - ``len(self.theseIds)`` -- the number of NED IDs returned
353 *Usage:*
355 ```python
356 numberSources = stream._get_ned_sources_needing_metadata()
357 ```
358 """
359 self.log.debug(
360 'starting the ``_get_ned_sources_needing_metadata`` method')
362 tableName = self.dbTableName
364 # SELECT THE DATA FROM NED TABLE
365 sqlQuery = u"""
366 select ned_name from %(tableName)s where raDeg is null and (download_error != 1 or download_error is null) limit 50000;
367 """ % locals()
368 sqlQuery = u"""
369 select ned_name from %(tableName)s where (download_error != 1 or download_error is null) limit 50000;
370 """ % locals()
371 rows = readquery(
372 log=self.log,
373 sqlQuery=sqlQuery,
374 dbConn=self.cataloguesDbConn,
375 quiet=False
376 )
378 self.theseIds = []
379 self.theseIds[:] = [r["ned_name"].replace('"', '\\"') for r in rows]
381 self.log.debug(
382 'completed the ``_get_ned_sources_needing_metadata`` method')
384 return len(self.theseIds)
386 def _do_ned_namesearch_queries_and_add_resulting_metadata_to_database(
387 self,
388 batchCount):
389 """*Query NED via name searcha and add result metadata to database*
391 **Key Arguments**
393 - ``batchCount`` - the index number of the batch sent to NED (only needed for printing to STDOUT to give user idea of progress)
396 *Usage:*
398 ```python
399 numberSources = stream._do_ned_namesearch_queries_and_add_resulting_metadata_to_database(batchCount=10)
400 ```
401 """
402 self.log.debug(
403 'starting the ``_do_ned_namesearch_queries_and_add_resulting_metadata_to_database`` method')
405 # ASTROCALC UNIT CONVERTER OBJECT
406 converter = unit_conversion(
407 log=self.log
408 )
409 tableName = self.dbTableName
411 # QUERY NED WITH BATCH
412 totalCount = len(self.theseIds)
413 print("requesting metadata from NED for %(totalCount)s galaxies (batch %(batchCount)s)" % locals())
414 # QUERY THE ONLINE NED DATABASE USING NEDDY'S NAMESEARCH METHOD
415 search = namesearch(
416 log=self.log,
417 names=self.theseIds,
418 quiet=True
419 )
420 results = search.get()
421 print("results returned from ned -- starting to add to database" % locals())
423 # CLEAN THE RETURNED DATA AND UPDATE DATABASE
424 totalCount = len(results)
425 count = 0
426 sqlQuery = ""
427 dictList = []
428 for thisDict in results:
429 thisDict["tableName"] = tableName
430 count += 1
431 for k, v in list(thisDict.items()):
432 if not v or len(v) == 0:
433 thisDict[k] = "null"
434 if k in ["major_diameter_arcmin", "minor_diameter_arcmin"] and (":" in v or "?" in v or "<" in v):
435 thisDict[k] = v.replace(":", "").replace(
436 "?", "").replace("<", "")
437 if isinstance(v, ("".__class__, u"".__class__)) and '"' in v:
438 thisDict[k] = v.replace('"', '\\"')
439 if "Input name not" not in thisDict["input_note"] and "Same object as" not in thisDict["input_note"]:
440 try:
441 thisDict["raDeg"] = converter.ra_sexegesimal_to_decimal(
442 ra=thisDict["ra"]
443 )
444 thisDict["decDeg"] = converter.dec_sexegesimal_to_decimal(
445 dec=thisDict["dec"]
446 )
447 except:
448 name = thisDict["input_name"]
449 self.log.warning(
450 "Could not convert the RA & DEC for the %(name)s NED source" % locals())
451 continue
452 thisDict["eb_v"] = thisDict["eb-v"]
453 thisDict["ned_name"] = thisDict["input_name"]
454 row = {}
455 for k in ["redshift_quality", "redshift", "hierarchy", "object_type", "major_diameter_arcmin", "morphology", "magnitude_filter", "ned_notes", "eb_v", "raDeg", "radio_morphology", "activity_type", "minor_diameter_arcmin", "decDeg", "redshift_err", "ned_name"]:
456 if thisDict[k] == "null":
457 row[k] = None
458 else:
459 row[k] = thisDict[k]
461 if '"' in thisDict["ned_name"]:
462 print(thisDict)
463 print(thisDict["ned_name"])
464 sys.exit(0)
466 dictList.append(row)
468 self.add_data_to_database_table(
469 dictList=dictList,
470 createStatement="""SET SESSION sql_mode="";"""
471 )
473 theseIds = ("\", \"").join(self.theseIds)
475 sqlQuery = u"""
476 update %(tableName)s set download_error = 1 where ned_name in ("%(theseIds)s");
477 """ % locals()
478 writequery(
479 log=self.log,
480 sqlQuery=sqlQuery,
481 dbConn=self.cataloguesDbConn,
482 )
484 print("%(count)s/%(totalCount)s galaxy metadata batch entries added to database" % locals())
485 if count < totalCount:
486 # Cursor up one line and clear line
487 sys.stdout.write("\x1b[1A\x1b[2K")
489 sqlQuery = u"""
490 update tcs_helper_catalogue_tables_info set last_updated = now() where table_name = "%(tableName)s"
491 """ % locals()
492 writequery(
493 log=self.log,
494 sqlQuery=sqlQuery,
495 dbConn=self.cataloguesDbConn,
496 )
498 self.log.debug(
499 'completed the ``_do_ned_namesearch_queries_and_add_resulting_metadata_to_database`` method')
500 return None
502 def _count_ned_sources_in_database_requiring_metadata(
503 self):
504 """*Count the sources in the NED table requiring metadata*
506 **Return**
508 - ``self.total``, ``self.batches`` -- total number of galaxies needing metadata & the number of batches required to be sent to NED
511 *Usage:*
513 ```python
514 totalRemaining, numberOfBatches = stream._count_ned_sources_in_database_requiring_metadata()
515 ```
516 """
517 self.log.debug(
518 'starting the ``_count_ned_sources_in_database_requiring_metadata`` method')
520 tableName = self.dbTableName
522 # sqlQuery = u"""
523 # select count(*) as count from %(tableName)s where raDeg is null and (download_error != 1 or download_error is null)
524 # """ % locals()
525 sqlQuery = u"""
526 select count(*) as count from %(tableName)s where (download_error != 1 or download_error is null)
527 """ % locals()
528 rows = readquery(
529 log=self.log,
530 sqlQuery=sqlQuery,
531 dbConn=self.cataloguesDbConn,
532 quiet=False
533 )
534 self.total = rows[0]["count"]
535 self.batches = int(old_div(self.total, 50000.)) + 1
537 if self.total == 0:
538 self.batches = 0
540 self.log.debug(
541 'completed the ``_count_ned_sources_in_database_requiring_metadata`` method')
542 return self.total, self.batches
544 # use the tab-trigger below for new method
545 # xt-class-method