Source code for mspasspy.db.spectrumdb

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from abc import ABC, abstractmethod
from mspasspy.ccore.seismic import PowerSpectrum
from mspasspy.ccore.utility import MsPASSError, ErrorSeverity
from mspasspy.db.schema import DatabaseSchema, MetadataSchema
from mspasspy.db.client import DBClient
from bson import ObjectId
import pickle


class BasicObjectDatabase(ABC):
    """
    Abstract base class for a database handle that stores one kind of object.

    This class is an alternative interface to the core MsPASS database
    handle.  Its intended use is to add new kinds of data objects to a
    workflow without touching the handling of the standard MsPASS objects
    (TimeSeries, Seismogram, TimeSeriesEnsemble, and SeismogramEnsemble).
    A concrete subclass declares which python types it manages and
    implements `read_data`, `save_data`, and `verify`.

    Whether a schema is enforced is left to the subclass; the constructor
    can load a database schema and a metadata schema, or skip either one.
    """

    def __init__(
        self,
        name,
        type_list,
        *args,
        db_schema=None,
        md_schema=None,
        **kwargs,
    ):
        """
        Base class constructor.

        Subclasses normally call this from their own `__init__`.  It stores
        the database name, obtains a database handle from a default
        constructed `DBClient` (saved as `self.db`), stores the list of
        supported types, and resolves the two optional schema arguments.

        :param name: name of the database to open through
          `DBClient.get_database`.
        :type name: string
        :param type_list: python types the concrete handle supports.
          `data_valid` tests a datum against each entry with `isinstance`.
        :type type_list: list of python types
        :param args: accepted but not used by this constructor.
        :param db_schema: selects the content of `self.database_schema`:
          1. a `mspasspy.db.schema.DatabaseSchema` instance is stored as is;
          2. a string is passed to the `DatabaseSchema` constructor as the
             schema name, except the magic string "DO_NOT_LOAD" which sets
             `self.database_schema` to None;
          3. anything else (including the default of None) stores a default
             constructed `DatabaseSchema`.
        :param md_schema: selects the content of `self.metadata_schema`
          using the same three rules, with
          `mspasspy.db.schema.MetadataSchema` as the schema class.
        :param kwargs: accepted but not used by this constructor.
        """
        self.name = name
        self.db = DBClient().get_database(self.name)
        self.type_list = type_list
        self.database_schema = self._select_schema(db_schema, DatabaseSchema)
        self.metadata_schema = self._select_schema(md_schema, MetadataSchema)

    @staticmethod
    def _select_schema(spec, schema_class, skip_keyword="DO_NOT_LOAD"):
        """
        Turn a schema constructor argument into a schema object or None.

        :param spec: value received for `db_schema` or `md_schema`.
        :param schema_class: schema class to test against and to construct.
        :param skip_keyword: string that means "do not load a schema".
        :return: `spec` itself when it is already a `schema_class` instance,
          None when `spec` equals `skip_keyword`, `schema_class(spec)` for
          any other string, and `schema_class()` otherwise.
        """
        if isinstance(spec, schema_class):
            return spec
        if isinstance(spec, str):
            return None if spec == skip_keyword else schema_class(spec)
        return schema_class()

    def data_valid(self, d) -> bool:
        """
        Test if datum d has a type supported by this handle.

        :param d: object to test.
        :return: True when `d` is an instance of at least one entry of
          `self.type_list` and False otherwise.  Callers should normally
          treat False as an error.
        """
        return any(isinstance(d, supported) for supported in self.type_list)

    @abstractmethod
    def read_data(self):
        """
        Read one datum.

        Concrete implementations must implement this method.  An
        implementation would normally take some identifier that selects
        one and only one database entry and return the data object
        constructed from it.
        """
        pass

    @abstractmethod
    def save_data(self, d):
        """
        Save one datum.

        Concrete implementations must implement this method and save
        datum `d` by whatever scheme the implementation uses.
        """
        pass

    @abstractmethod
    def verify(self):
        """
        Verify the data with which this handle is associated.

        Concrete implementations must implement this method.  It should
        apply whatever tests define "clean" for the stored data, for
        example that `read_data` will not fail on any entry.
        """
        pass
class SpectrumDatabase(BasicObjectDatabase):
    """
    Specialized database handle to manage PowerSpectrum data.

    Use this handle to read and write PowerSpectrum objects to a MongoDB
    collection.  The interface is much simpler than the standard MsPASS
    database handle: it has a basic writer (`save_data`), a basic reader
    (`read_data`), and the `verify` method required by the abstract base
    class.  No schema is loaded by this handle.
    """

    def __init__(
        self,
        name,
        *args,
        collection="PowerSpectrum",
        **kwargs,
    ):
        """
        Constructor for this database handle.

        Calls the base class constructor with both schemas disabled
        ("DO_NOT_LOAD") and with PowerSpectrum as the only supported type,
        then binds `self.collection` to the requested collection of the
        database handle `self.db` created by the base class.

        :param name: MongoDB database name to use.
        :type name: string
        :param args: accepted for signature compatibility; not used.
        :param collection: optional alternative collection name to use
          for reads and writes (default is "PowerSpectrum").
        :type collection: string
        :param kwargs: accepted for signature compatibility; not used.
        """
        type_list = [PowerSpectrum]
        # The instance must be passed to the base constructor (super()
        # does that); calling the unbound __init__ without self fails.
        super().__init__(
            name, type_list, db_schema="DO_NOT_LOAD", md_schema="DO_NOT_LOAD"
        )
        # This handle is not itself a pymongo Database; collections are
        # reached through the database handle held in self.db.
        self.collection = self.db[collection]

    def save_data(self, datum, exclude=None, metadata2save=None, format="pickle"):
        """
        Save a single PowerSpectrum object.

        By default all metadata of `datum` are written to one document in
        `self.collection`, and a pickled copy of `datum` is stored in the
        same document with the key "serialized_data".

        :param datum: PowerSpectrum to save.
        :param exclude: list of Metadata keys to leave out of the saved
          document.  Default is None, which removes nothing.  Keys in this
          list that are not present in the document are ignored.
        :param metadata2save: list of Metadata keys to be saved.  When set,
          only values fetched with these keys are written to the document.
          A key that is not defined in `datum` is silently ignored.
          Default is None, which saves all metadata.
        :param format: output format.  The only accepted value is the
          default of "pickle".
        :return: ObjectId of the inserted document.  If `datum` is marked
          dead nothing is written and `datum` itself is returned.
        :raises MsPASSError: if `format` is not "pickle" or if `datum` is
          not a type supported by this handle.
        """
        if format != "pickle":
            raise MsPASSError(
                "SpectrumDatabase.save_data: format can currently only be pickle",
                ErrorSeverity.Fatal,
            )
        if not self.data_valid(datum):
            message = "SpectrumDatabase.save_data: illegal data type for arg0. Found type={typ} - only support PowerSpectrum".format(
                typ=type(datum)
            )
            raise MsPASSError(message, ErrorSeverity.Fatal)
        if datum.dead():
            # dead data are skipped without writing anything
            return datum
        if metadata2save:
            # Keys listed but not defined in the datum are silently
            # ignored.  Questionable behavior, retained from the original.
            doc = {
                key: datum[key] for key in metadata2save if datum.is_defined(key)
            }
        else:
            doc = dict(datum)
        if exclude:
            for key in exclude:
                # a default avoids a KeyError for keys that are not present
                doc.pop(key, None)
        doc["serialized_data"] = pickle.dumps(datum)
        return self.collection.insert_one(doc).inserted_id

    def read_data(
        self,
        id_or_doc,
        required=None,
        override=None,
    ):
        """
        Read one PowerSpectrum selected by an ObjectId.

        The id is given either directly or through a document-like object
        holding it under the key "_id".  The datum is restored with pickle
        from the "serialized_data" entry of the matching document.  If the
        document was edited after the datum was saved, the pickled copy can
        disagree with the database; `required` and `override` copy
        attributes from the document onto the restored datum to fix that.

        :param id_or_doc: a MongoDB ObjectId or something that acts like a
          MongoDB document.  If it is not an ObjectId the value stored with
          the key "_id" is used as the id.
        :param required: list of keys (strings) that are always copied from
          the document to the datum, replacing any value restored by
          pickle.  Every key listed must exist in the document.
        :param override: list of keys (strings) copied from the document to
          the datum only when present in the document.  Missing keys are
          skipped.
        :return: the restored PowerSpectrum.
        :raises MsPASSError: with severity Invalid if no document matches
          the id; with severity Fatal if the document has no
          "serialized_data" entry.
        :raises KeyError: if `id_or_doc` is a document without "_id" or a
          key listed in `required` is missing from the document.
        """
        if isinstance(id_or_doc, ObjectId):
            oid = id_or_doc
        else:
            oid = id_or_doc["_id"]
        doc = self.collection.find_one({"_id": oid})
        if not doc:
            message = "SpectrumDatabase.read_data: no document with ObjectId={oid} was found in PowerSpectrum collection".format(
                oid=str(oid)
            )
            raise MsPASSError(message, ErrorSeverity.Invalid)
        datakey = "serialized_data"
        if datakey not in doc:
            message = "SpectrumDatabase.read_data: missing required key=serialized_data - expected to contain datum serialized with pickle"
            raise MsPASSError(message, ErrorSeverity.Fatal)
        # NOTE(security): pickle.loads can execute arbitrary code.  Only
        # use this reader on databases whose content is trusted.
        datum = pickle.loads(doc[datakey])
        if required:
            for key in required:
                datum[key] = doc[key]
        if override:
            for key in override:
                if key in doc:
                    datum[key] = doc[key]
        return datum

    def verify(self, query=None, required=None):
        """
        Scan the collection and count the documents that look valid.

        A document is counted as valid when it contains the key
        "serialized_data" (the pickled copy of the datum) and every key
        listed in `required`.

        :param query: optional pymongo query (python dict) limiting the
          scan.  The default scans the entire collection.
        :param required: optional list of keys every scanned document is
          expected to contain.
        :return: two-element list.  Component 0 is the number of documents
          scanned and component 1 is the number considered valid.
        """
        # find is a method: it must be called, not subscripted
        cursor = self.collection.find(query if query else {})
        required_keys = list(required) if required else []
        testkey = "serialized_data"
        nprocessed = 0
        nvalid = 0
        for doc in cursor:
            nprocessed += 1
            if testkey in doc and all(key in doc for key in required_keys):
                nvalid += 1
        return [nprocessed, nvalid]