# Source code for pymepps.loader.datasets.metdataset

#!/bin/env python
# -*- coding: utf-8 -*-
# """
# Created on 10.12.16
#
# Created for pymepps
#
# @author: Tobias Sebastian Finn, tobias.sebastian.finn@studium.uni-hamburg.de
#
#     Copyright (C) {2016}  {Tobias Sebastian Finn}
#
#     This program is free software: you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation, either version 3 of the License, or
#     (at your option) any later version.
#
#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
#
#     You should have received a copy of the GNU General Public License
#     along with this program.  If not, see <http://www.gnu.org/licenses/>.
# """
# System modules
import logging
import abc
from functools import partial

# External modules
from tqdm import tqdm

# Internal modules
from pymepps.utilities import MultiThread

logger = logging.getLogger(__name__)


class MetDataset(object):
    """
    MetDataset is a base class for handling meteorological files.

    The normal work routine would be:
        1) load the files (use of file handlers)
        2) select the important variables within the files (this object)
        3) post-process the variables (MetData/SpatialData/TSData object)

    Parameters
    ----------
    file_handlers : list of children of FileHandler or None.
        The loaded file handlers. This instance loads the variables. If the
        file handlers are None then the dataset is used for conversion
        between Spatial and TSData. A single handler is wrapped into a list
        and a tuple of handlers is converted into a list.
    data_origin : optional
        The class where the data comes from. Normally this would be a
        model or a measurement site. If this is None, this isn't set.
        Default is None.
    processes : int, optional
        This number of processes is used to calculate time-consuming functions.
        For time-consuming functions a progress bar is shown. If the number of
        processes is one the functions will be processed sequentially. For
        more processes than one the multiprocessing module will be used.
        Default is 1.
    """
    def __init__(self, file_handlers, data_origin=None, processes=1):
        # Defaults have to exist before the property setters below are used.
        self._file_handlers = None
        self._multiproc = None
        self._processes = 1
        self.data_origin = data_origin
        self.file_handlers = file_handlers
        self.processes = processes
        # The mapping is built once, after handlers and processes are set.
        self.__variables = self._initialize_variables()

    def __repr__(self):
        return '{0:s}({1:d})'.format(
            self.__class__.__name__, self._count_file_handlers())

    def __str__(self):
        class_name = self.__class__.__name__
        return '{0:s}\n{1:s}\nFile handlers: {2:d}\nVariables: {3:s}'.format(
            class_name, '-' * len(class_name),
            self._count_file_handlers(), str(self.var_names))

    def _count_file_handlers(self):
        """
        Return the number of file handlers, 0 if no handlers are set.

        The private attribute is read on purpose: the public property raises
        a ValueError for None, which would make repr() and str() fail for a
        dataset without file handlers.
        """
        if self._file_handlers is None:
            return 0
        return len(self._file_handlers)

    @property
    def processes(self):
        """
        The number of processes. Setting this property also creates a new
        MultiThread instance, which is used to load the data.
        """
        return self._processes

    @processes.setter
    def processes(self, nr_proc):
        self._multiproc = MultiThread(nr_proc)
        self._processes = nr_proc

    @staticmethod
    def _get_variables(file_handler):
        """
        Open the file handler, read its variable names and close it again.

        Parameters
        ----------
        file_handler : FileHandler
            The handler has to offer ``open()``, ``close()`` and an iterable
            ``var_names`` attribute.

        Returns
        -------
        var_names : list
            The variable names of the file handler.
        """
        file_handler.open()
        try:
            return list(file_handler.var_names)
        finally:
            # Close the handler even if reading the variable names failed.
            file_handler.close()

    def _initialize_variables(self):
        """
        Build the mapping between variable names and file handlers.

        Returns
        -------
        new_variables : dict(str, list)
            The variable names as keys and a list with all file handlers,
            which contain this variable, as values. The dict is empty if no
            file handlers are set.
        """
        if self._file_handlers is None:
            return {}
        new_variables = {}
        mt = MultiThread(processes=self.processes)
        var_names_list = mt.map(self._get_variables, self._file_handlers,
                                flatten=False)
        for file_handler, var_names in zip(self._file_handlers,
                                           var_names_list):
            for var_name in var_names:
                new_variables.setdefault(var_name, []).append(file_handler)
        return new_variables

    @property
    def variables(self):
        """
        Return the variable names and the corresponding file handlers.
        """
        return self.__variables

    @property
    def file_handlers(self):
        """
        The file handlers of this dataset as list.

        Raises
        ------
        ValueError
            A ValueError is raised if the file handlers are None.
        """
        if self._file_handlers is None:
            raise ValueError(
                'Do you really want to get a attribute, which is None?')
        return self._file_handlers

    @file_handlers.setter
    def file_handlers(self, handlers):
        # NOTE: the variable mapping is only built within __init__, it is not
        # rebuilt if the handlers are replaced afterwards.
        if handlers is None or isinstance(handlers, list):
            self._file_handlers = handlers
        elif isinstance(handlers, tuple):
            # A tuple is a collection of handlers and not a single handler.
            self._file_handlers = list(handlers)
        else:
            self._file_handlers = [handlers, ]

    @property
    def var_names(self):
        """
        Get the available variable names as sorted list.
        """
        return sorted(self.variables.keys())

    def select_by_pattern(self, pattern, return_list=True, **kwargs):
        """
        Method to select variables from this dataset by keywords. Every
        variable name which contains the pattern is selected with the select
        method.

        Parameters
        ----------
        pattern : str
            The pattern for which should be searched.
        return_list : bool
            If the return value should be a list or a dictionary.
        kwargs : dict
            Additional parameters that are passed to the file handlers.

        Returns
        -------
        data_list : dict(str, SpatialData or TSData) or
                    list(SpatialData or TSData) or None
            The return value is a dict/list with one entry for every found
            variable name. If return_list is False, the keys are the
            variable names. If None is returned no variable with this
            pattern was found.
        """
        available = self.var_names
        found_variables = [var for var in available if pattern in var]
        if not found_variables:
            logger.error(
                'The pattern %s is not found within the variable names of '
                'this dataset. The available variable names are: %s',
                pattern, available)
            return None
        logger.info(
            'Started to extract variables from file handlers with pattern '
            '%s', pattern)
        if return_list:
            return [self.select(var, **kwargs) for var in found_variables]
        return dict(
            (var, self.select(var, **kwargs)) for var in found_variables)

    def select(self, var_name, **kwargs):
        """
        Method to select a variable from this dataset. The data of the
        variable is loaded from every file handler which contains the
        variable and is then combined by the data_merge method. This method
        could have a long running time, due to data loading and combination.

        Parameters
        ----------
        var_name : str
            The variable which should be extracted. If the variable is not
            found within the dataset an error is logged and None is
            returned.
        kwargs : dict
            Additional parameters that are passed to the file handlers.

        Returns
        -------
        extracted_data : SpatialData, TSData or None
            The merged data of the selected variable. If None is returned
            the variable wasn't found within the list with possible variable
            names.
        """
        if var_name not in self.variables:
            logger.error(
                'The variable %s is not in the available variable names '
                'list. The possible variables are: %s',
                var_name, self.var_names)
            return None
        data = self._load_variable(var_name, **kwargs)
        logger.info('Extracted the data, now merge the data!')
        return self.data_merge(data, var_name)

    def select_ds(self, include=None, exclude=None, **kwargs):
        """
        Extract the dataset data into a MetData instance. The include list is
        handled superior to the exclude list. If both lists are None all
        available variables are used.

        Parameters
        ----------
        include: tuple, list, set or None
            Within the include collection are all variable names, which
            should be included into the MetData data. The collection will be
            filtered for available variable names. If no variable name is
            available a ValueError will be raised. If this is None, the
            include will be skipped and the exclude collection will be used.
            Default is None.
        exclude: tuple, list, set or None
            If no include collection is given, this exclude collection is
            used. In this case, any available variable name, which is not
            within this collection is used. If this is also None, all
            available data variables are used to construct the MetData
            instance. Default is None.
        kwargs : dict
            Additional parameters that are passed to the file handlers.

        Returns
        -------
        extracted_data: TSData or SpatialData
            The extracted data instance.

        Raises
        ------
        ValueError:
            A ValueError is raised if no variable was selected from the dataset.
        """
        collection_types = (tuple, list, set)
        if isinstance(include, collection_types):
            extract_vars = [var for var in include if var in self.variables]
            logger.debug('Selected variables: %s', extract_vars)
            missing = [var for var in include if var not in extract_vars]
            if missing:
                logger.info(
                    'Filtered out the following variables, they are not '
                    'within the dataset! %s', missing)
        elif isinstance(exclude, collection_types):
            available = self.var_names
            extract_vars = [var for var in available if var not in exclude]
            excluded = [var for var in available if var in exclude]
            if excluded:
                logger.info(
                    'Excluded the following variables from the selection: '
                    '%s', excluded)
        else:
            extract_vars = self.var_names
            logger.info('Used all available variables within this dataset')

        if not extract_vars:
            # Without this check the index access below would raise an
            # IndexError instead of the documented ValueError.
            raise ValueError(
                'No variable was selected from the dataset. The available '
                'variable names are: {0:s}'.format(str(self.var_names)))

        raw_data = []
        for var_name in extract_vars:
            data = self._load_variable(var_name, **kwargs)
            raw_data.extend(self._multi_select_var(data, var_name))
            logger.info('Finished variable %s', var_name)
        logger.info('Extracted the data, now merge the data!')
        # The first selected variable name is passed on to the merge method.
        return self.data_merge(raw_data, extract_vars[0])

    def _load_variable(self, var_name, **kwargs):
        """
        Load the data of one variable from all of its file handlers.

        Parameters
        ----------
        var_name : str
            The variable name, it has to be a key of the variables mapping.
        kwargs : dict
            Additional parameters that are passed to _get_file_data.

        Returns
        -------
        data
            The flattened result of the map over all file handlers of this
            variable.
        """
        file_handlers = self.variables[var_name]
        logger.info('Started select %s from %d files',
                    var_name, len(file_handlers))
        single_func = partial(self._get_file_data, var_name=var_name, **kwargs)
        return self._multiproc.map(single_func, file_handlers, flatten=True)

    # NOTE: this class does not use abc.ABCMeta as metaclass, such that the
    # following abstract methods are not enforced at instantiation. Child
    # classes are nevertheless expected to overwrite them.
    @abc.abstractmethod
    def _multi_select_var(self, data, var_name):
        """
        Prepare the loaded data of one variable for select_ds. The return
        value has to be an iterable, because select_ds extends its raw data
        list with it.
        """
        pass

    @abc.abstractmethod
    def _get_file_data(self, file, var_name, **kwargs):
        """
        Get the data of the given variable from a single file handler. This
        method is called once for every file handler of the variable.
        """
        pass

    @abc.abstractmethod
    def data_merge(self, data, var_name):
        """
        Method to merge the given data by given metadata into one data
        structure.
        """
        pass