Source code for pymepps.accessor.spatial

#!/bin/env python
# -*- coding: utf-8 -*-
# """
# Created on 09.12.16
#
# Created for pymepps
#
# @author: Tobias Sebastian Finn, tobias.sebastian.finn@studium.uni-hamburg.de
#
#     Copyright (C) {2016}  {Tobias Sebastian Finn}
#
#     This program is free software: you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation, either version 3 of the License, or
#     (at your option) any later version.
#
#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
#
#     You should have received a copy of the GNU General Public License
#     along with this program.  If not, see <http://www.gnu.org/licenses/>.
# """
# System modules
import logging
from collections import OrderedDict
import re
import datetime

# External modules
import xarray as xr
import numpy as np

# Internal modules
import pymepps
from .base import MetData
from pymepps.grid.builder import GridBuilder
from pymepps.loader.datasets.tsdataset import TSDataset
from pymepps.loader.filehandler.netcdfhandler import cube_to_series


logger = logging.getLogger(__name__)


[docs]@xr.register_dataarray_accessor('pp')
class SpatialAccessor(MetData):
    """
    The SpatialAccessor extends a xarray.DataArray for post-processing of
    meteorological numerical weather model data. The SpatialAccessor works
    mostly with grid and gridded data.
    """
    def __init__(self, data, grid=None):
        super().__init__(data)
        self._grid = None
        self.grid = grid

    def __str__(self):
        try:
            grid = str(self.grid)
        except TypeError:
            grid = str(None)
        name = "{0:s}({1:s})".format(self.__class__.__name__, self.data.name)
        return "{0:s}\n{1:s}\nGrid: {1:s}".format(name, '-'*len(name), grid)

    @property
    def grid(self):
        """
        The corresponding grid of this xarray.DataArray instance. This grid is
        used to interpolate/remap the data and to select the nearest grid
        point to a given longitude/latitude pair.
        """
        if self._grid is None:
            raise TypeError('This DataArray has no grid defined!')
        else:
            return self._grid

    @grid.setter
    def grid(self, grid):
        if grid is not None and not hasattr(grid, '_grid_dict'):
            raise TypeError('The given grid is not a valid defined grid type!')
        self._grid = grid

[docs]    def set_grid(self, grid=None):
        """
        Set the grid to the given grid and set the grid coordinates. It is
        assumed that the last n dimensions (n=1 for unstructured grid, n=2 all
        other grids) are the grid coordinates. Please make sure that this
        assumption is fulfilled!

        Parameters
        ----------
        grid : Grid or None, optional
            This grid is used to set the grid and the grid coordinates of the
            returned array. If this is None, the grid of this DataArray instance
            is used. Default is None.

        Returns
        -------
        gridded_array : xarray.DataArray
            The DataArray with the grid coordinates and the grid.

        Raises
        ------
        ValueError
            A ValueError is raised if the grid of this instance is used and not
            grid set.
        """
        if grid is None and self._grid is not None:
            grid = self.grid
        elif grid is None:
            raise ValueError('The grid of this DataArray is used and not set!')
        coord_names = grid.get_coord_names()
        rename_dict = {new: old for new, old in zip(
            self.data.dims[-grid.len_coords:], coord_names,)}
        gridded_array = self.data.rename(rename_dict)
        new_coordinates = grid.get_coords()
        for coord in coord_names:
            gridded_array.coords[coord] = new_coordinates[coord]
        gridded_array.pp.grid = grid
        gridded_array = gridded_array.pp.check_data_coordinates(gridded_array)
        return gridded_array

[docs]    def check_data_coordinates(self, item):
        """
        Check if items grid coordinates shape is the same as those of the grid.

        Parameters
        ----------
        item: xarray.DataArray
            Instance to test for type and grid dimension length.

        Returns
        -------
        item: xarray.DataArray
            The checked item.

        Raises
        ------
        TypeError:
            The grid is not set.
        ValueError:
            The given item has not the same last coordinates as the grid.
        """
        item_grid_shape = item.shape[-self.grid.len_coords:]
        coords_equal = all(
            [np.equal(item, grid) for item, grid
             in zip(item_grid_shape, self.grid.shape)])
        if not coords_equal:
            raise ValueError('The item {0:s} has not the right last dimensions.'
                             'They need to be the same as the grid!')
        return item

[docs]    def normalize_coords(self, runtime=None, ensemble='det', validtime=None,
                         height=None):
        """
        Normalize the coordinates of the DataArray. The number, order and names
        of the coordinates are normalized. The number of coordinates will be
        four to six, depending if the DataArray is a merged multi-variable
        DataArray and the number of grid coordinates. The values of the added
        coordinates is set to the given values or will be None as filling value.
        The order and name of the DataArray will be:

        - (variable) (Only if the DataArray is a multi-variable DataArray.
          This is the variable name)
        - runtime (The analysis time of the model. The model is started at this
          time. The runtime is np.datetime64 as type)
        - ensemble (The ensemble member of the model.)
        - validtime (The lead time of the model. The model is valid for this
          times. The validtime timedelta to the runtime.)
        - height (The height information of the model.)
        - first grid coordinate
        - (second grid coordinate) (Only if the grid is not an unstructured
          grid)

        Parameters
        ----------
        runtime : datetime.datetime, np.datetime64  or None, optional
            The runtime of the model. The runtime will be converted to
            np.datetime64[ns] if it is not already this type. Default is None.
        ensemble : int or str, optional
            The ensemble member of the model. An integer is indicating the
            member number, with zero as control run. Default is 'det'.
        validtime : datetime.datetime, np.datetiime64, np.timedelta or None,
                optional
            The validtime of the model. The validtime is converted to
            np.timedelta. Default is None.
        height : int, str or None, optional
            The height of the model. Default is None.

        Returns
        -------
        normalized_array : xr.DataArray
            The DataArray with normalized coordinates.
        """
        arg_dict = locals()
        coord_dict = OrderedDict(
            height=dict(
                approx=['height', 'surf', 'sig', 'lev', 'press', 'alti'],
                exact=['height', ]
            ),
            validtime=dict(
                approx=['lead', ],
                exact=['validtime', 'time']
            ),
            ensemble=dict(
                approx=['ens', 'mem', 'num'],
                exact=['ensemble', ]
            ),
            runtime=dict(
                approx=['ana', 'ref', 'run'],
                exact=['runtime']
            )
        )
        normalized_array = self.data
        for key in coord_dict.keys():
            coord_name = self._get_coord_name(data=normalized_array,
                                              variants=coord_dict[key])
            if coord_name is None:
                normalized_array = self._create_coord(coord=key,
                                                      value=arg_dict[key],
                                                      data=normalized_array)
            else:
                normalized_array = self._rename_coord(orig=coord_name, to=key,
                                                      data=normalized_array)
        normalized_array = self._get_normalized_order(normalized_array)
        normalized_array = self._transform_datetime(normalized_array)
        normalized_array = self._validtime_to_timedelta(normalized_array)
        try:
            normalized_array = normalized_array.pp.set_grid(self.grid)
        except TypeError:
            pass
        return normalized_array

    @staticmethod
    def _get_coord_name(data, variants):
        """
        Check if the coordinate name variants is any dimensions within the
        DataArray.

        Parameters
        ----------
        data : xarray.DataArray
            The coordinate is searched within the coordinates of this DataArray.
        variants : dict(str, list(str)) or list(str)
            These variants are checked within the dimensions of the DataArray.
            If variants is a dict, it needs exact and approx as key with a
            sublist of variants. The exact list is used to check the exact
            dimension name. The approx list is used to check if the variant is
            within the name of a dimension. If variants is a list of strings
            the list values are used for exact matching.

        Returns
        -------
        str or None
            The matched dimension is returned. If the return value is None, no
            matching dimension was found.
        """
        if isinstance(variants, (tuple, list)):
            variants = dict(exact=variants, approx=[])
        data_dims = [re.sub('[^a-zA-Z]+', '', d) for d in data.dims]
        for k, dim in enumerate(data_dims):
            for variant in variants['exact']:
                if variant == dim:
                    return data.dims[k]
            for variant in variants['approx']:
                if variant in dim:
                    return data.dims[k]
        return None

    @staticmethod
    def _get_normalized_order(data):
        if 'variable' in data.dims:
            normalized_order = ['variable']
        else:
            normalized_order = []
        normalized_order.extend(['runtime', 'ensemble', 'validtime', 'height'])
        normalized_order.extend([
            dim for dim in data.dims if dim not in normalized_order
        ])
        normalized_array = data.transpose(*normalized_order)
        return normalized_array

    @staticmethod
    def _transform_datetime(data):
        """
        Transform the datetime dimensions of the given data to
        np.datetime64[ns].

        Parameters
        ----------
        data : xr.DataArray
            The dimensions of this DataArray are used for the transformation.

        Returns
        -------
        transformed_data : xr.DataArray
            The DataArray with the transformed time coordinates.
        """
        transformed_data = data.copy()
        dims_to_transform = [
            dim for dim in data.dims
            if isinstance(data[dim].values[0],
                          (datetime.datetime, np.datetime64))]
        for dim in dims_to_transform:
            transformed_data[dim] = transformed_data[dim].astype(
                'datetime64[ns]')
        return transformed_data

    @staticmethod
    def _create_coord(data, coord, value=None,):
        """
        Create a coordinate within the given DataArray with given value.

        Parameters
        ----------
        data : xr.DataArray
            The DataArray is used to create and add the coordinate.
        coord : str
            The name of the coordinate.
        value : obj or None, optional
            The value of the coordinate. None is used as filling value. Default
            is None.

        Returns
        -------
        coordinated_array : xr.DataArray
            The DataArray with the added coordinate.

        Raises
        ------
        ValueError
            A coordinate with the same name already exists within the
            coordinates dict or the dimension list of the DataArray.
        """
        coordinated_array = data.expand_dims(coord)
        coordinated_array[coord] = np.array((value,))
        return coordinated_array

    @staticmethod
    def _rename_coord(data, orig, to):
        """
        Rename a given coordinate to a new name.

        Parameters
        ----------
        data : xr.DataArray
            The DataArray is used to rename the coordinate.
        orig : str
            The name of the original coordinate. The coordinate needs to be
            within the DataArray coordinates.
        to : str
            The coordinate is renamed to this name.

        Returns
        -------
        renamed_array : xr.DataArray
            The DataArray with the renamed coordinate.
        """
        renamed_array = data.rename({orig: to})
        return renamed_array

    @staticmethod
    def _validtime_to_timedelta(data, validtime='validtime', runtime='runtime'):
        """
        Transform the validtime coordinate from a np.datetime64 coordinate to a
        np.timedelta coordinate if also the runtime coordinate is a
        np.datetime64 coordinate. The timedelta is create with
        :math:`validtime-runtime`.

        Parameters
        ----------
        data : xr.DataArray
            The DataArray is used to transform the validtime coordinate.
        validtime : str, optional
            Name of the validtime coordinate. Default is validtime.
        runtime : str, optional
            Name of the runtime coordinate. Default is runtime.

        Returns
        -------
        transformed_array : xr.DataArray
            The DataArray with the transformed validtime coordinate.
        """
        transformed_array = data.copy()
        runtime_values = data[runtime].values
        validtime_values = data[validtime].values
        if np.issubdtype(runtime_values.dtype, np.datetime64) and \
                np.issubdtype(validtime_values.dtype, np.datetime64):
            transformed_array[validtime] = validtime_values - runtime_values
        return transformed_array

[docs]    def merge(self, *items):
        """
        The merge routine could be used to merge this SpatialData instance with
        other instances. The merge creates a new merge dimension, named after
        the variable names. The grid of this instance is used as merged grid.

        Parameters
        ----------
        items : xarray.DataArray
            The items are merged with this xarray.DataArray instance. The grid
            dimensions have to be same as the grid.

        Returns
        -------
        merged_array : xarray.DataArray
            The DataArray instance with the merged data.
        """
        update_data = [self.check_data_coordinates(self.data), ]
        update_data += [self.check_data_coordinates(item) for item in items]
        dataset_data = [
            item.to_dataset('variable') if 'variable' in item.coords else item
            for item in update_data]
        merged_data = xr.merge(dataset_data).to_array(name='merged_array')
        merged_data.pp.grid = self.grid
        return merged_data

[docs]    def update(self, *items):
        """
        The update routine could be used to update the DataArray, based on other
        DataArrays. There are some assumptions done:
            1. The used data to update this DataArray instance has the same
            grid coordinates as this instance.
            2. Beginning from the left the given items are used to update the 
            data. Such that intersection problems are resolved in favor of the
            newest data.

        Parameters
        ----------
        items : xarray.DataArray
            The items are merged with this xarray.DataArray instance. The grid
            dimensions have to be same as the grid.

        Returns
        -------
        merged_array : xarray.DataArray
            The DataArray instance with the updated data.
        """
        update_data = [self.check_data_coordinates(self.data), ]
        update_data += [self.check_data_coordinates(item) for item in items]
        stack_dims = [dim for dim in self.data.dims
                      if dim not in self.grid.get_coord_names()]
        logger.debug('Stack dimension names: {0}'.format(stack_dims))
        stacked_data = (d.stack(merge=stack_dims) for d in update_data)
        try:
            concated_array = xr.concat(stacked_data, dim='merge')
        except (ValueError, TypeError) as e:
            raise e.__class__("The concatenation doesn't working, for "
                              'please see above for the reasons!')
        resolving_indexes = ~concated_array.indexes['merge'].duplicated(
            keep='last')
        logger.debug('Number of resolving indexes: {0:d}/{1:d}'.format(
            len(resolving_indexes), len(concated_array.indexes['merge'])))
        resolved_array = concated_array[..., resolving_indexes]
        unstacked_array = resolved_array.unstack('merge')
        updated_array = unstacked_array.transpose(*self.data.dims)
        updated_array.pp.grid = self.grid
        return updated_array

[docs]    def merge_analysis_timedelta(self, analysis_axis='runtime',
                                 timedelta_axis='validtime'):
        """
        The analysis time axis will be merged with the valid time axis,
        which should be given as timedelta. The merged time coordinate is called
        time and will be the first coordinate .

        Parameters
        ----------
        analysis_axis : str, optional
            The analysis time axis name. This axis will be used as basis for the
            valid time. Default is runtime.
        timedelta_axis : str, optional
            The time delta axis name. This axis should contain the difference to
            the analysis time.

        Returns
        -------
        merged_array : xarray.DataArray
            The DataArray with the merged analysis and timedelta coordinate.
        """
        stacked_data = self.data.stack(
            time=[analysis_axis, timedelta_axis])
        stacked_data.coords['time'] = [
            val[0]+val[1] for val in stacked_data.time.values]
        dims_to_transpose = ['time', ] + list(stacked_data.dims[:-1])
        merged_data = stacked_data.transpose(*dims_to_transpose)
        try:
            merged_data.pp.grid = self.grid
        except TypeError:
            pass
        return merged_data

[docs]    def to_pandas(self, lonlat=None):
        """
        Transform the DataArray to Pandas based on given coordinates. If
        coordinates are given this method selects the nearest neighbour grid
        point to this coordinates. The data is flatten to a 2d-DataFrame with
        the time as row axis.

        Parameters
        ----------
        lonlat : tuple(float, float) or None
            The nearest grid point to this coordinates (longitude, latitude) is
            used to generate the pandas data. If lonlat is None no
            coordinates will be selected and the data is flatten. If the
            horizontal grid coordinates are not a single point it is recommended
            to set lonlat.

        Returns
        -------
        extracted_data : pandas.Series or pandas.DataFrame
            The extracted pandas data. The data is based on either a
            Series (1 Column) or Dataframe (multiple column) depending on the
            dimensions.
        """
        if isinstance(lonlat, (list, tuple)) and len(lonlat) == 2:
            extracted_data = self.grid.get_nearest_point(data=self.data,
                                                         coord=reversed(lonlat))
            dims_wo_grid = [dim for dim in self.data.dims
                            if dim not in self.grid.get_coord_names()]
            coords = {dim: self.data.coords[dim] for dim in dims_wo_grid}
            cube = xr.DataArray(
                extracted_data,
                coords=coords,
                dims=dims_wo_grid,
            )
        else:
            cube = self.data
        series_data = cube_to_series(cube, self.data.name)
        ts_ds = TSDataset(None, data_origin=self, lonlat=lonlat)
        extracted_data = ts_ds.data_merge(series_data, self.data.name)
        return extracted_data

[docs]    def remapnn(self, new_grid):
        """
        Remap the horizontal grid with a nearest neighbour approach to a given
        new grid.

        Parameters
        ----------
        new_grid : Child instance of Grid
            The data is remapped to this grid.

        Returns
        -------
        remapped_array : xarray.DataArray
            The xarray.DataArray with the replaced grid.
        """
        remapped_array = self.grid.interpolate(self.data, new_grid, order=0)
        remapped_array.pp.grid = new_grid
        return remapped_array

[docs]    def remapbil(self, new_grid):
        """
        Remap the horizontal grid with a bilinear approach to a given
        new grid.

        Parameters
        ----------
        new_grid : Child instance of Grid
            The data is remapped to this grid.

        Returns
        -------
        remapped_array : xarray.DataArray
            The xarray.DataArray with the replaced grid.
        """
        remapped_array = self.grid.interpolate(self.data, new_grid, order=1)
        remapped_array.pp.grid = new_grid
        return remapped_array

[docs]    def selpoint(self, lonlat):
        """
        Select a longitude, latitude point within this DataArray. A new lonlat
        grid with a single point is created.

        Parameters
        ----------
        lonlat : tuple(float)
            The longitude and latitude point as degree. The nearest neighbour
            point to this given coordinate pair is used.

        Returns
        -------
        sliced_array : xarray.DataArray
            The sliced data array with the data for the nearest neighbour point
            to the given coordinates.
        """
        sliced_array = self.grid.get_nearest_point(self.data, reversed(lonlat))
        grid_dict = {
            'gridtype': 'lonlat',
            'xlongname': 'longitude',
            'xname': 'lon',
            'xunits': 'degrees',
            'ylongname': 'latitude',
            'yname': 'lat',
            'yunits': 'degrees',
            'xsize': 1,
            'ysize': 1,
            'xvals': [lonlat[0], ],
            'yvals': [lonlat[1], ]
        }
        sliced_grid = GridBuilder(grid_dict).build_grid()
        grid_coords = sliced_grid.get_coord_names()
        dim_order = list(sliced_array.dims)+list(grid_coords)
        sliced_array = sliced_array.expand_dims(grid_coords)
        sliced_array = sliced_array.transpose(*dim_order)
        sliced_array = sliced_array.pp.set_grid(sliced_grid)
        return sliced_array

[docs]    def sellonlatbox(self, lonlatbox):
        """
        This DataArray instance is sliced by given lonlatbox. A new grid is
        created and set based on the sliced coordinates.

        Parameters
        ----------
        lonlatbox : tuple(float)
            The longitude and latitude box with four entries as degree. The
            entries are handled in the following way:
                (left/west, top/north, right/east, bottom/south)

        Returns
        -------
        sliced_array : xarray.DataArray
            The sliced data array with the new grid.

        Notes
        -----
        For some grids the new grid is based on an UnstructuredGrid, due to
        technical limitations.
        """
        # TODO: Normalize lon lat values.
        sliced_array, sliced_grid = self.grid.lonlatbox(self.data, lonlatbox)
        sliced_array.pp.grid = sliced_grid
        return sliced_array

[docs]    def grid_to_attrs(self):
        grid_array = self.data.copy()
        grid_attr = {'ppgrid_{0:s}'.format(k): self.grid._grid_dict[k]
                     for k in self.grid._grid_dict}
        grid_array.attrs.update(grid_attr)
        return grid_array

[docs]    def save(self, save_path):
        """
        Save the DataArray and the grid as attributes together. The grid
        attributes are used by the load method to recreate the grid, but it is
        also possible to load the data with the normal xarray load functions.

        Parameters
        ----------
        save_path : str
            The path where the netcdf file should be saved.
        """
        try:
            save_array = self.grid_to_attrs()
        except TypeError:
            save_array = self.data.copy()
        save_array.to_netcdf(save_path)

[docs]    @staticmethod
    def load(load_path):
        """
        Load a NetCDF-based previously saved xarray.DataArray instance. If the
        NetCDF file has grid attributes they will be decoded as new grid.

        Parameters
        ----------
        load_path : str
            The path to the saved xarray.DataArray instance.

        Returns
        -------
        loaded_array : xarray.DataArray
            The loaded DataArray instance. If a grid could be created it will be
            set to the DataArray instance.
        """
        loaded_array = xr.open_dataarray(load_path)
        grid_attrs = [attr for attr in loaded_array.attrs
                      if attr[:7] == 'ppgrid_']
        grid_dict = {attr[7:]: loaded_array.attrs[attr] for attr in grid_attrs}
        try:
            loaded_grid = GridBuilder(grid_dict).build_grid()
            loaded_array.pp.grid = loaded_grid
            for key in grid_attrs:
                loaded_array.attrs.pop(key, None)
        except (KeyError, ValueError):
            pass
        return loaded_array