Source code for exa.numerical

# -*- coding: utf-8 -*-
# Copyright (c) 2015-2016, Exa Analytics Development Team
# Distributed under the terms of the Apache License 2.0
'''
Trait Supporting Data Objects
###################################
The :class:`~exa.numerical.DataFrame` is an extension of the
:class:`~pandas.DataFrame` object. It provides additional methods for creating
traits.

Note:
    For further information on traits, see :mod:`~exa.widget`.

Additionally, :class:`~exa.numerical.DataFrame` and related objects (e.g.
:class:`~exa.numerical.Field`) provide attributes for defining their index and
column names. This has the effect of creating relationships between different
dataframes. They can be grouped into three types:

1. index name (df1) matches index name (df2)
2. index name (df1) matches column name (df2)
3. column name (df1) matches column name (df2)

Note:
    These types correspond to one to one, one to many, and many to many relational
    types, respectively.

Finally, the objects contained in this module provide convenience methods for
handling `categorical data`_.

See Also:
    For information about traits and how they enable dynamic visualizations
    see :mod:`~exa.widget`. For usage of numerical objects see :mod:`~exa.container`.

.. _categorical data: http://pandas-docs.github.io/pandas-docs-travis/categorical.html
'''
import warnings
import numpy as np
import pandas as pd
from numbers import Integral, Real
from traitlets import Unicode, Integer, Float
from exa.error import RequiredIndexError, RequiredColumnError


# Older pandas releases do not ship DataFrame.memory_usage; install a stub so
# that the attribute lookup performed by the classes below always succeeds.
if not hasattr(pd.DataFrame, 'memory_usage'):
    def memory_usage(self):
        '''
        Placeholder attached to :class:`~pandas.DataFrame` when the installed
        pandas has no ``memory_usage`` method.

        Raises:
            NotImplementedError: Always.
        '''
        raise NotImplementedError()
    pd.DataFrame.memory_usage = memory_usage


class Numerical:
    '''
    Common base of :class:`~exa.numerical.Series`,
    :class:`~exa.numerical.DataFrame`, and :class:`~exa.numerical.Field`.

    It supplies the default (empty) trait hooks, a compact string
    representation and a copy operation that keeps the concrete class.

    Note:
        The methods below read ``self._copy`` and ``self.shape``; both must be
        supplied by the concrete class (they are not defined here).
    '''
    def copy(self, *args, **kwargs):
        '''
        Copy the object and re-wrap the result in the caller's own class.

        All arguments are forwarded unchanged to ``self._copy``.
        '''
        cls = self.__class__
        duplicate = self._copy(*args, **kwargs)
        return cls(duplicate)

    def _custom_traits(self):
        '''
        Hook for class specific traits; the default is an empty dictionary.
        '''
        return dict()

    def _update_traits(self):
        '''
        Build the trait dictionary (by default only the custom traits).
        '''
        return self._custom_traits()

    def __repr__(self):
        # Short form: class name immediately followed by the shape
        return '{0}{1}'.format(self.__class__.__name__, self.shape)

    def __str__(self):
        return repr(self)
class Series(Numerical, pd.Series):
    '''
    Trait supporting analogue of :class:`~pandas.Series`.

    .. code-block:: Python

        import numpy as np

        class MySeries(Series):
            """Example usage of the exa.Series object."""
            _sname = 'data'
            _iname = 'data_index'
            _precision = 2

        s = MySeries(np.random.rand(10**5))
        traits = s._update_traits()
        type(traits)    # dict containing "myseries_data_values" as a Unicode trait
    '''
    _copy = pd.Series.copy
    # Subclasses customise behavior by overriding the attributes below
    _sname = None           # Required series name (if any)
    _iname = None           # Required index name (if any)
    _stype = None           # Required value type (if any)
    _itype = None           # Required index type (if any)
    _precision = None       # Number of decimals used for the JSON values
    _index_trait = False    # When true the index is exported as a trait too

    def _update_traits(self):
        '''
        Serialize the values (and optionally the index) to JSON strings
        wrapped in synced ``Unicode`` traits.

        The trait names are built from the lowercase class name and the
        series name:

        - values: "classnamelowercase_name_values"
        - index: "classnamelowercase_name_index"

        Returns:
            dict: Custom traits updated with the entries described above.
        '''
        traits = self._custom_traits()
        # Categorical data is converted to the declared value type (when one
        # has been declared) before it is serialized
        categorical = isinstance(self.dtype, pd.types.dtypes.CategoricalDtype)
        source = self.astype(self._stype) if categorical and self._stype is not None else self
        prefix = '_'.join([self.__class__.__name__.lower(), self._name])
        if self._precision is None:
            digits = 10
        else:
            digits = self._precision
        values_json = source.to_json(orient='values', double_precision=digits)
        exported = {prefix + '_values': Unicode(values_json).tag(sync=True)}
        if self._index_trait:
            index_json = pd.Series(source.index).to_json(orient='values')
            exported[prefix + '_index'] = Unicode(index_json).tag(sync=True)
        traits.update(exported)
        return traits

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Enforce the names declared on the class; warn only when an
        # existing (non-None) name is being replaced
        required_name = self._sname
        if required_name is not None and self.name != required_name:
            if self.name is not None:
                warnings.warn('Series name changed')
            self.name = required_name
        required_index = self._iname
        if required_index is not None and self.index.name != required_index:
            if self.index.name is not None:
                warnings.warn('Series index name changed')
            self.index.name = required_index
class DataFrame(Numerical, pd.DataFrame):
    '''
    Trait supporting analogue of :class:`~pandas.DataFrame`.

    Subclasses describe their layout through the class attributes below;
    the constructor validates required columns/index names for non-empty
    frames and :func:`_update_traits` turns the declared trait columns into
    traitlets objects.
    '''
    _copy = pd.DataFrame.copy
    _memory_usage = pd.DataFrame.memory_usage
    _groupbys = []      # Column names by which to group the data
    _indices = []       # Required index names
    _columns = []       # Required column entries
    _traits = []        # Traits present as dataframe columns (or series values)
    _categories = {}    # Column name, original type pairs ('label', int) that can be compressed to a category
    _precision = {}     # Traits precision for JSON {col: prec, col1: prec, ...}

    def _revert_categories(self):
        '''
        Change all columns of type category to their native type.
        '''
        for column, dtype in self._categories.items():
            if column in self.columns:
                self[column] = self[column].astype(dtype)

    def _set_categories(self):
        '''
        Change all category like columns from their native type to category type.
        '''
        for column in self._categories:
            if column in self.columns:
                self[column] = self[column].astype('category')

    def _update_traits(self):
        '''
        Generate trait objects from column data.

        Every name listed in ``_traits`` is exported under the key
        "classnamelowercase_name":

        - a column whose entries are all close to its first entry becomes a
          single ``Integer`` or ``Float`` trait;
        - otherwise the column becomes a JSON (unicode) string, grouped by
          ``_groupbys`` when groups exist;
        - a name that is not a column but equals the first index name exports
          the index values as a JSON string.

        Note:
            This function decides what `trait type`_ to use. This will almost
            always be a JSON (unicode) string formatted to be parsed into an
            array like structure in Javascript.

        Returns:
            dict: Custom traits plus one entry per exported name.

        Raises:
            TypeError: If a constant column holds a value that is neither
                integral nor real.
            IndexError: If the dataframe is empty (the first index label is
                looked up unconditionally).

        .. _trait type: http://traitlets.readthedocs.org/en/stable/trait_types.html
        '''
        self._revert_categories()
        try:
            traits = self._custom_traits()
            groups = None
            prefix = self.__class__.__name__.lower()
            self._fi = self.index[0]    # First index label; kept as an attribute as before
            if self._groupbys:
                groups = self.groupby(self._groupbys)
            # A subclass may declare ``_precision = None``; treat it as "no overrides"
            precisions = self._precision or {}
            for name in self._traits:
                trait_name = '_'.join((prefix, str(name)))    # Name mangle to ensure uniqueness
                if name in self.columns:
                    # Label based lookup of the first entry (replaces the deprecated ``.ix``)
                    first = self.loc[self._fi, name]
                    if np.all(np.isclose(self[name], first)):
                        # If all the entries are the same only send a single entry to JS
                        if isinstance(first, Integral):
                            trait = Integer(int(first))
                        elif isinstance(first, Real):
                            trait = Float(float(first))
                        else:
                            raise TypeError('Unknown type for {0} with type {1}'.format(name, type(first)))
                    elif groups:
                        # If groups exist, make a list of list(s)
                        grouped = groups.apply(lambda g: g[name].values)
                        trait = Unicode(grouped.to_json(orient='values',
                                                        double_precision=precisions.get(name, 10)))
                    else:
                        # Otherwise, just send the flattened values
                        trait = Unicode(self[name].to_json(orient='values',
                                                           double_precision=precisions.get(name, 10)))
                    traits[trait_name] = trait.tag(sync=True)
                elif name == self.index.names[0]:
                    # If not in columns, but is index name, send index
                    string = pd.Series(self.index.values).to_json(orient='values')
                    traits[trait_name] = Unicode(string).tag(sync=True)
        finally:
            # Always restore the compressed representation, even on failure
            self._set_categories()
        return traits

    def __init__(self, *args, **kwargs):
        '''
        Build the dataframe and, when it is non-empty, validate the required
        columns and index names declared on the class.

        Raises:
            RequiredColumnError: If any entry of ``_columns`` is missing.
            RequiredIndexError: If index names are missing and the number of
                index levels differs from the number of required names.
        '''
        super().__init__(*args, **kwargs)
        if len(self) > 0:
            name = self.__class__.__name__
            if self._columns:
                missing = set(self._columns).difference(self.columns)
                if missing:
                    raise RequiredColumnError(missing, name)
            if self._indices:
                missing = set(self._indices).difference(self.index.names)
                if missing and len(self.index.names) != len(self._indices):
                    raise RequiredIndexError(missing, name)
                else:
                    # Same number of levels: (re)label them with the required names
                    self.index.names = self._indices
            # NOTE(review): the original indentation of this call was lost in
            # extraction; it is applied to non-empty frames here - confirm.
            self._set_categories()
class Field(DataFrame):
    '''
    A field is composed of the field definition ("field_data") and values (
    "field_values"). Field data define the shape and discretization of a field.
    Field values are scalar magnitudes (or vectors) that describe the field at
    each discretized point in space.

    The values are stored on the instance as ``field_values``, a list of
    :class:`~exa.numerical.Series` objects named by their position in the list.
    '''
    _precision = None     # No per-column trait precision is declared for fields
    _vprecision = 10      # values precision (for traits)
    _indices = ['field']

    def copy(self, *args, **kwargs):
        '''
        Copy the field dataframe, including the field values.

        Arguments are forwarded to the underlying dataframe copy.

        Returns:
            A new object of the same class holding copies of both the field
            data and every field value series.
        '''
        df = self._copy(*args, **kwargs)
        field_values = [field.copy() for field in self.field_values]
        # ``field_values`` is keyword-only in the constructor; the copied
        # frame is the (only) positional data argument
        return self.__class__(df, field_values=field_values)

    def _custom_traits(self):
        '''
        Obtain field values using the custom trait getter (called automatically
        by :func:`~exa.numerical.Numerical._update_traits`).

        Returns:
            dict: ``field_indices``, ``label_indices`` and ``field_values``
            entries, each a synced ``Unicode`` trait.
        '''
        traits = {}
        if self._groupbys:
            grps = self.groupby(self._groupbys)
            string = str(list(grps.groups.values())).replace(' ', '')
        else:
            string = pd.Series(self.index.values).to_json(orient='values')
        traits['field_indices'] = Unicode(string).tag(sync=True)
        # NOTE(review): requires 'frame' and 'label' columns; the original
        # indentation was lost in extraction, this is assumed to run in both
        # cases above - confirm.
        label_indices = self.groupby('frame').apply(lambda x: x[['label']]).to_json(orient='values')
        traits['label_indices'] = Unicode(label_indices).tag(sync=True)
        s = pd.Series({i: field.values for i, field in enumerate(self.field_values)})
        json_string = s.to_json(orient='values', double_precision=self._vprecision)
        traits['field_values'] = Unicode(json_string).tag(sync=True)
        return traits

    def memory_usage(self):
        '''
        Get the combined memory usage of the field data and field values.

        Returns:
            The dataframe memory usage with an extra ``field_values`` entry
            holding the summed usage of all value series.
        '''
        data = self._memory_usage()
        data['field_values'] = sum(value.memory_usage() for value in self.field_values)
        return data

    def __init__(self, *args, field_values=None, **kwargs):
        '''
        Args:
            *args: Forwarded to :class:`~exa.numerical.DataFrame`; a leading
                :class:`~pandas.Series` is converted to a single row frame.
            field_values: ``None``, a series, or a list/tuple/array of
                array-like objects (one per field).
            **kwargs: Forwarded to :class:`~exa.numerical.DataFrame`.

        Raises:
            TypeError: If ``field_values`` has an unsupported type.
        '''
        # Guard the lookup so that keyword-only construction does not fail
        if args and isinstance(args[0], pd.Series):
            args = (args[0].to_frame().T, )
        super().__init__(*args, **kwargs)
        if isinstance(field_values, (list, tuple, np.ndarray)):
            self.field_values = [Series(v) for v in field_values]
        elif field_values is None:
            self.field_values = []
        elif isinstance(field_values, pd.Series):
            self.field_values = [Series(field_values)]
        else:
            raise TypeError('Wrong type for field_values with type {}'.format(type(field_values)))
        # Name every value series by its position in the list
        for i, values in enumerate(self.field_values):
            values.name = i
class Field3D(Field):
    '''
    Dataframe describing the grid of a scalar or vector field in 3D space.

    +-------------------+----------+-------------------------------------------+
    | Column            | Type     | Description                               |
    +===================+==========+===========================================+
    | nx                | int      | number of grid points in x                |
    +-------------------+----------+-------------------------------------------+
    | ny                | int      | number of grid points in y                |
    +-------------------+----------+-------------------------------------------+
    | nz                | int      | number of grid points in z                |
    +-------------------+----------+-------------------------------------------+
    | ox                | float    | field origin point in x                   |
    +-------------------+----------+-------------------------------------------+
    | oy                | float    | field origin point in y                   |
    +-------------------+----------+-------------------------------------------+
    | oz                | float    | field origin point in z                   |
    +-------------------+----------+-------------------------------------------+
    | xi                | float    | First component in x                      |
    +-------------------+----------+-------------------------------------------+
    | xj                | float    | Second component in x                     |
    +-------------------+----------+-------------------------------------------+
    | xk                | float    | Third component in x                      |
    +-------------------+----------+-------------------------------------------+
    | yi                | float    | First component in y                      |
    +-------------------+----------+-------------------------------------------+
    | yj                | float    | Second component in y                     |
    +-------------------+----------+-------------------------------------------+
    | yk                | float    | Third component in y                      |
    +-------------------+----------+-------------------------------------------+
    | zi                | float    | First component in z                      |
    +-------------------+----------+-------------------------------------------+
    | zj                | float    | Second component in z                     |
    +-------------------+----------+-------------------------------------------+
    | zk                | float    | Third component in z                      |
    +-------------------+----------+-------------------------------------------+

    Note:
        Each field should be flattened into an N x 1 (scalar) or N x 3 (vector)
        series or dataframe respectively. The orientation of the flattening
        should have x as the outer loop and z values as the inner loop (for both
        cases). This is sometimes called C-major order, C-style order, and has
        the last index changing the fastest and the first index changing the
        slowest.

    See Also:
        :class:`~exa.numerical.Field`
    '''
    # Every required column is also exported as a trait
    _columns = ['nx', 'ny', 'nz',
                'ox', 'oy', 'oz',
                'xi', 'xj', 'xk',
                'yi', 'yj', 'yk',
                'zi', 'zj', 'zk']
    _traits = list(_columns)
class SparseSeries(Numerical, pd.SparseSeries):
    '''
    Sparse series with trait support.

    Copies are made with :func:`pandas.SparseSeries.copy` and re-wrapped in
    this class by :func:`~exa.numerical.Numerical.copy`.
    '''
    _copy = pd.SparseSeries.copy
class SparseDataFrame(Numerical, pd.SparseDataFrame):
    '''
    Trait supporting sparse dataframe.

    Subclasses may declare required index names and columns through
    ``_indices`` and ``_columns``; both default to "nothing required".
    '''
    _copy = pd.SparseDataFrame.copy
    # Defaults for the attributes read by the constructor; without them a
    # non-empty frame fails with AttributeError during validation
    _indices = []       # Required index names
    _columns = []       # Required column entries

    def __init__(self, *args, **kwargs):
        '''
        Build the sparse dataframe and, when it is non-empty, validate the
        required columns and index names declared on the class.

        Raises:
            RequiredColumnError: If any entry of ``_columns`` is missing.
            RequiredIndexError: If index names are missing and the number of
                index levels differs from the number of required names.
        '''
        super().__init__(*args, **kwargs)
        if len(self) > 0:
            name = self.__class__.__name__
            if self._columns:
                missing = set(self._columns).difference(self.columns)
                if missing:
                    raise RequiredColumnError(missing, name)
            if self._indices:
                missing = set(self._indices).difference(self.index.names)
                if missing and len(self.index.names) != len(self._indices):
                    raise RequiredIndexError(missing, name)
                else:
                    # Same number of levels: (re)label them with the required names
                    self.index.names = self._indices