# -*- coding: utf-8 -*-
# Copyright (c) 2015-2016, Exa Analytics Development Team
# Distributed under the terms of the Apache License 2.0
'''
Trait Supporting Data Objects
###################################
The :class:`~exa.numerical.DataFrame` is an extension of the
:class:`~pandas.DataFrame` object. It provides additional methods for creating
traits.
Note:
For further information on traits, see :mod:`~exa.widget`.
Additionally, :class:`~exa.numerical.DataFrame` and related objects (e.g.
:class:`~exa.numerical.Field`) provide attributes for defining their index and
column names. This has the effect of creating relationships between different
dataframes. They can be grouped into three types:
1. index name (df1) matches index name (df2)
2. index name (df1) matches column name (df2)
3. column name (df1) matches column name (df2)
Note:
These types correspond to one to one, one to many, and many to many relational
types, respectively.
Finally, the objects contained in this module provide convenience methods for
handling `categorical data`_.
See Also:
For information about traits and how they enable dynamic visualizations,
see :mod:`~exa.widget`. For usage of numerical objects see :mod:`~exa.container`.
.. _categorical data: http://pandas-docs.github.io/pandas-docs-travis/categorical.html
'''
import warnings
import numpy as np
import pandas as pd
from numbers import Integral, Real
from traitlets import Unicode, Integer, Float
from exa.error import RequiredIndexError, RequiredColumnError
# Older pandas releases do not provide DataFrame.memory_usage; install a stub so
# that attribute lookups (e.g. ``_memory_usage = pd.DataFrame.memory_usage``)
# succeed and callers get a clear NotImplementedError when they invoke it.
if not hasattr(pd.DataFrame, 'memory_usage'):
    def memory_usage(self, *args, **kwargs):
        '''
        Placeholder used when the installed pandas lacks ``memory_usage``.

        Accepts (and ignores) any arguments so that callers written against
        the real pandas signature fail with the same exception type.

        Raises:
            NotImplementedError: Always.
        '''
        raise NotImplementedError()
    pd.DataFrame.memory_usage = memory_usage
class Numerical:
    '''
    Mixin shared by :class:`~exa.numerical.Series`,
    :class:`~exa.numerical.DataFrame`, and :class:`~exa.numerical.Field`.

    It supplies the default trait hooks, a compact string representation
    (class name followed by the object's shape), and a type preserving copy.
    Concrete classes are expected to provide ``_copy`` and ``shape``.
    '''
    def copy(self, *args, **kwargs):
        '''
        Create a copy without mangling the (class) type.

        All arguments are forwarded to the underlying ``_copy`` callable and
        the result is wrapped in the calling object's own class.
        '''
        duplicate = self._copy(*args, **kwargs)
        return type(self)(duplicate)

    def _custom_traits(self):
        '''Hook for subclasses; the default contributes no traits.'''
        return dict()

    def _update_traits(self):
        '''Return the trait dictionary (by default only the custom traits).'''
        return self._custom_traits()

    def __repr__(self):
        return '{cls}{shape}'.format(cls=type(self).__name__, shape=self.shape)

    def __str__(self):
        return repr(self)
class Series(Numerical, pd.Series):
    '''
    Trait supporting analogue of :class:`~pandas.Series`.

    .. code-block:: Python

        import numpy as np

        class MySeries(Series):
            """Example usage of the exa.Series object."""
            _sname = 'data'
            _iname = 'data_index'
            _precision = 2

        s = MySeries(np.random.rand(10**5))
        traits = s._update_traits()
        type(traits)    # dict containing "myseries_data_values" as a Unicode trait
    '''
    _copy = pd.Series.copy
    # These attributes should be set when subclassing Series
    _sname = None           # Series may have a required name
    _iname = None           # Series may have a required index name
    _stype = None           # Series may have a required value type
    _itype = None           # Series may have a required index type
    _precision = None       # Precision for JSON values
    _index_trait = False    # Set to true if the index should be a trait

    def _update_traits(self):
        '''
        By default, the trait representation is a unicode string of the values.

        Series traits always have the format:

        - values: "classnamelowercase_name_values"
        - index: "classnamelowercase_name_index"

        If the series has no name, the name segment is omitted (e.g.
        "classnamelowercase_values"). Non-string names are converted with
        :func:`str`.

        Returns:
            dict: Custom traits plus the values trait and, when
            ``_index_trait`` is set, the index trait.
        '''
        traits = self._custom_traits()
        s = self
        # Compare on the dtype's name rather than importing CategoricalDtype
        # from a version specific pandas module path.
        is_category = getattr(self.dtype, 'name', None) == 'category'
        if is_category and self._stype is not None:
            s = self.astype(self._stype)
        # A missing or non-string name would make str.join raise TypeError.
        parts = [self.__class__.__name__.lower()]
        if self.name is not None:
            parts.append(str(self.name))
        prefix = '_'.join(parts)
        p = 10 if self._precision is None else self._precision
        values = s.to_json(orient='values', double_precision=p)
        up = {prefix + '_values': Unicode(values).tag(sync=True)}
        if self._index_trait:
            indices = pd.Series(s.index).to_json(orient='values')
            up[prefix + '_index'] = Unicode(indices).tag(sync=True)
        traits.update(up)
        return traits

    def __init__(self, *args, **kwargs):
        '''
        Build the series, then enforce the class level name requirements.

        When ``_sname``/``_iname`` are set and differ from the constructed
        series' name/index name, the names are overwritten; a warning is
        issued if an existing (non-None) name is being replaced.
        '''
        super().__init__(*args, **kwargs)
        # Auto rename depending on class attributes (see above)
        if self._sname is not None and self.name != self._sname:
            if self.name is not None:
                warnings.warn('Series name changed')
            self.name = self._sname
        if self._iname is not None and self.index.name != self._iname:
            if self.index.name is not None:
                warnings.warn('Series index name changed')
            self.index.name = self._iname
class DataFrame(Numerical, pd.DataFrame):
    '''
    Trait supporting analogue of :class:`~pandas.DataFrame`.

    Subclasses configure behavior through the class attributes below: required
    index/column names are validated on construction, columns listed in
    ``_categories`` are stored as categoricals, and columns (or the index)
    listed in ``_traits`` are exported by :func:`_update_traits`.
    '''
    _copy = pd.DataFrame.copy
    _memory_usage = pd.DataFrame.memory_usage
    _groupbys = []      # Column names by which to group the data
    _indices = []       # Required index names
    _columns = []       # Required column entries
    _traits = []        # Traits present as dataframe columns (or series values)
    _categories = {}    # Column name, original type pairs ('label', int) that can be compressed to a category
    _precision = {}     # Traits precision for JSON {col: prec, col1: prec, ...}

    def _revert_categories(self):
        '''
        Change all columns of type category to their native type.
        '''
        for column, dtype in self._categories.items():
            if column in self.columns:
                self[column] = self[column].astype(dtype)

    def _set_categories(self):
        '''
        Change all category like columns from their native type to category type.
        '''
        for column in self._categories:
            if column in self.columns:
                self[column] = self[column].astype('category')

    def _column_trait(self, name, groups):
        '''
        Build the (untagged) trait object for a single column.

        Args:
            name: Column label; must be present in ``self.columns``.
            groups: Group-by object over ``_groupbys`` or None.

        Returns:
            An ``Integer``/``Float`` trait if every entry is (numerically)
            equal to the first one, otherwise a ``Unicode`` trait holding a
            JSON string of the grouped or flattened values.

        Raises:
            TypeError: If the column is constant but its value is neither
                integral nor real.
        '''
        column = self[name]
        # Positional access to the first value; does not depend on the
        # deprecated ``.ix`` indexer or on index labels being unique.
        value = column.values[0]
        if np.all(np.isclose(column, value)):
            # If all the entries are the same only send a single entry to JS.
            if isinstance(value, Integral):
                return Integer(int(value))
            if isinstance(value, Real):
                return Float(float(value))
            raise TypeError('Unknown type for {0} with type {1}'.format(name, type(value)))
        # Subclasses may set _precision to None; treat that as "no overrides".
        p = (self._precision or {}).get(name, 10)
        if groups:
            # If groups exist, make a list of list(s)
            grouped = groups.apply(lambda g: g[name].values)
            return Unicode(grouped.to_json(orient='values', double_precision=p))
        # Otherwise, just send the flattened values
        return Unicode(column.to_json(orient='values', double_precision=p))

    def _update_traits(self):
        '''
        Generate trait objects from column data.

        This function will group columns (if applicable) and form JSON object strings
        from columns which have been declared as traits (using the _traits attribute).
        Category columns are temporarily reverted to their native type and are
        restored afterwards, even if trait generation fails. An empty dataframe
        yields only the custom traits.

        Returns:
            dict: Mapping of "classnamelowercase_name" to trait objects.

        Note:
            This function decides what `trait type`_ to use. This will almost always
            be a JSON (unicode) string formatted to be parsed into an array like
            structure in Javascript.

        .. _trait type: http://traitlets.readthedocs.org/en/stable/trait_types.html
        '''
        self._revert_categories()
        try:
            traits = self._custom_traits()
            if len(self) == 0:
                # There is no first row to compare against.
                return traits
            prefix = self.__class__.__name__.lower()
            self._fi = self.index[0]
            groups = self.groupby(self._groupbys) if self._groupbys else None
            for name in self._traits:
                # Name mangle to ensure uniqueness
                trait_name = '_'.join((prefix, str(name)))
                if name in self.columns:
                    traits[trait_name] = self._column_trait(name, groups).tag(sync=True)
                elif name == self.index.names[0]:
                    # If not in columns, but is index name, send index
                    string = pd.Series(self.index.values).to_json(orient='values')
                    traits[trait_name] = Unicode(string).tag(sync=True)
            return traits
        finally:
            self._set_categories()

    def __init__(self, *args, **kwargs):
        '''
        Build the dataframe and, if it is non-empty, validate it.

        Raises:
            RequiredColumnError: If any entry of ``_columns`` is missing.
            RequiredIndexError: If required index names are missing and the
                number of index levels differs from ``len(_indices)``.
        '''
        super().__init__(*args, **kwargs)
        if len(self) > 0:
            name = self.__class__.__name__
            if self._columns:
                missing = set(self._columns).difference(self.columns)
                if missing:
                    raise RequiredColumnError(missing, name)
            if self._indices:
                missing = set(self._indices).difference(self.index.names)
                # Only rename when a required name is actually absent; an
                # unconditional assignment fails on extra index levels and
                # can silently relabel correctly named ones.
                if missing:
                    if len(self.index.names) != len(self._indices):
                        raise RequiredIndexError(missing, name)
                    self.index.names = self._indices
            # NOTE(review): nesting of this call was inferred (the source's
            # indentation was lost) -- confirm it belongs under the length check.
            self._set_categories()
class Field(DataFrame):
    '''
    A field is composed of the field definition ("field_data") and values
    ("field_values"). Field data define the shape and discretization of a field.
    Field values are scalar magnitudes (or vectors) that describe the field at
    each discretized point in space.

    The dataframe itself holds the field data; the values are kept as a list
    of :class:`~exa.numerical.Series` in the ``field_values`` attribute.
    '''
    # The inherited trait generation performs ``name in self._precision``, so
    # this must be a mapping (no per-column overrides) rather than None.
    _precision = {}
    _vprecision = 10    # values precision (for traits)
    _indices = ['field']

    def copy(self, *args, **kwargs):
        '''
        Copy the field dataframe, including the field values.

        Positional and keyword arguments are forwarded to the dataframe copy.
        '''
        df = self._copy(*args, **kwargs)
        field_values = [field.copy() for field in self.field_values]
        # field_values is keyword-only in __init__; passing it positionally
        # would be interpreted as the dataframe's data.
        return self.__class__(df, field_values=field_values)

    def _custom_traits(self):
        '''
        Obtain field values using the custom trait getter (called automatically
        by :func:`~exa.numerical.Numerical._update_traits`).

        Returns:
            dict: ``field_indices``, ``label_indices`` and ``field_values``
            Unicode traits.
        '''
        traits = {}
        if self._groupbys:
            grps = self.groupby(self._groupbys)
            string = str(list(grps.groups.values())).replace(' ', '')
        else:
            string = pd.Series(self.index.values).to_json(orient='values')
        traits['field_indices'] = Unicode(string).tag(sync=True)
        # NOTE(review): requires 'frame' and 'label' columns, which this class
        # does not list in _columns. Placement outside the if/else was inferred
        # (the source's indentation was lost) -- confirm.
        label_indices = self.groupby('frame').apply(lambda x: x[['label']]).to_json(orient='values')
        traits['label_indices'] = Unicode(label_indices).tag(sync=True)
        s = pd.Series({i: field.values for i, field in enumerate(self.field_values)})
        json_string = s.to_json(orient='values', double_precision=self._vprecision)
        traits['field_values'] = Unicode(json_string).tag(sync=True)
        return traits

    def memory_usage(self):
        '''
        Get the combined memory usage of the field data and field values.

        Returns:
            The dataframe's own memory usage with an added ``field_values``
            entry holding the summed usage of all field value series.
        '''
        data = self._memory_usage()
        data['field_values'] = sum(value.memory_usage() for value in self.field_values)
        return data

    def __init__(self, *args, field_values=None, **kwargs):
        '''
        Args:
            *args: Forwarded to the dataframe constructor. If the first
                positional argument is a :class:`pandas.Series` it is turned
                into a single row dataframe (any further positional arguments
                are then discarded).
            field_values: None, a single :class:`pandas.Series`, or a
                list/tuple/array of objects convertible to Series.
            **kwargs: Forwarded to the dataframe constructor.

        Raises:
            TypeError: If ``field_values`` is of an unsupported type.
        '''
        # Guard against construction without positional arguments.
        if args and isinstance(args[0], pd.Series):
            args = (args[0].to_frame().T, )
        super().__init__(*args, **kwargs)
        if isinstance(field_values, (list, tuple, np.ndarray)):
            self.field_values = [Series(v) for v in field_values]
        elif field_values is None:
            self.field_values = []
        elif isinstance(field_values, pd.Series):
            self.field_values = [Series(field_values)]
        else:
            raise TypeError('Wrong type for field_values with type {}'.format(type(field_values)))
        # Name each value series by its position in the list.
        for i, values in enumerate(self.field_values):
            values.name = i
class Field3D(Field):
    '''
    Dataframe for storing dimensions of a scalar or vector field of 3D space.

    +-------------------+----------+-------------------------------------------+
    | Column            | Type     | Description                               |
    +===================+==========+===========================================+
    | nx                | int      | number of grid points in x                |
    +-------------------+----------+-------------------------------------------+
    | ny                | int      | number of grid points in y                |
    +-------------------+----------+-------------------------------------------+
    | nz                | int      | number of grid points in z                |
    +-------------------+----------+-------------------------------------------+
    | ox                | float    | field origin point in x                   |
    +-------------------+----------+-------------------------------------------+
    | oy                | float    | field origin point in y                   |
    +-------------------+----------+-------------------------------------------+
    | oz                | float    | field origin point in z                   |
    +-------------------+----------+-------------------------------------------+
    | xi                | float    | First component in x                      |
    +-------------------+----------+-------------------------------------------+
    | xj                | float    | Second component in x                     |
    +-------------------+----------+-------------------------------------------+
    | xk                | float    | Third component in x                      |
    +-------------------+----------+-------------------------------------------+
    | yi                | float    | First component in y                      |
    +-------------------+----------+-------------------------------------------+
    | yj                | float    | Second component in y                     |
    +-------------------+----------+-------------------------------------------+
    | yk                | float    | Third component in y                      |
    +-------------------+----------+-------------------------------------------+
    | zi                | float    | First component in z                      |
    +-------------------+----------+-------------------------------------------+
    | zj                | float    | Second component in z                     |
    +-------------------+----------+-------------------------------------------+
    | zk                | float    | Third component in z                      |
    +-------------------+----------+-------------------------------------------+

    Every column listed above is both required and exported as a trait.

    Note:
        Each field should be flattened into an N x 1 (scalar) or N x 3 (vector)
        series or dataframe respectively. The orientation of the flattening
        should have x as the outer loop and z values as the inner loop (for both
        cases). This is sometimes called C-major order, C-style order, and has
        the last index changing the fastest and the first index changing the
        slowest.

    See Also:
        :class:`~exa.numerical.Field`
    '''
    _columns = [
        'nx', 'ny', 'nz',
        'ox', 'oy', 'oz',
        'xi', 'xj', 'xk',
        'yi', 'yj', 'yk',
        'zi', 'zj', 'zk',
    ]
    # Same entries as the required columns, held in a separate list object.
    _traits = list(_columns)
class SparseSeries(Numerical, pd.SparseSeries):
    '''
    Sparse series with the :class:`~exa.numerical.Numerical` trait and copy
    behavior mixed in.
    '''
    # Unwrapped pandas copy used by Numerical.copy to preserve the class type.
    _copy = pd.SparseSeries.copy
class SparseDataFrame(Numerical, pd.SparseDataFrame):
    '''
    Trait supporting sparse dataframe.

    Subclasses may declare required column and index names through
    ``_columns`` and ``_indices``; both are validated on construction of a
    non-empty frame.
    '''
    _copy = pd.SparseDataFrame.copy
    # Defaults are needed because __init__ reads both attributes and no base
    # class of this type defines them.
    _indices = []       # Required index names
    _columns = []       # Required column entries

    def __init__(self, *args, **kwargs):
        '''
        Build the sparse dataframe and, if it is non-empty, validate it.

        Raises:
            RequiredColumnError: If any entry of ``_columns`` is missing.
            RequiredIndexError: If required index names are missing and the
                number of index levels differs from ``len(_indices)``.
        '''
        super().__init__(*args, **kwargs)
        if len(self) > 0:
            name = self.__class__.__name__
            if self._columns:
                missing = set(self._columns).difference(self.columns)
                if missing:
                    raise RequiredColumnError(missing, name)
            if self._indices:
                missing = set(self._indices).difference(self.index.names)
                # Only rename when a required name is actually absent; an
                # unconditional assignment fails on extra index levels and
                # can silently relabel correctly named ones.
                if missing:
                    if len(self.index.names) != len(self._indices):
                        raise RequiredIndexError(missing, name)
                    self.index.names = self._indices