Source code for exa.container

# -*- coding: utf-8 -*-
# Copyright (c) 2015-2016, Exa Analytics Development Team
# Distributed under the terms of the Apache License 2.0
'''
Container
########################
The :class:`~exa.container.BaseContainer` class is the primary object for
data processing, analysis, and visualization. Containers are composed of
n-dimensional spreadsheet-like (see :mod:`~exa.numerical`) objects whose
columns contain data for 2D and 3D visualization.

The :class:`~exa.container.BaseContainer` is akin to a :class:`~pandas.HDFStore`
in that it is a container for dataframes (and saves to an HDF5 file). It is
different in that it provides visualization tools access to the data contained
via automated JSON strings, transferable between languages.

See Also:
    :mod:`~exa.relational.container` and :mod:`~exa.widget`
'''
import os
import numpy as np
import pandas as pd
import networkx as nx
from sys import getsizeof
from copy import deepcopy
from traitlets import Bool
from exa import mpl
from exa._config import config
from exa.widget import ContainerWidget
from exa.numerical import Series, DataFrame, SparseSeries, SparseDataFrame, Field
from exa.relational import ContainerFile, scoped_session
from exa.utility import convert_bytes


# Lookup tables used by the data network visualization
# (see Container.network). The "r_" prefixed tables are the reverse lookups
# (color -> label) used when building plot legends.
edge_types = ['index-index', 'index-column']
edge_colors = mpl.sns.color_palette('viridis', 2)
edge_color_map = {etype: color for etype, color in zip(edge_types, edge_colors)}
r_edge_color_map = {color: etype for etype, color in edge_color_map.items()}
# Node types are kept as an ordered list of (type, color) pairs because the
# color lookup walks them in order with isinstance checks.
node_types = [Field, SparseSeries, DataFrame, SparseDataFrame, Series, pd.DataFrame, pd.Series]
node_colors = mpl.sns.color_palette('viridis', 7)
node_color_map = list(zip(node_types, node_colors))
r_node_color_map = {
    color: '.'.join((ntype.__module__, ntype.__name__))
    for ntype, color in node_color_map
}


class Container:
    '''
    Container class responsible for all features related to data management.

    Data objects (pandas series and dataframes, including the subclasses
    imported from :mod:`~exa.numerical`) are stored as instance attributes
    and discovered by type (see ``_data``); every other public attribute is
    treated as metadata (see ``_rel``).

    Args:
        name (str): Name of the container
        description (str): Description of the container
        meta (dict): Additional metadata (defaults to an empty dictionary)
        kwargs: Data objects (or other attributes) to attach, by name
    '''
    _widget_class = ContainerWidget
    _getter_prefix = 'compute'
    _cardinal_axis = None

    @staticmethod
    def _public_name(name):
        '''Strip the leading underscore (if any) from an attribute name.'''
        return name[1:] if name.startswith('_') else name

    def copy(self, **kwargs):
        '''
        Create a copy of the current object.

        Args:
            kwargs: Attributes that replace those of the current object

        Returns:
            container: New container of the same class
        '''
        cls = self.__class__
        kws = self._rel(copy=True)
        # Data may be stored under a private name (e.g. "_atom"); normalize
        # to the public name so that a user supplied "atom" replaces the
        # copied object instead of being set alongside it.
        dfs = {self._public_name(k): v for k, v in self._data(copy=True).items()}
        kws.update(dfs)      # We update kws in this order because we
        kws.update(kwargs)   # want to respect the user's kwargs.
        return cls(**kws)

    def concat(self, *args, **kwargs):
        '''
        Concatenate any number of container objects with the current object
        into a single container object.

        See Also:
            For argument description, see :func:`~exa.container.concat`.

        Raises:
            NotImplementedError: Always (not yet implemented)
        '''
        raise NotImplementedError()

    def slice_by_indices(self, key):
        '''
        Slice the container by series or dataframe index.

        Args:
            key: Integer, list, or slice applied to every data object's index

        Returns:
            container: New container of the same class built from the slices

        Warning:
            Does not make a copy, must call the **.copy()** method on the
            resulting container if a copy is needed.
        '''
        if isinstance(key, (int, np.integer)):
            key = [key]
        kwargs = {}
        for name, data in self._data().items():
            k = self._public_name(name)
            if isinstance(data, Field):
                # Fields carry their values separately; select the values
                # matching the sliced index and rebuild the field.
                d = data.ix[key]
                i = d.index.values
                v = data.field_values[i]
                kwargs[k] = data.__class__(d, field_values=v)
            else:
                kwargs[k] = data.ix[key]
        return self.__class__(name=self.name, description=self.description,
                              meta=self.meta, **kwargs)

    def slice_by_cardinal_axis(self, key):
        '''
        Slice the container according to its cardinal axis.

        Objects indexed by the cardinal axis are sliced by index, objects
        having the cardinal axis as a column are filtered on that column, and
        all other objects are passed through unchanged.

        Args:
            key: Integer, list, or slice of cardinal axis values

        Returns:
            container: New container of the same class built from the slices

        See Also:
            Note the warning in :func:`~exa.container.Container.slice_by_indices`.
        '''
        if isinstance(key, (int, np.integer)):
            key = [key]
        elif isinstance(key, slice):
            key = self[self._cardinal_axis].index.values[key]
        kwargs = {}
        for name, data in self._data().items():
            k = self._public_name(name)
            # Series objects have no columns; treat them as having none
            columns = getattr(data, 'columns', ())
            if self._cardinal_axis in data.index.names:
                kwargs[k] = data.ix[key]
            elif self._cardinal_axis in columns:
                kwargs[k] = data[data[self._cardinal_axis].isin(key)]
            else:
                kwargs[k] = data
        return self.__class__(name=self.name, description=self.description,
                              meta=self.meta, **kwargs)

    def info(self):
        '''
        Display information about the container's objects.

        Returns:
            inf (:class:`~pandas.DataFrame`): Type and size per object, indexed
            (and sorted) by object name

        Note:
            Sizes are reported in bytes.
        '''
        names = []
        types = []
        sizes = []
        names.append('WIDGET')
        types.append('-')
        s = 0
        if self._widget is not None:
            for obj in self._widget._trait_values.values():
                s += getsizeof(obj)
        sizes.append(s)
        names.append('METADATA')
        types.append('-')
        s = 0
        for obj in self._rel().values():
            s += getsizeof(obj)
        sizes.append(s)
        for name, obj in self._data().items():
            names.append(self._public_name(name))
            types.append('.'.join((obj.__module__, obj.__class__.__name__)))
            if isinstance(obj, pd.Series):
                sizes.append(obj.memory_usage())
            else:
                sizes.append(obj.memory_usage().sum())
        inf = pd.DataFrame.from_dict({'object': names, 'type': types, 'size': sizes})
        inf.set_index('object', inplace=True)
        return inf.sort_index()

    def memory_usage(self):
        '''
        Estimate the memory usage of the entire container.

        Returns:
            usage (str): Space separated output of
            :func:`~exa.utility.convert_bytes`
        '''
        n = getsizeof(self)
        return ' '.join((str(s) for s in convert_bytes(n)))

    def network(self):
        '''
        Display information about the container's object relationships.

        Nodes are the container's data objects (sized by memory usage and
        colored by type); edges connect objects that share an index name or
        whose index name appears in another object's column names.

        Note:
            Due to quirks of plotting, rerunning this command until a "pleasing"
            visual is generated may be useful.
        '''
        def get_color(obj):
            '''Gets the color of a node based on the node's data type.'''
            for k, v in node_color_map:
                if isinstance(obj, k):
                    return v
            return 'gray'

        def legend(items, mapper, title, loc, ax):
            '''Legend creation helper'''
            proxies = []
            descriptions = []
            for k in set(items):
                if title == 'Data Type':
                    line = mpl.sns.mpl.lines.Line2D([], [], linestyle='none', color=k, marker='o')
                else:
                    line = mpl.sns.mpl.lines.Line2D([], [], linestyle='-', color=k)
                proxies.append(line)
                descriptions.append(mapper[k])
            leg = ax.legend(proxies, descriptions, title=title, loc=loc, frameon=True)
            leg_frame = leg.get_frame()
            leg_frame.set_facecolor('white')
            leg_frame.set_edgecolor('black')
            return leg, ax

        inf = self.info()
        inf = inf[inf['type'] != '-']
        nodes = inf.index.values
        # Scale sizes into a new series (no in-place modification of the
        # filtered frame) and avoid dividing by a zero/undefined maximum.
        raw_sizes = inf['size']
        largest = raw_sizes.max()
        scale = 13000/largest if largest > 0 else 0
        node_sizes = raw_sizes*scale + 2000
        colors_by_node = {}
        types_by_node = {}
        edges = {}
        items = self._data().items()
        for k0, v0 in items:
            n0 = self._public_name(k0)
            colors_by_node[n0] = get_color(v0)
            types_by_node[n0] = '.'.join((v0.__class__.__module__, v0.__class__.__name__))
            for k1, v1 in items:
                if v0 is v1:
                    continue
                n1 = self._public_name(k1)
                for name in v0.index.names:
                    if name is None:
                        continue
                    if name in v1.index.names:
                        edges[(n0, n1)] = edge_color_map['index-index']
                        edges[(n1, n0)] = edge_color_map['index-index']
                    # Series have no columns and column labels need not be
                    # strings; only string labels can match by substring.
                    for col in getattr(v1, 'columns', ()):
                        if not isinstance(col, str):
                            continue
                        # Catches things like index name == 'index', column name == 'index0'
                        if name in col and '_' not in col:
                            edges[(n0, n1)] = edge_color_map['index-column']
                            edges[(n1, n0)] = edge_color_map['index-column']
        g = nx.Graph()
        g.add_nodes_from(nodes)
        g.add_edges_from(edges.keys())
        node_size = [node_sizes[k] for k in g.nodes()]
        node_color = [colors_by_node[k] for k in g.nodes()]
        edge_color = [edges[k] for k in g.edges()]
        labels = {k: ' {}\n({})'.format(k, types_by_node[k]) for k in g.nodes()}
        fig, ax = mpl.sns.plt.subplots(1, figsize=(14, 9), dpi=300)
        ax.axis('off')
        pos = nx.spring_layout(g)
        nx.draw_networkx_nodes(g, pos=pos, ax=ax, alpha=0.7, node_size=node_size,
                               node_color=node_color)
        nx.draw_networkx_labels(g, pos=pos, labels=labels, font_size=17,
                                font_weight='bold', ax=ax)
        nx.draw_networkx_edges(g, pos=pos, edge_color=edge_color, width=2, ax=ax)
        l1, ax = legend(edge_color, r_edge_color_map, 'Connection', (1, 0), ax)
        l2, ax = legend(node_color, r_node_color_map, 'Data Type', (1, 0.3), ax)
        # The second call to ax.legend replaces the first legend; add it back.
        fig.gca().add_artist(l1)

    def save(self, path=None):
        '''
        Save the container as an HDF5 archive.

        Args:
            path (str): Path where to save the container; if None, the file is
                named after the container's hexuid, if a directory, the file
                is created inside of it

        Raises:
            ValueError: If the file path has no ".hdf5" or ".hdf" extension
        '''
        categorical = pd.types.dtypes.CategoricalDtype
        # Resolve (and validate) the path first so that no file record is
        # created for a save that cannot take place.
        if path is None:
            path = self.hexuid + '.hdf5'
        elif os.path.isdir(path):
            path = os.path.join(path, self.hexuid + '.hdf5')
        elif not path.endswith(('.hdf5', '.hdf')):
            raise ValueError('File path must have a ".hdf5" or ".hdf" extension.')
        # Save the file record
        with scoped_session() as session:
            cfile = ContainerFile(name=self.name, description=self.description,
                                  size=getsizeof(self))
            session.add(cfile)
        # Save the data
        with pd.HDFStore(path, 'w') as store:
            store['kwargs'] = pd.Series()
            store.get_storer('kwargs').attrs.metadata = self._rel()
            fc = 0    # Field counter (see special handling of fields below)
            for name, data in self._data().items():
                if hasattr(data, '_revert_categories'):
                    data._revert_categories()
                try:
                    name = self._public_name(name)
                    if isinstance(data, Field):    # Fields are handled separately
                        fname = 'FIELD{}_'.format(fc) + name + '/'
                        store[fname + 'data'] = pd.DataFrame(data)
                        for i, field in enumerate(data.field_values):
                            ffname = fname + 'values' + str(i)
                            if isinstance(field, pd.Series):
                                store[ffname] = pd.Series(field)
                            else:
                                store[ffname] = pd.DataFrame(field)
                        fc += 1
                    elif isinstance(data, Series):
                        s = pd.Series(data)
                        if isinstance(data.dtype, categorical):
                            s = s.astype('O')
                        store[name] = s
                    elif isinstance(data, DataFrame):
                        store[name] = pd.DataFrame(data)
                    elif isinstance(data, SparseSeries):
                        s = pd.SparseSeries(data)
                        if isinstance(data.dtype, categorical):
                            s = s.astype('O')
                        store[name] = s
                    elif isinstance(data, SparseDataFrame):
                        store[name] = pd.SparseDataFrame(data)
                    else:
                        # Plain pandas objects: store categoricals as objects
                        # without modifying the container's own data.
                        out = data
                        if isinstance(getattr(out, 'dtype', None), categorical):
                            out = out.astype('O')
                        elif isinstance(out, pd.DataFrame):
                            cat_cols = [col for col in out
                                        if isinstance(out[col].dtype, categorical)]
                            if cat_cols:
                                out = out.copy()
                                for col in cat_cols:
                                    out[col] = out[col].astype('O')
                        store[name] = out
                finally:
                    # Restore categories even if writing this object failed
                    if hasattr(data, '_set_categories'):
                        data._set_categories()

    @classmethod
    def load(cls, pkid_or_path=None):
        '''
        Load a container object from a persistent location or file path.

        Args:
            pkid_or_path: Integer pkid corresponding to the container table or file path

        Returns:
            container: The saved container object

        Raises:
            NotImplementedError: If the argument is not a path to an existing
                file (lookup by pkid is not yet supported)
        '''
        import re    # Local import; only needed to parse stored field keys

        path = pkid_or_path
        if not isinstance(path, str) or not os.path.isfile(path):
            raise NotImplementedError('Cannot lookup automatic path yet..')
        kwargs = {}
        with pd.HDFStore(path) as store:
            for key in store.keys():
                name = str(key[1:])    # Store keys have a leading "/"
                if name == 'kwargs':
                    kwargs.update(store.get_storer(key).attrs.metadata)
                else:
                    kwargs[name] = store[key]
        # Process any fields; they are stored (see save) under the keys
        # "FIELD<n>_<name>/data" and "FIELD<n>_<name>/values<i>". Keys are
        # matched exactly so that FIELD1 never captures FIELD10 and field
        # names containing "data" or "values" are handled correctly.
        field_key = re.compile(r'^FIELD(\d+)_(.+)/(?:(data)|values(\d+))$')
        fields = {}
        for key in list(kwargs):
            match = field_key.match(key) if isinstance(key, str) else None
            if match is None:
                continue
            number, arg, is_data, index = match.groups()
            entry = fields.setdefault((int(number), arg), {'values': {}})
            if is_data:
                entry['data'] = kwargs.pop(key)
            else:
                entry['values'][int(index)] = kwargs.pop(key)
        for (number, arg), entry in sorted(fields.items()):
            kwargs[arg] = entry['data']
            # Numeric ordering (values2 comes before values10)
            kwargs[arg + '_values'] = [entry['values'][i] for i in sorted(entry['values'])]
        return cls(**kwargs)

    def _rel(self, copy=False):
        '''
        Get all (propagatable) relational and metadata data of the container (
        primary keys are not propagatable).

        Args:
            copy (bool): If True, deep copy values (except those whose
                attribute name contains "id")

        Returns:
            rel (dict): Public, non-data attributes by name
        '''
        rel = {}
        for key, obj in self.__dict__.items():
            if not isinstance(obj, (pd.Series, pd.DataFrame)) and not key.startswith('_'):
                if copy and 'id' not in key:
                    rel[key] = deepcopy(obj)
                else:
                    rel[key] = obj
        return rel

    def _data(self, copy=False):
        '''
        Get all data associated with the container as key value pairs.

        Args:
            copy (bool): If True, return copies of the data objects

        Returns:
            data (dict): Data objects by attribute name (which may have a
            leading underscore)
        '''
        data = {}
        for key, obj in self.__dict__.items():
            if isinstance(obj, (pd.Series, pd.DataFrame, pd.SparseSeries, pd.SparseDataFrame)):
                if copy:
                    data[key] = obj.copy()
                else:
                    data[key] = obj
        return data

    def _custom_traits(self):
        '''
        Placeholder for custom container traits (e.g. traits that are comprised
        of data present in multiple data objects).
        '''
        return {}

    def _update_traits(self):
        '''
        Jupyter notebook widgets require data to be available within a
        :class:`~exa.widget.Widget` object. This allows notebook extensions
        (nbextensions - written in JavaScript) to access backend (Python) data
        via `ipywidgets`_.

        .. _ipywidgets: https://ipywidgets.readthedocs.io/en/latest/
        '''
        if self._widget is not None:    # If a corresponding widget exists, build traits
            if len(self._data()) == 0:
                traits = {'test': Bool(True).tag(sync=True)}
            else:
                # Custom traits are computed once and take precedence over
                # the default "test" trait.
                traits = {'test': Bool(False).tag(sync=True)}
                traits.update(self._custom_traits())
                for obj in self._data().values():
                    has_traits = hasattr(obj, '_traits') or isinstance(obj, (Series, SparseSeries))
                    if has_traits and len(obj) > 0:
                        traits.update(obj._update_traits())
            self._widget.add_traits(**traits)    # Adding traits to the widget makes
            self._traits_need_update = False     # them accessible from nbextensions (JavaScript).

    def __delitem__(self, key):
        # Deleting a missing key is silently ignored
        self.__dict__.pop(key, None)

    def __sizeof__(self):
        '''Note that this function must return a Python integer.'''
        return int(self.info()['size'].sum())

    def __getitem__(self, key):
        '''
        Attribute access by name (strings) or slicing (integers, slices, and
        lists) by index or, if defined, by cardinal axis.

        Raises:
            KeyError: If the key's type is not supported
        '''
        if isinstance(key, str):
            return getattr(self, key)
        if isinstance(key, (int, np.integer, slice, list)):
            if self._cardinal_axis is None:
                return self.slice_by_indices(key)
            return self.slice_by_cardinal_axis(key)
        raise KeyError(key)

    def __init__(self, name=None, description=None, meta=None, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)
        self.name = name
        self.description = description
        self.meta = {} if meta is None else meta
        self._traits_need_update = True
        # This will create an instance of the widget class (if present)
        self._widget = self._widget_class(self) if config['dynamic']['notebook'] == 'true' else None

    def _repr_html_(self):
        # Without a widget there is no HTML representation; returning None
        # lets the frontend fall back to the default representation.
        if self._widget is None:
            return None
        if self._traits_need_update:
            self._update_traits()
        return self._widget._repr_html_()
class TypedMeta(type):
    '''
    This metaclass creates statically typed class attributes using the property
    framework. A type specification is either a single type or a tuple of
    types.

    .. code-block:: Python

        class TestMeta(TypedMeta):
            attr1 = (int, float)
            attr2 = DataFrame

        class TestClass(metaclass=TestMeta):
            def __init__(self, attr1, attr2):
                self.attr1 = attr1
                self.attr2 = attr2

    The above code dynamically creates code that looks like the following:

    .. code-block:: Python

        class TestClass:
            @property
            def attr1(self):
                return self._attr1

            @attr1.setter
            def attr1(self, obj):
                if not isinstance(obj, (int, float)):
                    raise TypeError('attr1 must be int or float')
                self._attr1 = obj

            @attr1.deleter
            def attr1(self):
                del self._attr1

            @property
            def attr2(self):
                return self._attr2

            @attr2.setter
            def attr2(self, obj):
                if not isinstance(obj, DataFrame):
                    raise TypeError('attr2 must be DataFrame')
                self._attr2 = obj

            @attr2.deleter
            def attr2(self):
                del self._attr2

            def __init__(self, attr1, attr2):
                self.attr1 = attr1
                self.attr2 = attr2

    Note:
        The generated getter and deleter use item access on the instance
        (``self[key]`` and ``del self[key]``) and the getter reads the
        instance's ``_getter_prefix`` attribute.
    '''
    @staticmethod
    def create_property(name, ptype):
        '''
        Creates a custom property with a getter that performs computing
        functionality (if available) and raise a type error if setting with the
        wrong type.

        Args:
            name (str): Public name of the property
            ptype: Type, or tuple of types, the property's value must have

        Returns:
            prop (property): Property storing its value at ``_name``

        Note:
            By default, the setter attempts to convert the object to the
            correct type (for a tuple of types, each type is tried in order);
            a type error is raised if this fails.
        '''
        pname = '_' + name    # This will be where the data is store (e.g. self._name)
        candidates = ptype if isinstance(ptype, tuple) else (ptype, )

        # This is the default property "getter" for container data objects.
        # If the property value is None, this function will check for a
        # convenience method with the signature, self.compute_name() and call
        # it prior to returning the property value.
        def getter(self):
            if not hasattr(self, pname):
                compute = '{}{}'.format(self._getter_prefix, pname)
                if hasattr(self, compute):
                    self[compute]()
            if not hasattr(self, pname):
                raise AttributeError('Please compute or set {} first.'.format(name))
            return getattr(self, pname)

        # This is the default property "setter" for container data objects.
        # Prior to setting a property value, this function checks that the
        # object's type is correct.
        def setter(self, obj):
            if not isinstance(obj, ptype):
                for candidate in candidates:
                    try:
                        # Constructors may raise anything on bad input, so
                        # any failure simply moves on to the next candidate.
                        obj = candidate(obj)
                    except Exception:
                        continue
                    break
                else:
                    raise TypeError('Must be able to convert object {0} to {1} (or must be of type {1})'.format(name, ptype))
            setattr(self, pname, obj)

        # Deletes the property's value.
        def deleter(self):
            del self[pname]

        return property(getter, setter, deleter)

    def __new__(metacls, name, bases, clsdict):
        '''
        Modification of the class definition occurs here; we iterate over all
        statically typed attributes and attach their property (see
        :func:`~exa.container.TypedMeta.create_property`) definition, returning
        the new class definition.
        '''
        def is_type_spec(obj):
            '''True for a type or a non-empty tuple made only of types.'''
            if isinstance(obj, type):
                return True
            return (isinstance(obj, tuple) and len(obj) > 0 and
                    all(isinstance(item, type) for item in obj))

        for k, v in metacls.__dict__.items():
            if k[0] != '_' and is_type_spec(v):
                clsdict[k] = metacls.create_property(k, v)
        return super().__new__(metacls, name, bases, clsdict)