# Source code for matplotlib_helpers.chart

from collections import OrderedDict
import copy
import datetime as dt
import itertools

from sklearn import linear_model
import matplotlib as mpl
import numpy as np
import pandas as pd


def time_total_seconds(t):
    '''
    Convert a time of day to the number of seconds since midnight.

    Parameters
    ----------
    t : datetime.time
        Time of day.  Any object exposing ``hour``, ``minute``, ``second``
        and ``microsecond`` attributes is accepted.

    Returns
    -------
    int or float
        Seconds elapsed since midnight, with the microseconds contributing
        the fractional part.
    '''
    return (60 * 60 * t.hour + 60 * t.minute + t.second
            + 1e-6 * t.microsecond)


def time_safe(series):
    '''
    Return a series that is safe to use numerically.

    Parameters
    ----------
    series : pandas.Series
        Series of values.

    Returns
    -------
    pandas.Series
        If the first element of :data:`series` is a ``datetime.time``, a new
        series with every element mapped through ``time_total_seconds``.
        Otherwise (including when :data:`series` is empty), :data:`series`
        itself is returned unchanged.
    '''
    # An empty series has no first element to inspect, so there is nothing
    # to convert.
    if series.empty:
        return series
    # Only the first element is inspected to decide whether to convert.
    if isinstance(series.iloc[0], dt.time):
        return series.map(time_total_seconds)
    return series


def data_groups(df, group_key, data_key):
    '''
    Group one data column of a data frame by another column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data frame.
    group_key : str
        Label of column to group by.
    data_key : str
        Label of column containing the data to group.

    Returns
    -------
    pandas group-by object
        Groups of the :data:`data_key` column keyed by the values of the
        :data:`group_key` column.  If the first value of the data column is
        a ``datetime.time``, the data are first mapped through
        ``time_total_seconds`` and grouped by index level.
    '''
    data = df[data_key]
    # Guard against an empty frame (e.g., after filtering out non-positive
    # values for a log scale): there is no first element to inspect.
    if not data.empty and isinstance(data.iloc[0], dt.time):
        return (df.set_index(group_key)[data_key].map(time_total_seconds)
                .groupby(level=0))
    return df.groupby(group_key)[data_key]


def unique_by_column(df):
    '''
    Collect the sorted unique values of every column in a data frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Data frame.

    Returns
    -------
    pandas.Series
        Mapping from each column label to ordered list of unique values in
        corresponding column in data frame.
    '''
    # Explicit object dtype: each element is a list, and a frame without
    # columns would otherwise leave the dtype of the empty result to pandas'
    # version-dependent default.
    return pd.Series([sorted(df[column].unique()) for column in df.columns],
                     index=df.columns, dtype=object)


def groupif(df, key):
    '''
    Iterate over groups of a data frame, tolerating missing group keys.

    Parameters
    ----------
    df : pandas.DataFrame
        Data frame.
    key : object or list
        Column label to group by, or a list of column labels.  Any label may
        be ``None``, in which case no grouping is done for that position.

    Yields
    ------
    (group_key, pandas.DataFrame)
        One item per group.  If :data:`key` is a list, ``group_key`` is a
        tuple with one entry per position in :data:`key`; otherwise it is a
        single value.  Positions whose label is ``None`` are reported as
        ``0``.  If every label is ``None``, a single item containing the
        whole data frame is yielded.
    '''
    singleton_key = not isinstance(key, list)
    labels = [key] if singleton_key else list(key)

    # Object dtype keeps labels exactly as given (e.g., an integer label is
    # not turned into a float because a sibling label is ``None``).
    is_missing = pd.Series(labels, dtype=object).isnull().tolist()
    group_labels = [label for label, missing in zip(labels, is_missing)
                    if not missing]

    if not group_labels:
        # Nothing to group by: report the whole frame under an all-zero key.
        full_key = [0] * len(labels)
        if singleton_key:
            yield full_key[0], df
        else:
            yield tuple(full_key), df
    else:
        for group_key, df_group in df.groupby(group_labels):
            # Depending on the pandas version, grouping by a single label
            # yields either a scalar or a 1-tuple.
            if not isinstance(group_key, (list, tuple)):
                group_key = [group_key]
            # Merge group values back into their original positions, using
            # 0 as the placeholder for positions that were not grouped.
            full_key = []
            position = 0
            for missing in is_missing:
                if missing:
                    full_key.append(0)
                else:
                    full_key.append(group_key[position])
                    position += 1
            if singleton_key:
                yield full_key[0], df_group
            else:
                yield tuple(full_key), df_group


def encode(df_data, **kwargs):
    '''
    Plot a grid of subplots from a data frame, mapping categorical columns to
    subplot rows, subplot columns, colors and marker shapes.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Data frame containing the columns named by the keyword arguments.
    x : str
        Label of column containing ``x``-dimension.
    y  : str
        Label of column containing ``y``-dimension.
    row  : str, optional
        Label of column containing row categories.  If ``None``, all data is
        plotted in a single row of plots.
    column  : str, optional
        Label of column containing column categories. If ``None``, all data is
        plotted in a single column of plots.
    color  : str, optional
        Label of column containing color categories. If ``None``, all data is
        plotted in the same color.
    shape  : str, optional
        Label of column containing shape categories. If ``None``, all data is
        plotted using the same marker shape.
    style  : str, optional
        Label of column containing style categories.  The label and its
        unique values are reported in the result, but this function does not
        otherwise use them when plotting.
    sharexscale  : bool or 'column', optional
        If ``True`` (default) all subplots share the same scale on the ``x``
        axis. If ``'column'`` all subplots *in the same column* share the same
        ``x`` axis.  If ``False``, the ``x`` axis of each subplot is scaled
        independently.
    shareyscale  : bool or 'row', optional
        If ``True`` (default) all subplots share the same scale on the ``y``
        axis. If ``'row'`` all subplots *in the same row* share the same ``y``
        axis.  If ``False``, the ``y`` axis of each subplot is scaled
        independently.
    logx, logy : bool, optional
        Use a logarithmic scale for the respective axis.  Non-positive values
        are ignored when computing the shared axis limits.
    cell_size : float, optional
        Default width and height of each grid cell (default ``3``).
    cell_width, cell_height : float, optional
        Width/height of each grid cell (default :data:`cell_size`).
    regression : dict, optional
        If present, a regression line is fitted and plotted for every
        (row, column, color, shape) group containing more than one sample.
        Recognized items are ``'model'`` (default
        ``sklearn.linear_model.LinearRegression()``) and ``'N'``, the number
        of points at which the fitted line is evaluated (default ``10``).
    fill : bool, optional
        Fill markers
    stroke : bool, optional
        Draw marker outlines
    linestyle : str, optional
        Line style to use for plot.

        By default, if :data:`shape` is set, :data:`linestyle` is set to
        ``"none"``.  If :data:`shape` is not set, :data:`linestyle` is set to
        ``"--"`` by default.

    Returns
    -------
    dict
        Dictionary with the following items:

        ``'fig'``
            The ``matplotlib`` figure.
        ``'axes'``
            Nested ordered dictionary of axes indexed by row key then by
            column key (key ``0`` is used where no row/column is set).
        ``'keys'``
            ``pandas.Series`` mapping each categorical argument name to the
            corresponding column label.
        ``'values'``
            ``pandas.Series`` mapping each categorical argument name to a
            corresponding list of unique category values.
        ``'regression_model'``
            Only present if :data:`regression` was passed.  Nested
            dictionary indexed by row, column, color and shape key holding a
            deep copy of the fitted model for each group.
    '''
    # ``import matplotlib`` alone does not guarantee that these submodules
    # are loaded as attributes of the package, so import them explicitly.
    from matplotlib import gridspec as mpl_gridspec
    from matplotlib import markers as mpl_markers
    from matplotlib import pyplot as plt

    def _min_max(group_stats, group_key, overall_stats):
        '''Return ``(min, max)`` for a group, else for all of the data.'''
        stats = overall_stats
        if group_stats is not None:
            # ``.loc`` selects the statistics of one group both when
            # ``describe()`` returned a frame (one row per group) and when
            # it returned a series with a (group, statistic) index.
            try:
                stats = group_stats.loc[group_key]
            except KeyError:
                # Group has no usable data (e.g., everything was filtered
                # out for a log scale); fall back to the overall range.
                pass
        return stats['min'], stats['max']

    categorical = 'row', 'column', 'color', 'shape', 'style'

    # Get column label/key associated with each category (e.g., row, column).
    # Object dtype keeps the labels (and ``None``) exactly as passed in.
    keys = pd.Series([kwargs.get(k) for k in categorical], index=categorical,
                     dtype=object)

    # For each category (e.g., row, column) get ordered list of unique values.
    df = df_data[keys[~keys.isnull()].unique().tolist()]
    unique_by_column_i = unique_by_column(df)
    values = pd.Series([None if keys[category_i] is None
                        else unique_by_column_i.get(keys[category_i])
                        for category_i in categorical], index=categorical,
                       dtype=object)

    # Find row, column, x and y range limits.  Rows share a ``y`` range and
    # columns share an ``x`` range.
    descriptions = {'row': None, 'column': None}
    for category_i, axis_type_i in (('row', 'y'), ('column', 'x')):
        if keys[category_i] is None:
            continue
        df_i = df_data
        if kwargs.get('log' + axis_type_i):
            # Non-positive values cannot be shown on a log scale.
            df_i = df_data.loc[df_data[kwargs[axis_type_i]] > 0]
        groups = data_groups(df_i, keys[category_i], kwargs[axis_type_i])
        descriptions[category_i] = groups.describe()

    for axis_type_i in 'xy':
        series_i = time_safe(df_data[kwargs[axis_type_i]])
        if kwargs.get('log' + axis_type_i):
            series_i = series_i[series_i > 0]
        descriptions[axis_type_i] = series_i.describe()

    row_values = [0] if values['row'] is None else values['row']
    column_values = [0] if values['column'] is None else values['column']
    row_count = len(row_values)
    column_count = len(column_values)

    # extra column for legend
    grid = mpl_gridspec.GridSpec(row_count, column_count + 1)

    cell_size = kwargs.get('cell_size', 3)
    cell_width = kwargs.get('cell_width', cell_size)
    cell_height = kwargs.get('cell_height', cell_size)

    fig = plt.figure(figsize=(cell_width * (column_count + 1),
                              cell_height * row_count))

    axes = OrderedDict()
    for i, row_i in enumerate(row_values):
        axes[row_i] = OrderedDict([(column_j, fig.add_subplot(grid[i, j]))
                                   for j, column_j in
                                   enumerate(column_values)])

    # Assign one color per color category, cycling through the default
    # property cycle (the cycle new axes are initialized from).
    colors = OrderedDict(zip(values['color'] or [0],
                             (props['color'] for props in
                              itertools.cycle(mpl.rcParams
                                              ['axes.prop_cycle']))))

    # Assign one marker per shape category: filled markers first, then the
    # remaining markers, skipping those that draw nothing.
    filled_markers = set(mpl_markers.MarkerStyle.filled_markers)
    nonfilled_markers = [marker_i for marker_i, name_i in
                         mpl_markers.MarkerStyle.markers.items()
                         if marker_i is not None and marker_i != 'None'
                         and name_i != 'nothing'
                         and marker_i not in filled_markers][::-1]
    markers = OrderedDict(zip(values['shape'] or [0],
                              itertools.cycle(itertools
                                              .chain(mpl_markers.MarkerStyle
                                                     .filled_markers,
                                                     nonfilled_markers))))

    result = {}
    do_regression = 'regression' in kwargs
    if do_regression:
        result['regression_model'] = {}
        model = kwargs['regression'].get('model',
                                         linear_model.LinearRegression())
        N = kwargs['regression'].get('N', 10)

    for (row_i, column_i), df_i in groupif(df_data, [keys['row'],
                                                     keys['column']]):
        axis_ij = axes[row_i][column_i]
        for (color_j, shape_j), df_j in groupif(df_i, [keys['color'],
                                                       keys['shape']]):
            if do_regression:
                # Plot regression.
                X = time_safe(df_j[kwargs['x']]).values[:, np.newaxis]
                y = time_safe(df_j[kwargs['y']]).values

                # A fit needs more than one sample.
                if X.shape[0] > 1:
                    model.fit(X, y)

                    X_fit = np.linspace(X.ravel().min(), X.ravel().max(),
                                        N).reshape(-1, 1)
                    y_fit = model.predict(X_fit)

                    axis_ij.plot(X_fit, y_fit, color=colors[color_j],
                                 linestyle='--')
                    regress_row = (result['regression_model']
                                   .setdefault(row_i, {}))
                    regress_column = regress_row.setdefault(column_i, {})
                    regress_color = regress_column.setdefault(color_j, {})
                    # The same model instance is refitted for every group,
                    # so store an independent copy of its fitted state.
                    regress_color[shape_j] = copy.deepcopy(model)

            plot_kwargs = {'color': colors[color_j]}
            if keys['shape'] is not None:
                plot_kwargs['markeredgecolor'] = (colors[color_j]
                                                  if kwargs.get('stroke', True)
                                                  else 'none')
                plot_kwargs['markerfacecolor'] = (colors[color_j]
                                                  if kwargs.get('fill', True)
                                                  else 'none')
                plot_kwargs['marker'] = markers[shape_j]
                plot_kwargs['linestyle'] = kwargs.get('linestyle', 'none')
            else:
                plot_kwargs['linestyle'] = kwargs.get('linestyle', '--')
            # Plot markers.
            axis_ij.plot(df_j[kwargs['x']].values, df_j[kwargs['y']].values,
                         **plot_kwargs)
        axis_ij.set_xlabel(kwargs['x'])
        axis_ij.set_ylabel(kwargs['y'])

    sharexscale = kwargs.get('sharexscale', True)
    shareyscale = kwargs.get('shareyscale', True)

    for row_i, row_axes in axes.items():
        for column_i, axis_i in row_axes.items():
            if kwargs.get('logx'):
                axis_i.set_xscale('log')
            if kwargs.get('logy'):
                axis_i.set_yscale('log')
            for tick in axis_i.get_xticklabels():
                tick.set_rotation(90)

            if sharexscale:
                xmin, xmax = _min_max(descriptions['column']
                                      if sharexscale == 'column' else None,
                                      column_i, descriptions['x'])
                if kwargs.get('logx'):
                    # Expand limits to the enclosing powers of ten.
                    axis_i.set_xlim((10 ** np.floor(np.log10(xmin))),
                                    (10 ** np.ceil(np.log10(xmax))))
                else:
                    # Pad the data range by 5% on either side.
                    xspan = xmax - xmin
                    axis_i.set_xlim(xmin - .05 * xspan, xmax + .05 * xspan)
            if shareyscale:
                ymin, ymax = _min_max(descriptions['row']
                                      if shareyscale == 'row' else None,
                                      row_i, descriptions['y'])
                if kwargs.get('logy'):
                    axis_i.set_ylim((10 ** np.floor(np.log10(ymin))),
                                    (10 ** np.ceil(np.log10(ymax))))
                else:
                    yspan = ymax - ymin
                    axis_i.set_ylim(ymin - .05 * yspan, ymax + .05 * yspan)

    if values['column'] is not None:
        for column_i in values['column']:
            # Title each column on its top-most subplot.
            row_j = 0 if values['row'] is None else values['row'][0]
            axis = axes[row_j][column_i]
            axis.set_title(column_i)

            if values['row'] is not None:
                # With a shared x scale, only the bottom row keeps its x
                # label and tick labels.
                for row_j in values['row'][:-1]:
                    if sharexscale:
                        axis = axes[row_j][column_i]
                        axis.set_xlabel('')
                        axis.set_xticklabels([])

    if values['row'] is not None:
        for row_i in values['row']:
            # Label each row on its left-most subplot.
            column_j = 0 if values['column'] is None else values['column'][0]
            axis = axes[row_i][column_j]
            axis.set_ylabel(row_i)

            if values['column'] is not None:
                # With a shared y scale, only the first column keeps its y
                # label and tick labels.
                for column_j in values['column'][1:]:
                    if shareyscale:
                        axis = axes[row_i][column_j]
                        axis.set_ylabel('')
                        axis.set_yticklabels([])

    # The legend occupies the extra, right-most grid column.
    axis_legend = fig.add_subplot(grid[:, -1])

    legend_symbols = []
    legend_labels = []

    has_shape = keys['shape'] is not None
    has_color = keys['color'] is not None

    if has_shape:
        # Invisible symbol acts as a heading for the shape entries.
        legend_symbols += [plt.Line2D([0], [0], linestyle='')]
        legend_symbols += [plt.Line2D([0], [0], linestyle='',
                                      marker=markers[k],
                                      color=colors[k]
                                      if keys['shape'] == keys['color']
                                      else 'black')
                           for k in markers]
        legend_labels += [keys['shape']]
        legend_labels += [str(k).split('T')[0] for k in markers]

    if has_color and (keys['shape'] != keys['color']):
        if has_shape:
            # Blank entry separates the shape and color sections.
            legend_symbols += [plt.Line2D([0], [0], linestyle='')]
            legend_labels += ['']
        legend_symbols += [plt.Line2D([0], [0], linestyle='')]
        legend_symbols += [plt.Line2D([0], [0], linestyle='',
                                      marker='s', color=colors[k])
                           for k in colors]
        legend_labels += [keys['color']]
        legend_labels += [str(k).split('T')[0] for k in colors]

    axis_legend.legend(legend_symbols, legend_labels, loc='upper left')
    axis_legend.set_axis_off()

    fig.tight_layout()

    result.update({'fig': fig, 'axes': axes, 'keys': keys, 'values': values})
    return result


class Chart(object):
    '''
    Thin wrapper binding a data frame to the module-level ``encode``
    function.

    Parameters
    ----------
    df : pandas.DataFrame
        Data frame to plot.
    '''
    def __init__(self, df):
        self.df = df

    def encode(self, **kwargs):
        '''
        Plot the wrapped data frame.

        All keyword arguments are forwarded unchanged to the module-level
        ``encode`` function, whose result is returned.
        '''
        return encode(self.df, **kwargs)