Source code for matplotlib_helpers.chart

from collections import OrderedDict
import copy
import datetime as dt
import itertools

from sklearn import linear_model
import matplotlib as mpl
import numpy as np
import pandas as pd


time_total_seconds = lambda t: (60 * 60 * t.hour + 60 * t.minute + t.second
                                + 1e-6 * t.microsecond)


[docs]def time_safe(series): return (series.map(time_total_seconds) if isinstance(series.iloc[0], dt.time) else series)
[docs]def data_groups(df, group_key, data_key): return (df.set_index(group_key)[data_key].map(time_total_seconds) .groupby(level=0) if isinstance(df[data_key].iloc[0], dt.time) else df.groupby(group_key)[data_key])
[docs]def unique_by_column(df): ''' Parameters ========== df : pandas.DataFrame Data frame. Returns ======= pandas.Series Mapping from each column label to ordered list of unique values in corresponding column in data frame. ''' return pd.Series([sorted(df[column].unique()) for column in df.columns], index=df.columns)
[docs]def groupif(df, key): if not isinstance(key, list): singleton_key = True key = [key] else: singleton_key = False key = pd.Series(key) if all([k is None for k in key]): if singleton_key: yield 0, df else: yield tuple([0] * key.size), df else: for key_i, df_i in df.groupby(key[~key.isnull()].tolist()): if not isinstance(key_i, (list, tuple)): key_i = [key_i] full_key_i = pd.Series(object(), index=key.index) full_key_i[key.isnull()] = 0 full_key_i[~key.isnull()] = key_i if singleton_key: yield full_key_i.values[0], df_i else: yield tuple(full_key_i.values), df_i
[docs]def encode(df_data, **kwargs): ''' Parameters ========== x : str Label of column containing ``x``-dimension. y : str Label of column containing ``y``-dimension. row : str, optional Label of column containing row categories. If ``None``, all data is plotted in a single row of plots. column : str, optional Label of column containing column categories. If ``None``, all data is plotted in a single column of plots. color : str, optional Label of column containing color categories. If ``None``, all data is plotted in the same color. shape : str, optional Label of column containing shape categories. If ``None``, all data is plotted using the same marker shape. style : str, optional Label of column containing style categories. If ``None``, all data is plotted using the same line style. sharexscale : bool or 'column', optional If ``True`` (default) all subplots share the same scale on the ``x`` axis. If ``'column'`` all subplots *in the same column* share the same ``x`` axis. If ``False``, the ``x`` axis of each subplot is scaled independently. shareyscale : bool or 'row', optional If ``True`` (default) all subplots share the same scale on the ``y`` axis. If ``'row'`` all subplots *in the same row* share the same ``y`` axis. If ``False``, the ``y`` axis of each subplot is scaled independently. fill : bool, optional Fill markers stroke : bool, optional Draw marker outlines linestyle : str, optional Line style to use for plot. By default, if :data:`shape` is set, :data:`linestyle` is set to ``"none"``. If :data:`shape` is not set, :data:`linestyle` is set to ``"--"`` by default. Returns ------- (fig, axes, keys, values) The ``matplotlib`` figure (``fig``), a nested dictionary (``axes``) indexed by row key then by column key, a ``pandas.Series`` (``keys``) mapping each categorical argument name to the corresponding column label, a ``pandas.Series`` (``values``) mapping each categorical argument name to a corresponding list of unique category values. ''' categorical = 'row', 'column', 'color', 'shape', 'style' # Get column label/key associated with each category (e.g., row, column). keys = pd.Series([kwargs.get(k) for k in categorical], index=categorical) # For each category (e.g., row, column) get ordered list of unique values. df = df_data[keys[~keys.isnull()].unique().tolist()] unique_by_column_i = unique_by_column(df) values = pd.Series([unique_by_column_i.get(keys.get(category_i)) for category_i in categorical], index=categorical) # Find row, column, x and y range limits. descriptions = pd.Series() if keys.row is None: descriptions['row'] = None else: df_i = df_data if kwargs.get('logy'): df_i = df_data.loc[df_data[kwargs['y']] > 0] groups = data_groups(df_i, keys.row, kwargs['y']) descriptions['row'] = groups.describe() if keys.column is None: descriptions['column'] = None else: df_i = df_data if kwargs.get('logx'): df_i = df_data.loc[df_data[kwargs['x']] > 0] groups = data_groups(df_i, keys.column, kwargs['x']) descriptions['column'] = groups.describe() for axis_type_i in 'xy': series_i = time_safe(df_data[kwargs[axis_type_i]]) if kwargs.get('log' + axis_type_i): series_i = series_i[series_i > 0] descriptions[axis_type_i] = series_i.describe() counts = values.map(lambda v: 1 if v is None else len(v)) # extra column for legend grid = mpl.gridspec.GridSpec(counts.row, counts.column + 1) cell_size = kwargs.get('cell_size', 3) cell_width = kwargs.get('cell_width', cell_size) cell_height = kwargs.get('cell_height', cell_size) fig = mpl.pyplot.figure(figsize=(cell_width * (counts.column + 1), cell_height * counts.row)) axes = OrderedDict([(row_i, OrderedDict([(column_j, fig.add_subplot(grid[i, j])) for j, column_j in enumerate([0] if values.column is None else values.column)])) for i, row_i in enumerate([0] if values.row is None else values.row)]) axis = axes.values()[0].values()[0] colors = OrderedDict(zip(values.color or [0], itertools.imap(lambda v: v['color'], axis._get_lines.prop_cycler))) filled_markers = set(mpl.markers.MarkerStyle.filled_markers) nonfilled_markers = reversed(filter(lambda v: (v is not None) and (v != 'None') and (v not in filled_markers), mpl.markers.MarkerStyle.markers .keys())) markers = OrderedDict(zip(values['shape'] or [0], itertools.cycle(itertools .chain(mpl.markers.MarkerStyle .filled_markers, nonfilled_markers)))) result = {} if 'regression' in kwargs: result['regression_model'] = {} for (row_i, column_i), df_i in groupif(df_data, [keys.row, keys.column]): axis_ij = axes[row_i][column_i] for (color_j, shape_j), df_j in groupif(df_i, [keys.color, keys['shape']]): if 'regression' in kwargs: model = kwargs['regression'].get('model', linear_model .LinearRegression()) N = kwargs['regression'].get('N', 10) # Plot regression. X = time_safe(df_j[kwargs['x']]).values[:, np.newaxis] y = time_safe(df_j[kwargs['y']]).values if X.shape[0] > 1: model.fit(X, y) X_fit = np.linspace(X.ravel().min(), X.ravel().max(), N).reshape(-1, 1) y_fit = model.predict(X_fit) axis_ij.plot(X_fit, y_fit, color=colors[color_j], linestyle='--') regress_row = (result['regression_model'] .setdefault(row_i, {})) regress_column = regress_row.setdefault(column_i, {}) regress_color = regress_column.setdefault(color_j, {}) regress_leaf = regress_color regress_leaf[shape_j] = copy.deepcopy(model) plot_kwargs = {'color': colors[color_j]} if keys['shape'] is not None: plot_kwargs['markeredgecolor'] = (colors[color_j] if kwargs.get('stroke', True) else 'none') plot_kwargs['markerfacecolor'] = (colors[color_j] if kwargs.get('fill', True) else 'none') plot_kwargs['marker'] = markers[shape_j] plot_kwargs['linestyle'] = kwargs.get('linestyle', 'none') else: plot_kwargs['linestyle'] = kwargs.get('linestyle', '--') # Plot markers. axis_ij.plot(df_j[kwargs['x']].values, df_j[kwargs['y']].values, **plot_kwargs) axis_ij.set_xlabel(kwargs['x']) axis_ij.set_ylabel(kwargs['y']) sharexscale = kwargs.get('sharexscale', True) shareyscale = kwargs.get('shareyscale', True) for row_i, column_i, axis_i in [(row, column, v) for row, d in axes.iteritems() for column, v in d.iteritems()]: if kwargs.get('logx'): axis_i.set_xscale('log') if kwargs.get('logy'): axis_i.set_yscale('log') for tick in axis_i.get_xticklabels(): tick.set_rotation(90) if sharexscale: if sharexscale == 'column': xmin, xmax = descriptions.column[column_i][['min', 'max']] else: xmin, xmax = descriptions.x[['min', 'max']] if kwargs.get('logx'): axis_i.set_xlim((10 ** np.floor(np.log10(xmin))), (10 ** np.ceil(np.log10(xmax)))) else: xspan = xmax - xmin axis_i.set_xlim(xmin - .05 * xspan, xmax + .05 * xspan) if shareyscale: if shareyscale == 'row': ymin, ymax = descriptions.row[row_i][['min', 'max']] else: ymin, ymax = descriptions.y[['min', 'max']] if kwargs.get('logy'): axis_i.set_ylim((10 ** np.floor(np.log10(ymin))), (10 ** np.ceil(np.log10(ymax)))) else: yspan = ymax - ymin axis_i.set_ylim(ymin - .05 * yspan, ymax + .05 * yspan) if values.column is not None: for i, column_i in enumerate(values.column): row_j = 0 if values.row is None else values.row[0] axis = axes[row_j][column_i] axis.set_title(column_i) if values.row is not None: for row_j in values.row[:-1]: if sharexscale: axis = axes[row_j][column_i] axis.set_xlabel('') axis.set_xticklabels([]) if values.row is not None: for i, row_i in enumerate(values.row): column_j = 0 if values.column is None else values.column[0] axis = axes[row_i][column_j] axis.set_ylabel(row_i) if values.column is not None: for column_j in values.column[1:]: if shareyscale: axis = axes[row_i][column_j] axis.set_ylabel('') axis.set_yticklabels([]) axis_legend = fig.add_subplot(grid[:, -1]) legend_symbols = [] legend_labels = [] if keys['shape']: legend_symbols += [mpl.pyplot.Line2D([0], [0], linestyle='')] legend_symbols += [mpl.pyplot.Line2D([0], [0], linestyle='', marker=markers[k], color=colors[k] if keys['shape'] == keys.color else 'black') for k in markers.keys()] legend_labels += [keys['shape']] legend_labels += map(lambda k: str(k).split('T')[0], markers.keys()) if keys['color'] and (keys['shape'] != keys.color): if keys['shape']: legend_symbols += [mpl.pyplot.Line2D([0], [0], linestyle='')] legend_labels += [''] legend_symbols += [mpl.pyplot.Line2D([0], [0], linestyle='')] legend_symbols += [mpl.pyplot.Line2D([0], [0], linestyle='', marker='s', color=colors[k]) for k in colors.keys()] legend_labels += [keys['color']] legend_labels += map(lambda k: str(k).split('T')[0], colors.keys()) axis_legend.legend(legend_symbols, legend_labels, loc='upper left') axis_legend.set_axis_off() fig.tight_layout() result.update({'fig': fig, 'axes': axes, 'keys': keys, 'values': values}) return result
[docs]class Chart(object): def __init__(self, df): self.df = df
[docs] def encode(self, **kwargs): return encode(self.df, **kwargs)