Source code for seeq.addons.correlation._heatmap

import numpy as np
import pandas as pd
import pickle
import plotly.graph_objs as go
# There is a bug that prevents a pandas.DataFrame argument from being memoized correctly.
# Thus, all functions that use the @cached decorator need to accept serialized dataframes (pickle is a good option)
from memoization import cached
from ._config import _cache_max_items
from . import default_preprocessing_wrapper
from . import lags_coeffs


def heatmap(df, max_time_shift='auto', output_values='coeffs', output_type='plot', time_output_unit='auto',
            bypass_preprocessing=False):
    """
    Creates a heatmap of the cross-correlation coefficients between signals.

    The signals can be allowed to shift in time to find the maximum
    cross-correlation between signals. Alternatively, a heatmap of the time
    shifts that maximize the correlation of the signals can be produced.

    Parameters
    ----------
    df: pandas.DataFrame
        A DataFrame that contains a set of signals as columns and date-time as
        the index.
    max_time_shift: {'auto', str, None}, default 'auto'
        Maximum time (e.g. '15s', or '1min') that the signals are allowed to
        slide in order to maximize cross-correlation. For times specified as a
        str, normal time units are accepted. If 'auto' is selected, a default
        maximum time shift is calculated based on the number of samples. If
        None, the raw signals are used and no time shifts are calculated.
    output_values: {'coeffs', 'time_shifts'}, default 'coeffs'
        Values to show in the heatmap. Either the Pearson's coefficients or the
        time shifts that the signals were shifted to maximize
        cross-correlation.
    output_type: {'plot', 'table'}, default 'plot'
        The heatmap can be outputted either as an (interactive) plot or as a
        DataFrame.
    time_output_unit: {'auto', str}, default 'auto'
        Specifies the time unit used to display the time shifts. Valid units
        are the ones accepted by pd.Timedelta. None is rejected.
    bypass_preprocessing: bool, default False
        Whether the data pre-processing routine is by-passed or not. Setting it
        to True is not recommended unless the data has been pre-processed
        elsewhere.

    Returns
    -------
    None or pandas.DataFrame
        When ``output_type='plot'`` the Plotly figure is displayed and nothing
        is returned. Otherwise the object built by ``_heatmap`` is returned: a
        DataFrame with either the Pearson's coefficients or the time shifts of
        all signal pairs.

    Raises
    ------
    ValueError
        If ``time_output_unit`` is None.

    Examples
    --------
    Create a heatmap plot of the cross-correlation coefficients for the signals
    in a DataFrame allowing for automatic guess of maximum time shifts

    >>> seeq.addons.correlation.heatmap(df,
    ...                                 max_time_shift='auto',
    ...                                 output_values='coeffs',
    ...                                 output_type='plot')

    Create a heatmap plot of the cross-correlation coefficients for the signals
    in a DataFrame specifying a maximum time shift between signals of 1 hour

    >>> seeq.addons.correlation.heatmap(df,
    ...                                 max_time_shift='1h',
    ...                                 output_values='coeffs',
    ...                                 output_type='plot')

    Create a table of the time shifts to maximize cross-correlation of the
    signals in a DataFrame specifying a maximum time shift between signals of
    1 hour

    >>> seeq.addons.correlation.heatmap(df,
    ...                                 max_time_shift='1h',
    ...                                 output_values='time_shifts',
    ...                                 output_type='table')

    Create a table of the cross-correlation coefficients for the signals in a
    DataFrame using the raw data (no time shift allowed)

    >>> seeq.addons.correlation.heatmap(df,
    ...                                 max_time_shift=None,
    ...                                 output_values='coeffs',
    ...                                 output_type='table')
    """
    if time_output_unit is None:
        raise ValueError('time_output_unit cannot be None. Please specify a valid pd.Timedelta unit')

    result = _heatmap(
        df,
        max_time_shift=max_time_shift,
        output_values=output_values,
        output_type=output_type,
        time_output_unit=time_output_unit,
        bypass_preprocessing=bypass_preprocessing,
    )

    # Anything that is not a plot is handed straight back to the caller.
    if output_type != 'plot':
        return result

    # Plot mode renders the figure as a side effect and returns nothing.
    result.show(config={'displaylogo': False, 'displayModeBar': True})
    return None


def _heatmap(df, max_time_shift='auto', output_values='coeffs', output_type='plot', time_output_unit='auto',
             bypass_preprocessing=False):
    """
    Builds the heatmap figure or table without displaying it.

    Parameters
    ----------
    df: pandas.DataFrame
        Signals as columns. Passed through ``default_preprocessing_wrapper``
        before the correlation is computed.
    max_time_shift: {'auto', str, None}, default 'auto'
        Forwarded unchanged to ``lags_coeffs``.
    output_values: {'coeffs', 'time_shifts'}, default 'coeffs'
        Which quantity is the primary output.
    output_type: {'plot', 'table'}, default 'plot'
        Whether a Plotly figure or a DataFrame is returned.
    time_output_unit: {'auto', str}, default 'auto'
        Forwarded unchanged to ``lags_coeffs``.
    bypass_preprocessing: bool, default False
        Forwarded to ``default_preprocessing_wrapper`` as ``bypass_processing``.

    Returns
    -------
    plotly.graph_objs.Figure or pandas.DataFrame
        The figure produced by ``_heatmap_plot`` when ``output_type='plot'``.
        For ``output_type='table'``, the coefficients DataFrame or the time
        shifts DataFrame (whose columns are suffixed with the time unit).

    Raises
    ------
    ValueError
        If ``output_type`` or ``output_values`` is not one of the accepted
        values.
    """
    # Validate before doing any work so that a typo in an argument does not
    # cost a full pre-processing and cross-correlation pass.
    if output_type not in ('plot', 'table'):
        raise ValueError('Invalid output_type: {}'.format(output_type))
    if output_values not in ('coeffs', 'time_shifts'):
        raise ValueError('Invalid output_values: {}'.format(output_values))

    # We don't want to remove outliers here.
    # NOTE(review): the original comment mentions an increased outlier_sensitivity, but no such
    # argument is passed in this call -- confirm against default_preprocessing_wrapper.
    df = default_preprocessing_wrapper(df, consecutivenans=0.04, percent_nan=0.0,
                                       bypass_processing=bypass_preprocessing)
    lags, coeffs, sampling_time, time_unit, _maxlags = lags_coeffs(df, max_time_shift, time_output_unit)

    # Lags are scaled by the sampling time to express them as time shifts.
    lags_to_time = lags * sampling_time
    coeffs_df = pd.DataFrame(data=coeffs, columns=df.columns, index=df.columns)
    time_shifts_df = pd.DataFrame(data=lags_to_time, columns=df.columns, index=df.columns)

    if output_type == 'plot':
        lags_plot = output_values == 'time_shifts'
        if lags_plot:
            primary_df, secondary_df = time_shifts_df, coeffs_df
        else:
            primary_df, secondary_df = coeffs_df, time_shifts_df
        # DataFrames are pickled because the @cached function cannot memoize them directly.
        return _heatmap_plot(pickle.dumps(primary_df), pickle.dumps(secondary_df),
                             time_unit=time_unit, lags_plot=lags_plot)

    # output_type == 'table'
    if output_values == 'coeffs':
        return coeffs_df
    time_shifts_df.columns = [f"{x} ({time_unit})" for x in time_shifts_df.columns]
    return time_shifts_df


def rename_signals(signal_list, max_label_chars):
    """
    Shortens signal names so that they fit as axis labels.

    If no name is longer than ``max_label_chars`` the input is returned
    untouched. Otherwise every name is cut down to its last
    ``max_label_chars`` characters; a truncated name that collides with one
    already produced drops its first two characters and gets ``_<position>``
    appended.

    Parameters
    ----------
    signal_list: list of str
        Signal names, in display order.
    max_label_chars: int
        Maximum number of trailing characters kept from each name.

    Returns
    -------
    list of str
        ``signal_list`` itself (same object) when no truncation is needed,
        including when it is empty; otherwise a new list of the same length.
    """
    # default=0 makes an empty list a no-op instead of an error.
    longest = max((len(name) for name in signal_list), default=0)
    if longest <= max_label_chars:
        return signal_list

    new_names = []
    used = set()  # mirrors new_names for O(1) collision checks
    for i, name in enumerate(signal_list):
        truncated_name = name[-max_label_chars:]
        if truncated_name in used:
            # Drop two leading characters to make room for the positional suffix.
            unique_name = f"{truncated_name[2:]}_{i}"
        else:
            unique_name = truncated_name
        new_names.append(unique_name)
        used.add(unique_name)
    return new_names


@cached(max_size=_cache_max_items)
def _heatmap_plot(primary_df_serialized, secondary_df_serialized, time_unit: str, lags_plot=False, boolean_df=None,
                  max_label_chars=30):
    """
    Builds the Plotly heatmap figure from two pickled square DataFrames.

    Parameters
    ----------
    primary_df_serialized: bytes
        Pickled DataFrame whose values are drawn as the heatmap colours.
    secondary_df_serialized: bytes
        Pickled DataFrame whose values are only shown in the hover text.
    time_unit: str
        Unit label used in the colour bar title and the hover text.
    lags_plot: bool, default False
        True when the primary DataFrame holds time shifts (and the secondary
        one coefficients); False for the opposite arrangement.
    boolean_df: pandas.DataFrame, optional
        When a DataFrame is given, the plotted values are taken from
        ``primary_df[boolean_df]``. Presumably a boolean mask of the same
        shape as the primary DataFrame -- confirm against callers.
    max_label_chars: int, default 30
        Maximum axis label length, forwarded to ``rename_signals``.

    Returns
    -------
    plotly.graph_objs.Figure
        An empty figure if the primary DataFrame is empty, otherwise the
        configured heatmap.
    """
    # NOTE(security): pickle.loads executes arbitrary code for crafted payloads. These arguments
    # must only ever be the output of pickle.dumps performed inside this package.
    primary_df = pickle.loads(primary_df_serialized)
    secondary_df = pickle.loads(secondary_df_serialized)
    if primary_df.empty:
        return go.Figure()

    signal_list = list(primary_df.columns)
    new_names = rename_signals(signal_list, max_label_chars)

    if boolean_df is not None and isinstance(boolean_df, pd.DataFrame):
        primary_array = primary_df[boolean_df].values
    else:
        primary_array = primary_df.values

    # Full names feed the hover text; the shortened names label the axes.
    x = signal_list
    x_names = new_names
    y = signal_list[::-1]
    y_names = new_names[::-1]

    # Rows are flipped to match the reversed y axis so the diagonal has the correct orientation in plotly.
    z = np.flipud(primary_array)
    p_label = np.flipud(primary_df.values)
    s_label = np.flipud(secondary_df.values)

    if lags_plot:
        # Symmetric colour range around zero, wide enough for the largest shift in either direction.
        limit = np.nanmax(np.abs(primary_array))
        title = f"Time ({time_unit})"
    else:
        limit = 1.0
        title = 'Correlation Coefficient'

    hovertext = []
    for yi, yy in enumerate(y):
        row = []
        for xi, xx in enumerate(x):
            if lags_plot:
                row.append(
                    f'Shifted signal: {xx}<br>'
                    f'Signal: {yy} <br>'
                    f'<b>Time shifted ({time_unit}): {p_label[yi][xi]:.1f}</b> '
                    f'<br>Coefficient: {s_label[yi][xi]:.2f}')
            else:
                row.append(
                    f'Shifted signal: {xx}<br>Signal: {yy} <br><b>Coefficient: {p_label[yi][xi]:.2f}</b> '
                    f'<br>Time shifted ({time_unit}): {s_label[yi][xi]:.1f}')
        hovertext.append(row)

    colorscale = [[0.0, '#992542'], [0.111, '#C00000'], [0.222, '#FF0000'], [0.333, '#f77e7e'],
                  [0.444, '#ffd1d1'], [0.5, '#D3D3D3'], [0.666, '#9cbbd1'], [0.777, '#6c9ec1'],
                  [0.888, '#4791c6'], [0.999, '#1f7fc4'], [1.0, '#0070C0']]

    # create the raw corr heatmap
    data = go.Heatmap(z=z,
                      x=x_names,
                      y=y_names,
                      hoverinfo='text',
                      text=hovertext,
                      colorscale=colorscale,
                      colorbar=dict(title=title),
                      zmin=-limit,
                      zmax=limit,
                      name=''
                      )

    fig = go.Figure(data=data)
    fig.layout.paper_bgcolor = 'rgba(0,0,0,0)'
    fig.layout.plot_bgcolor = 'rgba(0,0,0,0)'
    fig.layout.dragmode = "select"
    fig.layout.modebar = {
        'bgcolor': 'rgba(0, 0, 0, 0)',
        'color': 'rgba(221, 221, 221, 1)',
        'activecolor': 'rgba(0, 121, 96, 1)'
    }

    # this ensures a square plot
    fig.layout.xaxis = {'constrain': 'domain', 'scaleanchor': 'y'}
    fig.layout.yaxis = {'constrain': 'domain'}

    return fig