import pandas as pd
import numpy as np
from scipy import stats
from datetime import datetime, timedelta
import pickle
import os
from seeq import spy
import mass_ts as mts
from dtaidistance import dtw
def save_ref(workbook_id, worksheet_id, signal_pull_list, known_cap, time_frame, grid, save_name, mypath):
"""
This function saves the reference profile time series data and metadata as a pickle file
Parameters
----------
workbook_id : str
The Seeq ID of the source workbook
worksheet_id: str
The Seeq ID of the source worksheet
signal_pull_list: list of str
List of strings that detail the signal names to describe the reference profile to be saved
known_cap: str
Name of the capsule that defines the reference(s) to be saved.
time_frame: list of datetime
Start and end datetimes of the analysis range to be searched for the known capsule in the seeq.spy.pull.
grid: str
Resolution/gridding of the seeq.spy.pull.
save_name: str
Name of the pickle file to save as.
mypath: str
Path to the folder in which to save the pickle file.
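Examples
--------
Illustrative sketch only; the IDs, signal names, capsule name, file name and
path below are placeholders, not values from a real Seeq system.

>>> save_ref(workbook_id='<workbook-id>', worksheet_id='<worksheet-id>',
...          signal_pull_list=['Temperature', 'Pressure'],
...          known_cap='Golden Batch',
...          time_frame=[datetime(2023, 1, 1), datetime(2023, 2, 1)],
...          grid='1min', save_name='my_reference', mypath='/tmp/refs')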
"""
desired_workbook, sheet_index = gather_workbook_worksheet_meta_data(workbook_id, worksheet_id)
# define signals to be searched and pulled
items = desired_workbook[0].worksheets[sheet_index].display_items
items_s = items[items.Type == 'Signal']
items_s = items_s[items_s['Name'].isin(signal_pull_list)]
# define 'known' condition to be pulled for all signals
items_c = items[items.Type == 'Condition']
items_c = items_c[items_c.Name == known_cap]
data_pull_known = spy.pull(items_c, start=time_frame[0], end=time_frame[1], quiet=True, grid=grid)
file_name = os.path.join(str(mypath), str(save_name) + '.pkl')
with open(file_name, 'wb') as file_:
pickle.dump(items_s, file_)
pickle.dump(data_pull_known, file_)
def load_ref(load_name, mypath):
"""
This function loads the reference profile time series data and metadata from a previously saved pickle file.
Parameters
----------
load_name: str
Name of the pickle file to load.
mypath: str
Path to folder containing the pickle file to load.
Returns
-------
items_s_ref: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of ['Name', 'ID', 'Type', 'Color', 'Line Style', 'Line Width',
'Lane', 'Samples Display', 'Axis Auto Scale', 'Axis Align', 'Axis Group', 'Axis Show'] to detail the signals
that describe the known reference profile.
data_pull_known: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of
['Condition', 'Capsule Start','Capsule End','Capsule Is Uncertain']
to detail the capsule of the known reference profile.
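Examples
--------
Illustrative sketch only; the file name and path are placeholders and assume
a file previously created by save_ref.

>>> items_s_ref, data_pull_known = load_ref('my_reference', '/tmp/refs')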
"""
file_name = os.path.join(str(mypath), str(load_name) + '.pkl')
with open(file_name, 'rb') as file_:
items_s_ref = pickle.load(file_)
data_pull_known = pickle.load(file_)
return items_s_ref, data_pull_known
def pull_ref_data(items_s_ref, data_pull_known, grid):
"""
This function gathers the time series data of the reference. The reference condition limits the timeframe of the
time series data to be gathered, a signal list specifies the variables to be gathered, and the sampling rate or
gridding is set.
Parameters
----------
items_s_ref: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of
['Name', 'ID', 'Type', 'Color', 'Line Style', 'Line Width', 'Lane',
'Samples Display', 'Axis Auto Scale', 'Axis Align', 'Axis Group',
'Axis Show']. To detail the signals that describe the known reference profile.
data_pull_known: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of
['Condition', 'Capsule Start','Capsule End','Capsule Is Uncertain'].
To detail the capsule of the known reference profile.
grid: str
Resolution/gridding of the spy pull.
Returns
-------
data_pull_c: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of "X, ..., Y" signals
requested to be pulled (detailed in items_s_ref input variable) and 'Date-Time'
with an index of timestamps.
[X, ..., Y,'Date-Time'].
This dataframe has all the time series data for the reference profile.
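Examples
--------
Illustrative sketch only, assuming items_s_ref and data_pull_known were
previously loaded with load_ref.

>>> items_s_ref, data_pull_known = load_ref('my_reference', '/tmp/refs')
>>> data_pull_c = pull_ref_data(items_s_ref, data_pull_known, grid='1min')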
"""
start_all = data_pull_known['Capsule Start'].iloc[0]
end_all = data_pull_known['Capsule End'].iloc[-1]
data_pull_c = spy.pull(items_s_ref, start=start_all, end=end_all, quiet=True, grid=grid)
data_pull_c['Date-Time'] = pd.to_datetime(data_pull_c.index)
data_pull_c = data_pull_c.dropna()
return data_pull_c
def pull_mps_data(workbook_id, worksheet_id, signal_pull_list, items_s_ref, data_pull_known, time_frame, grid):
"""
This function gathers all the time series data required for the analysis, for the reference and the search area.
Parameters
----------
workbook_id : str
The Seeq ID of the source workbook.
worksheet_id: str
The Seeq ID of the source worksheet.
signal_pull_list: list of str
List of strings that detail the signal names to describe the reference profile.
items_s_ref: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of ['Name', 'ID', 'Type', 'Color', 'Line Style', 'Line Width',
'Lane', 'Samples Display', 'Axis Auto Scale', 'Axis Align', 'Axis Group', 'Axis Show']. To detail the signals
that describe the reference profile.
data_pull_known: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of
['Condition', 'Capsule Start','Capsule End','Capsule Is Uncertain'].
to detail the capsule of the known reference profile.
time_frame: list of datetime
Start and end of the analysis range to search for the known capsule in the spy pull.
grid: str
Resolution/gridding of the spy pull.
Returns
-------
data_pull: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of "X, ..., Y" signals
requested to be pulled (detailed in items_s_ref input variable) and 'Date-Time'
with an index of timestamps.
[X, ..., Y,'Date-Time'].
This dataframe has all the time series data for the analysis/search area.
data_pull_c: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of "X, ..., Y" signals
requested to be pulled (detailed in items_s_ref input variable) and 'Date-Time'
with an index of timestamps.
[X, ..., Y,'Date-Time'].
This dataframe has all the time series data for the reference profile.
sheet_index: int
Integer detailing the index of the source worksheet.
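Examples
--------
Illustrative sketch only; the IDs, signal names and dates are placeholders.

>>> data_pull, data_pull_c, sheet_index = pull_mps_data(
...     workbook_id='<workbook-id>', worksheet_id='<worksheet-id>',
...     signal_pull_list=['Temperature', 'Pressure'],
...     items_s_ref=items_s_ref, data_pull_known=data_pull_known,
...     time_frame=[datetime(2023, 1, 1), datetime(2023, 2, 1)], grid='1min')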
"""
desired_workbook, sheet_index = gather_workbook_worksheet_meta_data(workbook_id, worksheet_id)
# define signals to be searched and pull at the requested grid
items = desired_workbook[0].worksheets[sheet_index].display_items
items_s = items[items.Type == 'Signal']
items_s = items_s[items_s['Name'].isin(signal_pull_list)]
data_pull = spy.pull(items_s, start=time_frame[0], end=time_frame[1], quiet=True, grid=grid)
data_pull['Date-Time'] = pd.to_datetime(data_pull.index)
data_pull = data_pull.dropna()
data_pull_c = pull_ref_data(items_s_ref, data_pull_known, grid)
# return search-area signal data, reference signal data and the worksheet index
return data_pull, data_pull_c, sheet_index
def sort_and_prepare_results(data_pull, total_dist, window_step, threshold, known, sim_, max_):
"""
This function takes the distance measurements from all variables at each window step, orders them numerically,
computes a % similarity score by comparing against the minimum possible distance (zero) and the maximum distance
(the larger of the distance to the inverse of itself or the largest found distance), and removes overlapping
'found' capsules/events from most similar to least similar.
Parameters
----------
data_pull: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of "X, ..., Y" signals
requested to be pulled (detailed in items_s_ref input variable) and 'Date-Time'
with an index of timestamps.
[X, ..., Y,'Date-Time']. This dataframe has all the time series data for the analysis/search area.
total_dist: list of float
List of distance measurements between the two curves.
window_step: int
Size of the window stepping used by the algo.
threshold: float
0 to 1 float to set similarity cutoff for found capsules.
known: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of "X, ..., Y" signals
requested to be pulled (detailed in items_s_ref input variable) and 'Date-Time'
with an index of timestamps.
[X, ..., Y,'Date-Time'].
This dataframe has strictly only the time series data for the reference profile start and end.
sim_: bool
Set to return similar or dis-similar results.
max_: float
Maximum accumulative distance used to scale all other measured distances.
Returns
-------
found_list: list of int
Each int in the list is the sorted (highest similarity match 1st) index of each found capsule
as an integer relative to data_pull.
found_sim: list of float
Corresponding similarity measurement (0 to 100) for each found capsule, with 100 being a perfect match.
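Examples
--------
Illustrative sketch only; total_dist, known and max_ would normally be
produced inside seeq_mps_mass or seeq_mps_dtw.

>>> found_list, found_sim = sort_and_prepare_results(
...     data_pull, total_dist, window_step=1, threshold=0.7,
...     known=known, sim_=True, max_=max_)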
"""
# suppress warnings of "A value is trying to be set on a copy of a slice from a DataFrame."
pd.options.mode.chained_assignment = None # default='warn'
found_list = []
found_sim = []
# create dataframe of results to sort, remove out-of-threshold entries and remove duplicates
total_dist_df = pd.DataFrame(total_dist, columns=['distance'])
# adjust if window steps are used
total_dist_df.index = total_dist_df.index * window_step
total_dist_df['index_'] = total_dist_df.index
# sort by lowest distance (best match) and get index of lowest to then be used to find datetime
if known.index[0] >= data_pull.index[0] and known.index[-1] <= data_pull.index[-1]:
known_index = data_pull.index.get_loc(known.index[0])
# remove known capsule
total_dist_df = total_dist_df[(total_dist_df['index_'] > (known_index + known.shape[0])) |
(total_dist_df['index_'] < (known_index - known.shape[0] * 0.5))
]
# sort
total_dist_df_sorted = total_dist_df.sort_values('distance', ascending=sim_)
# min/max normalise
min_ = 0
if isinstance(max_, float):
total_dist_df_sorted.distance.replace(np.inf, max_, inplace=True)
else:
total_dist_df_sorted.distance.replace(np.inf, 0, inplace=True)
max_ = total_dist_df_sorted.distance.max()
total_dist_df_sorted.distance = (total_dist_df_sorted.distance - min_) / (max_ - min_)
total_dist_df_sorted.distance.where(total_dist_df_sorted.distance <= 1, 1, inplace=True)
# remove out of threshold set
if threshold > 0:
total_dist_df_sorted_threshold_filtered = total_dist_df_sorted[total_dist_df_sorted.distance < (1 - threshold)]
else:
total_dist_df_sorted_threshold_filtered = total_dist_df_sorted
for e in range(0, min(1000, total_dist_df_sorted_threshold_filtered.shape[0])):
if e < total_dist_df_sorted_threshold_filtered.shape[0]:
item = total_dist_df_sorted_threshold_filtered.index_.iloc[e]
sim_save = total_dist_df_sorted_threshold_filtered.distance.iloc[e]
min_window = item - known.shape[0]
max_window = item + known.shape[0]
total_dist_df_sorted_threshold_filtered = total_dist_df_sorted_threshold_filtered[
(total_dist_df_sorted_threshold_filtered.index_ == item) |
(total_dist_df_sorted_threshold_filtered.index_ > max_window) |
(total_dist_df_sorted_threshold_filtered.index_ < min_window)
]
else:
break
if known.index[0] >= data_pull.index[0] and known.index[-1] <= data_pull.index[-1]:
known_index = data_pull.index.get_loc(known.index[0])
# remove known capsule from found profiles and set as list to use for push capsules
if (item < (known_index - known.shape[0] * 0.8)) or (item > (known_index + known.shape[0])):
found_list.append(item)
found_sim.append((1 - sim_save) * 100)
else:
found_list.append(item)
found_sim.append((1 - sim_save) * 100)
return found_list, found_sim
def known_select(data_pull_c, data_pull_known, select_):
"""
This function uses the known start and end time of the reference capsule/s and extracts the time series data from
the entire time series data set within the investigation range of the worksheet.
Parameters
----------
data_pull_c: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of "X, ..., Y" signals
requested to be pulled (detailed in items_s_ref input variable) and 'Date-Time'
with an index of timestamps.
[X, ..., Y,'Date-Time'].
This dataframe has all the time series data for the reference profile.
data_pull_known: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of
['Condition', 'Capsule Start','Capsule End','Capsule Is Uncertain']
to detail the capsule(s) of the known reference profile.
select_: int
Index of the selected reference capsule within data_pull_known.
Returns
-------
known: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of "X, ..., Y" signals
requested to be pulled (detailed in items_s_ref input variable) and 'Date-Time'
with an index of timestamps.
[X, ..., Y,'Date-Time'].
This dataframe has strictly only the time series data for the reference profile start and end.
knownlength: int
Length of the known dataframe.
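Examples
--------
Illustrative sketch only; selects the first reference capsule (index 0).

>>> known, knownlength = known_select(data_pull_c, data_pull_known, 0)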
"""
try:
known = data_pull_c[data_pull_c['Date-Time'] > data_pull_known['Capsule Start'][select_]]
known = known[known['Date-Time'] < data_pull_known['Capsule End'][select_]]
knownlength = known.shape[0]
except:
temp_df = data_pull_known
temp_df['Capsule Start'] = pd.to_datetime(temp_df['Capsule Start'])
temp_df['Capsule End'] = pd.to_datetime(temp_df['Capsule End'])
known = data_pull_c[data_pull_c['Date-Time'] > temp_df['Capsule Start'][select_]]
known = known[known['Date-Time'] < temp_df['Capsule End'][select_]]
knownlength = known.shape[0]
return known, knownlength
def seeq_mps_mass(data_pull, data_pull_c, data_pull_known, threshold, normalise, sim_):
"""
This function measures the Euclidean distance between the reference and search space time series data over the same
duration (limited by the reference) using Mueen's Algorithm for Similarity Search [MASS]. The algorithm window steps
through the time series data, calculating a distance for each time period. It loops through all the variables
in the dataset and sums the distances into an accumulative distance measurement for each time step. The function can
be instructed to normalise the data before measurement. The distance scores are then converted to % similarity
compared to the minimum possible distance (zero) and the maximum distance (the larger of the distance to the inverse
of itself or the largest found distance). Finally, the % contribution of each variable/signal to the similarity
measurement is calculated. This function is intended to be applied to continuous process data.
Technical reference: https://www.cs.unm.edu/~mueen/FastestSimilaritySearch.html
Parameters
----------
data_pull: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of "X, ..., Y" signals
requested to be pulled (detailed in items_s_ref input variable) and 'Date-Time'
with an index of timestamps.
[X, ..., Y,'Date-Time'].
This dataframe has all the time series data for the analysis/search area.
data_pull_c: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of "X, ..., Y" signals
requested to be pulled (detailed in items_s_ref input variable) and 'Date-Time'
with an index of timestamps.
[X, ..., Y,'Date-Time'].
This dataframe has all the time series data for the reference profile.
data_pull_known: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of
['Condition', 'Capsule Start','Capsule End','Capsule Is Uncertain'].
To detail the capsule of the known reference profile.
threshold: float
0 to 1 float to set similarity cutoff for found capsules.
normalise: bool
Set normalisation of the input data to the algo.
sim_: bool
Set to return similar or dis-similar results.
Returns
-------
found_all_sorted: numpy.ndarray
numpy array whose first three columns are:
1st = found capsule's similarity measurement (float, 0 to 100)
2nd = integer index of each found capsule relative to data_pull
3rd = integer describing the duration/length of the found capsule (each integer is defined by the gridding in
'pull_mps_data')
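Examples
--------
Illustrative sketch only, assuming the inputs were gathered with
pull_mps_data.

>>> results = seeq_mps_mass(data_pull, data_pull_c, data_pull_known,
...                         threshold=0.7, normalise=True, sim_=True)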
"""
found_list_overall = []
found_sim_overall = []
found_length_overall = []
found_cap_known = []
var_columns = []
for var_1 in data_pull.columns[:-1]:
for cap_ in range(data_pull_known.shape[0]):
var_columns.append(str(var_1) + '_' + str(cap_))
# loop on the known capsules
for capsule_ in range(data_pull_known.shape[0]):
max_ = 0
known, knownlength = known_select(data_pull_c, data_pull_known, capsule_)
# no window step therefore = 1
window_step = 1
for var in data_pull.columns[:-1]:
chosen_var_list = data_pull.columns[:-1]
qry = known[var].to_numpy()
# capture max distance
max_v = mts.mass(qry, qry * -1, normalize_query=normalise)
if not isinstance(max_v, float):
max_v = mts.mass(qry, np.zeros(len(qry)), normalize_query=normalise)
if not isinstance(max_v, float):
if normalise:
max_v = float(len(qry))
else:
max_v = float(np.max(qry) * len(qry))
max_ += max_v
ts = data_pull[var].to_numpy()
new_dist = (mts.mass(ts, qry, normalize_query=normalise))
# size for addition
if var == chosen_var_list[0]:
total_dist = np.zeros(new_dist.shape[0])
if capsule_ == 0:
meta_data = pd.DataFrame(index=[*range(len(new_dist))], columns=var_columns)
# totalise distances to find the best multivariate match
if np.inf not in new_dist and True not in np.isnan(new_dist):
total_dist += new_dist
meta_data[str(var) + '_' + str(capsule_)] = pd.Series(new_dist)
found_cap_known.append(capsule_)
# sort distances and prepare results to index into the original data set
found_list, found_sim = sort_and_prepare_results(data_pull, total_dist, window_step,
threshold, known, sim_, max_)
found_length = [knownlength for x in range(len(found_list))]
capsule_listgen = [capsule_ for x in range(len(found_list))]
found_list_overall.extend(found_list)
found_sim_overall.extend(found_sim)
found_length_overall.extend(found_length)
found_cap_known.extend(capsule_listgen)
if known.index[0] >= data_pull.index[0] and known.index[-1] <= data_pull.index[-1]:
# add the known capsule so it can be filtered out later
found_list_overall.extend([data_pull.index.get_loc(known.index[0])])
found_sim_overall.extend([1000])
found_length_overall.extend([knownlength])
found_all_sorted = sorted(zip(found_sim_overall, found_list_overall, found_length_overall,
found_cap_known, found_cap_known), reverse=True)
found_all_sorted = np.asarray(found_all_sorted)
found_all_sorted = pd.DataFrame(found_all_sorted)
for e in range(0, min(1000, found_all_sorted.shape[0])):
if e < found_all_sorted.shape[0]:
item = found_all_sorted.iloc[e, 1]
shape = found_all_sorted.iloc[e, 2]
min_window = item - shape
max_window = item + shape
found_all_sorted = found_all_sorted[
(found_all_sorted[1] == item) |
(found_all_sorted[1] > max_window) |
(found_all_sorted[1] < min_window)
]
else:
break
found_all_sorted = found_all_sorted[found_all_sorted.iloc[:, 0] != 1000]
for var in data_pull.columns[:-1]:
found_all_sorted[len(found_all_sorted.columns)] = 0
for i in found_all_sorted.index:
loc_c = str(var) + '_' + str(found_all_sorted[3][i])[0]
loc_i = found_all_sorted[1][i]
if meta_data[loc_c][loc_i] < 99999999999999999:
found_all_sorted[len(found_all_sorted.columns) - 1][i] = meta_data[loc_c][loc_i]
found_all_sorted['sum'] = found_all_sorted.loc[:, 5:].sum(axis=1)
if found_all_sorted.shape[0] == 0:
return found_all_sorted
else:
found_all_sorted.loc[:, 5:] = found_all_sorted.loc[:, 5:].div(found_all_sorted["sum"], axis=0) * 100
found_all_sorted = found_all_sorted.drop("sum", 1)
found_all_sorted = found_all_sorted.to_numpy()
return found_all_sorted
def seeq_mps_dtw(data_pull, data_pull_c, data_pull_known, threshold, normalise, sim_, time_distort):
"""
This function measures the distance between the reference and search space time series data over one or many window
size/durations. This is achieved by utilising the dynamic time warping algorithm, which searches for the smallest
distance from each data point to a corresponding reference data point within an assigned search window. This
function window steps through the search area calculating a distance for each time period. It loops through all the
variables in the dataset and sums the distances into an accumulative distance measurement for each step. The
function can be instructed to normalise the data before measurement. Then the distance scores are converted to %
similarity compared to the minimum possible distance (zero) and the maximum distance (the larger of the distance to
the inverse of itself or the largest found distance). Finally, the % contribution of each variable/signal to the
similarity measurement is calculated. This function is intended to be applied to continuous process data.
Technical reference: https://www.cs.unm.edu/~mueen/DTW.pdf
Parameters
----------
data_pull: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of "X, ..., Y" signals
requested to be pulled (detailed in items_s_ref input variable) and 'Date-Time'
with an index of timestamps.
[X, ..., Y,'Date-Time'].
This dataframe has all the time series data for the analysis/search area.
data_pull_c: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of "X, ..., Y" signals
requested to be pulled (detailed in items_s_ref input variable) and 'Date-Time'
with an index of timestamps.
[X, ..., Y,'Date-Time'].
This dataframe has all the time series data for the reference profile.
data_pull_known: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of
['Condition', 'Capsule Start','Capsule End','Capsule Is Uncertain'].
To detail the capsule of the known reference profile.
threshold: float
0 to 1 float to set similarity cutoff for found capsules.
normalise: bool
Set normalisation of the input data to the algo.
sim_: bool
Set to return similar or dis-similar results.
time_distort: float
0 to 1 float to set % of time distortion of the searching window length in the window stepping of the algo.
Returns
-------
found_all_sorted: numpy.ndarray
numpy array whose first three columns are:
1st = found capsule's similarity measurement (float, 0 to 100)
2nd = integer index of each found capsule relative to data_pull
3rd = integer describing the duration/length of the found capsule (each integer is defined by the gridding in
'pull_mps_data')
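Examples
--------
Illustrative sketch only, assuming the inputs were gathered with
pull_mps_data; time_distort=0.1 allows up to 10% time distortion.

>>> results = seeq_mps_dtw(data_pull, data_pull_c, data_pull_known,
...                        threshold=0.7, normalise=True, sim_=True,
...                        time_distort=0.1)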
"""
time_distort = int(time_distort * 100)
found_list_overall = []
found_sim_overall = []
found_length_overall = []
found_cap_known = []
var_columns = []
found_time_dist_overall = []
# loop on the known capsules
for capsule_ in range(data_pull_known.shape[0]):
max_ = 0
known, knownlength = known_select(data_pull_c, data_pull_known, capsule_)
for time_distort_ in range(0, 1 + time_distort, 1):
search_size_stretch = int(known.shape[0] * (time_distort_ / 100))
# size of steps taken during dtw distance measurement across the dataset
window_step = int(round(len(known) * 0.05))
if window_step <= 0:
window_step = 1
for var in data_pull.columns[:-1]:
chosen_var_list = data_pull.columns[:-1]
qry = known[var].to_numpy()
# capture max distance
# normalise data
if normalise:
qry = stats.zscore(qry)
if True in np.isnan(qry):
qry = np.zeros(len(qry))
max_ += dtw.distance_fast(qry, (-1 * qry), window=round(known.shape[0] * (10 / 100)))
ts = data_pull[var].to_numpy()
dist = []
# normalise data
if normalise:
qry = stats.zscore(qry)
ts = stats.zscore(ts)
if True in np.isnan(qry):
qry = np.zeros(len(qry))
if True in np.isnan(ts):
ts = np.zeros(len(ts))
# loop over window size of known length
for i in range(0, (ts.shape[0] - known.shape[0]), window_step):
if i - search_size_stretch < 0:
temp_start = i
else:
temp_start = i - search_size_stretch
target = ts[temp_start:i + known.shape[0] + search_size_stretch]
distance = dtw.distance_fast(qry, target, window=round(known.shape[0] * (10 / 100)))
dist.append(distance)
# size for cost function addition from zero
if var == chosen_var_list[0]:
total_dist = np.zeros(len(dist))
if capsule_ == 0 and time_distort_ == 0:
meta_data = pd.DataFrame(index=[*range(len(dist))], columns=var_columns)
# totalise distances to find the best multivariate match
if np.inf not in dist and True not in np.isnan(dist):
total_dist += dist
meta_data[str(var) + '_' + str(capsule_) + str(time_distort_)] = pd.Series(dist)
# sort distances and prepare results to index into the original data set
found_list, found_sim = sort_and_prepare_results(data_pull, total_dist, window_step, threshold,
known, sim_, max_)
found_length = [knownlength + (2 * search_size_stretch) for x in range(len(found_list))]
found_list_overall.extend(found_list)
found_sim_overall.extend(found_sim)
found_length_overall.extend(found_length)
time_distort_listgen = [time_distort_ for x in range(len(found_list))]
found_time_dist_overall.extend(time_distort_listgen)
capsule_listgen = [capsule_ for x in range(len(found_list))]
found_cap_known.extend(capsule_listgen)
if known.index[0] >= data_pull.index[0] and known.index[-1] <= data_pull.index[-1]:
# add the known capsule so it can be filtered out later
found_list_overall.extend([data_pull.index.get_loc(known.index[0])])
found_sim_overall.extend([1000])
found_length_overall.extend([knownlength])
found_all_sorted = sorted(zip(found_sim_overall, found_list_overall, found_length_overall,
found_cap_known, found_time_dist_overall), reverse=True)
found_all_sorted = np.asarray(found_all_sorted)
found_all_sorted = pd.DataFrame(found_all_sorted)
for e in range(0, min(1000, found_all_sorted.shape[0])):
if e < found_all_sorted.shape[0]:
item = found_all_sorted.iloc[e, 1]
# shape = found_all_sorted.iloc[e,2]
min_window = item - knownlength
max_window = item + knownlength
found_all_sorted = found_all_sorted[
(found_all_sorted.iloc[:, 1] == item) |
(found_all_sorted.iloc[:, 1] > max_window) |
(found_all_sorted.iloc[:, 1] < min_window)
]
else:
break
found_all_sorted = found_all_sorted[found_all_sorted.iloc[:, 0] != 1000]
for var in data_pull.columns[:-1]:
found_all_sorted[len(found_all_sorted.columns)] = 0
for i in found_all_sorted.index:
loc_c = str(var) + '_' + str(found_all_sorted[3][i])[0] + str(found_all_sorted[4][i])[0]
loc_i = int(found_all_sorted[1][i])
found_all_sorted[len(found_all_sorted.columns) - 1][i] = meta_data[loc_c][int(loc_i / window_step)]
found_all_sorted['sum'] = found_all_sorted.loc[:, 5:].sum(axis=1)
if found_all_sorted.shape[0] == 0:
return found_all_sorted
else:
found_all_sorted.loc[:, 5:] = found_all_sorted.loc[:, 5:].div(found_all_sorted["sum"], axis=0) * 100
found_all_sorted = found_all_sorted.drop("sum", 1)
found_all_sorted = found_all_sorted.drop_duplicates(subset=[1])
found_all_sorted = found_all_sorted.to_numpy()
return found_all_sorted
def seeq_mps_dtw_batch(batch_cond, data_pull, data_pull_c, data_pull_known, normalise, time_distort):
"""
This function is similar to the "seeq_mps_dtw" function but instead of window stepping through the entire search
area it only measures the distance between two time series datasets for each capsule in the batch condition. This
function is intended to be applied to batch process data.
Parameters
----------
batch_cond: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of "X, ..., Y" conditions
requested to be analysed, with an index of (X, ..., Y) and columns of at least 'Capsule Start' and 'Capsule End'.
This dataframe has all the data for the batch capsules required.
data_pull: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of "X, ..., Y" signals
requested to be pulled (detailed in items_s_ref input variable) and 'Date-Time'
with an index of timestamps.
[X, ..., Y,'Date-Time'].
This dataframe has all the time series data for the analysis/search area
data_pull_c: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of "X, ..., Y" signals
requested to be pulled (detailed in items_s_ref input variable) and 'Date-Time'
with an index of timestamps.
[X, ..., Y,'Date-Time'].
This dataframe has all the time series data for the reference profile.
data_pull_known: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of
['Condition', 'Capsule Start','Capsule End','Capsule Is Uncertain'].
To detail the capsule of the known reference profile.
normalise: bool
Set normalisation of the input data to the algo.
time_distort: float
0 to 1 float to set % of time distortion of the searching window length in the window stepping of the algo.
Returns
-------
batch_sim_df: pd.DataFrame
A dataframe or series that minimally has columns of
['Similarity', 'Date-Time'].
This holds the resulting similarity measure of each defined capsule, together with the centered datetime of each capsule.
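Examples
--------
Illustrative sketch only, assuming batch_cond is a condition pull of the
batch capsules and the other inputs were gathered with pull_mps_data.

>>> batch_sim_df = seeq_mps_dtw_batch(batch_cond, data_pull, data_pull_c,
...                                   data_pull_known, normalise=True,
...                                   time_distort=0.1)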
"""
time_distort = int(time_distort * 100)
meta_data = pd.DataFrame(index=batch_cond.index, columns=data_pull.columns[:-1])
# loop on the known capsules
for capsule_ in range(data_pull_known.shape[0]):
max_ = 0
known, knownlength = known_select(data_pull_c, data_pull_known, capsule_)
for var in data_pull.columns[:-1]:
chosen_var_list = data_pull.columns[:-1]
qry = known[var].to_numpy()
# capture max distance
if normalise:
try:
if (qry.max() - qry.min()) == 0:
pass
else:
qry = (qry - qry.min()) / (qry.max() - qry.min())
except:
pass
max_ += dtw.distance_fast(qry, (-1 * qry))
dist = []
for b in batch_cond.index:
ts = data_pull[data_pull['Date-Time'] > batch_cond['Capsule Start'][b]]
ts = ts[ts['Date-Time'] < batch_cond['Capsule End'][b]]
# normalise data if the level doesn't matter, only the shape
ts = ts[var].to_numpy()
if ts.shape[0] < int(qry.shape[0] * 0.25):
print("Input capsule as too few data points, please remove capsule starting at " +
str(batch_cond['Capsule Start'][b]))
pass
else:
if normalise:
if (qry.max() - qry.min()) == 0 or (ts.max() - ts.min()) == 0:
pass
else:
qry = (qry - qry.min()) / (qry.max() - qry.min())
ts = (ts - ts.min()) / (ts.max() - ts.min())
distance = dtw.distance_fast(qry, ts, window=int(len(qry) * (2 * time_distort / 100)))
meta_data[var][b] = distance
dist.append(distance)
# size for cost function addition from zero
if var == chosen_var_list[0]:
total_dist = np.zeros(len(dist))
# totalise distances to find the best multivariate match
if np.inf not in dist and True not in np.isnan(dist):
total_dist += dist
total_dist_df = pd.DataFrame(total_dist, columns=['distance'])
min_ = 0
total_dist_df.distance.replace(np.inf, max_, inplace=True)
total_dist_df.distance = (total_dist_df.distance - min_) / (max_ - min_)
total_dist_df.distance.where(total_dist_df.distance <= 1, 1, inplace=True)
total_dist = np.asarray(total_dist_df)
if capsule_ == 0:
found_sim_overall = total_dist
else:
for i in range(len(found_sim_overall)):
if total_dist[i] < found_sim_overall[i]:
found_sim_overall[i] = total_dist[i]
batch_sim_df = pd.DataFrame(found_sim_overall, columns=['Similarity'])
batch_sim_df['Similarity'] = (batch_sim_df['Similarity']) * 100
batch_sim_df['Date-Time'] = batch_cond['Capsule Start'] + (
batch_cond['Capsule End'] - batch_cond['Capsule Start']) / 2
batch_sim_df.index = pd.to_datetime(batch_sim_df['Date-Time'])
batch_sim_df = batch_sim_df.drop('Date-Time', axis=1)
meta_data.columns = "% Contribution to Dissimilarity from " + meta_data.columns
meta_data['sum'] = meta_data.sum(axis=1)
for c in meta_data.columns[:-1]:
for i in meta_data.index:
if meta_data['sum'][i] == 0:
pass
else:
meta_data[c][i] = meta_data[c][i] / meta_data['sum'][i] * 100
# meta_data= meta_data.div(meta_data["sum"], axis=0)*100
meta_data = meta_data.drop("sum", 1)
meta_data.index = batch_sim_df.index
batch_sim_df = pd.concat([batch_sim_df, meta_data], axis=1)
return batch_sim_df
def push_mps_results_batch(batch_sim_df, workbook_id, condition_name, Sheet_index):
"""
This function pushes the % similarity score as a % dis-similarity time series signal to a new worksheet within the
desired workbook when mps is in batch mode. In addition, each variable's % contribution to the dis-similarity
is also pushed as a signal per variable.
Parameters
----------
batch_sim_df: pd.DataFrame
A dataframe or series that minimally has columns of
['Similarity', 'Date-Time'].
This has the resulting similarity measure of each defined capsule with the centered datetime.
workbook_id : str
The Seeq ID of the source workbook
condition_name: str
Name of condition to leverage in the pushed item names.
Sheet_index: int
Integer detailing the index of the source worksheet.
Returns
-------
end: bool
Indicator for UI to display successful ending.
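Examples
--------
Illustrative sketch only; the workbook ID, condition name and sheet index
are placeholders.

>>> push_mps_results_batch(batch_sim_df, workbook_id='<workbook-id>',
...                        condition_name='MPS batch reference',
...                        Sheet_index=0)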
"""
batch_sim_df.rename(columns={"Similarity": condition_name + "_Dissimilarity_measure"}, inplace=True)
for col_name in batch_sim_df.columns:
if 'Contribution' in col_name:
batch_sim_df.rename(columns={col_name: col_name + " " + condition_name}, inplace=True)
# get info from worksheet before push overwrites
wb_id = spy.workbooks.search({'ID': workbook_id},
quiet=True
)
workbook = spy.workbooks.pull(wb_id,
include_referenced_workbooks=False, include_inventory=False,
quiet=True, errors='catalog'
)[0]
worksheet_og = workbook.worksheets[Sheet_index]
current_display_items = worksheet_og.display_items
worksheet_name = "MPS results " + str(datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
# push similarity signal
push_result_2 = spy.push(data=batch_sim_df,
workbook=workbook_id,
worksheet=worksheet_name,
quiet=True
)
# push worksheet back in after overwrite
new_display_items = pd.concat([current_display_items, push_result_2], axis=0, sort=True)
lane_ = current_display_items[current_display_items['Samples Display'] == 'Line']['Lane'].max()
workbook = spy.workbooks.pull(wb_id,
include_referenced_workbooks=False, include_inventory=False,
quiet=True, errors='catalog'
)[0]
lane_count = 1
for name in batch_sim_df.columns:
i = new_display_items.loc[new_display_items['Name'] == name].index[0]
new_display_items["Samples Display"].loc[i] = "Bars"
new_display_items["Line Width"].loc[i] = 40
new_display_items["Lane"].loc[i] = lane_ + lane_count
lane_count += 1
workbook.worksheets[worksheet_name].display_items = new_display_items
workbook.worksheets[worksheet_name].display_range = worksheet_og.display_range
spy.workbooks.push(workbook, quiet=True)
push_result_2["Value Unit Of Measure"] = "%"
push_result_2["Maximum Interpolation"] = "1 sec"
spy.push(metadata=push_result_2, quiet=True)
end = True
return end
def push_mps_results(
Return_top_x, min_idx_multivar, data_pull, workbook_id, condition_name, Sheet_index, grid):
"""
This function pushes the % similarity score as a % dis-similarity time series signal to a new worksheet within the
desired workbook when mps is in continuous mode. In addition, each variable's % contribution to the
dis-similarity is also pushed as a signal per variable.
Parameters
----------
Return_top_x: int
Variable to limit number of top found capsules.
min_idx_multivar: numpy.ndarray
numpy array whose first three columns are:
1st = found capsule's similarity measurement (float, 0 to 100)
2nd = integer index of each found capsule relative to data_pull
3rd = integer describing the duration/length of the found capsule (each integer is defined by the gridding in
'pull_mps_data')
data_pull: pd.DataFrame, pd.Series
A dataframe or series that minimally has columns of "X, ..., Y" signals
requested to be pulled (detailed in items_s_ref input variable) and 'Date-Time'
with an index of timestamps.
[X, ..., Y,'Date-Time'].
This dataframe has all the time series data for the analysis/search area.
workbook_id : str
The Seeq ID of the source workbook.
condition_name: str
Name of condition to leverage in the pushed item names.
Sheet_index: int
Integer detailing the index of the source worksheet.
grid: str
Resolution/griding of the spy pull.
Returns
-------
end: bool
Indicator for UI to display successful ending.
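Examples
--------
Illustrative sketch only; the workbook ID, condition name and sheet index
are placeholders, and results is assumed to be the output of seeq_mps_mass
or seeq_mps_dtw.

>>> push_mps_results(Return_top_x=5, min_idx_multivar=results,
...                  data_pull=data_pull, workbook_id='<workbook-id>',
...                  condition_name='MPS found', Sheet_index=0, grid='1min')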
"""
worksheet_name = "MPS results " + str(datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
grid_ = {'1sec': 1, '10sec': 10, '30sec': 30, '1min': 60, '5min': 300, '10min': 600, '30min': 1800}[grid]
# exit early if no results were found
if len(min_idx_multivar) == 0:
print("No Results Found")
else:
# Prep and unzip found data
found_list = min_idx_multivar[:Return_top_x, 1].tolist()
sim_list = min_idx_multivar[:Return_top_x, 0].tolist()
known_length_list = min_idx_multivar[:Return_top_x, 2].tolist()
# use index to find datetime
push_cond = pd.DataFrame(data_pull['Date-Time'].iloc[found_list])
push_cond.columns = ["Capsule Start"]
push_cond["Capsule End"] = push_cond["Capsule Start"]
# add by known length
for i in range(len(known_length_list)):
push_cond["Capsule End"].iloc[i] = push_cond["Capsule Start"].iloc[i] + timedelta(
seconds=known_length_list[i] * grid_)
push_cond = push_cond.reset_index(drop=True)
push_cond['Date-Time'] = pd.to_datetime(push_cond.index)
push_cond['Similarity'] = sim_list
# get info from worksheet before push overwrites
wb_id = spy.workbooks.search({'ID': workbook_id},
quiet=True
)
workbook = spy.workbooks.pull(wb_id,
include_referenced_workbooks=False, include_inventory=False,
quiet=True, errors='catalog'
)[0]
worksheet_og = workbook.worksheets[Sheet_index]
current_display_items = worksheet_og.display_items
lane_ = current_display_items[current_display_items['Samples Display'] == 'Line']['Lane'].max()
# push found conditions
push_cond.reset_index(drop=True, inplace=True)
push_result = spy.push(data=push_cond, workbook=workbook_id,
worksheet=worksheet_name,
metadata=pd.DataFrame([{
'Name': condition_name,
'Type': 'Condition',
'Maximum Duration': '20d'
}]),
quiet=True
)
# create dataframe to push similarity signal
push_sig = pd.DataFrame(min_idx_multivar).drop([1, 2, 3, 4], axis=1)
push_sig = push_sig.iloc[:Return_top_x]
temp_c_list = data_pull.columns[:-1]
temp_c_list = ["% Contribution to Dissimilarity from " + x + " " + condition_name for x in temp_c_list]
name_ = condition_name + "_Dissimilarity_measure"
temp_c_list.insert(0, name_)
push_sig.columns = temp_c_list
push_sig['temp'] = push_cond["Capsule Start"]
for i in range(len(known_length_list)):
push_sig['temp'].iloc[i] = push_sig['temp'].iloc[i] + (
timedelta(seconds=(known_length_list[i]) / 2) * grid_)
push_sig.index = pd.to_datetime(push_sig.temp)
push_sig = push_sig.drop('temp', axis=1)
# change to dissimilarity
push_sig[name_] = 100 - push_sig[name_]
# push similarity signal
push_result_2 = spy.push(data=push_sig,
workbook=workbook_id,
worksheet=worksheet_name,
quiet=True
)
push_result_2["Value Unit Of Measure"] = "%"
push_result_2["Maximum Interpolation"] = "1 sec"
spy.push(metadata=push_result_2, quiet=True)
# push worksheet back in after overwrite
new_display_items = pd.concat([current_display_items, push_result, push_result_2], axis=0, sort=True)
workbook = spy.workbooks.pull(wb_id,
include_referenced_workbooks=False, include_inventory=False,
quiet=True, errors='catalog'
)[0]
lane_count = 1
for name in push_sig.columns:
i = new_display_items.loc[new_display_items['Name'] == name].index[0]
new_display_items["Samples Display"].loc[i] = "Bars"
new_display_items["Line Width"].loc[i] = 40
new_display_items["Lane"].loc[i] = lane_ + lane_count
lane_count += 1
new_display_items.reset_index(drop=True, inplace=True)
workbook.worksheets[worksheet_name].display_items = new_display_items
workbook.worksheets[worksheet_name].display_range = worksheet_og.display_range
spy.workbooks.push(workbook, quiet=True)
end = True
return end