Source code for jointly.synchronizer

import os
from typing import Dict, Optional

import numpy as np
import pandas as pd
from scipy.signal import correlate

from . import ShakeExtractor, helpers
from .abstract_extractor import AbstractExtractor
from .helpers import normalize, get_equidistant_signals
from .log import logger
from .synchronization_errors import StartEqualsEndError
from .types import SourceDict, ResultTableSpec, SyncPairTimeshift, SyncPairs


class Synchronizer:
    @property
    def extractor(self) -> AbstractExtractor:
        """Get the current extractor"""
        return self._extractor

    @extractor.setter
    def extractor(self, value: AbstractExtractor):
        if not issubclass(type(value), AbstractExtractor):
            raise TypeError("Extractor needs to be a subclass of AbstractExtractor.")
        self._extractor = value

    def __init__(
        self,
        sources: SourceDict,
        reference_source_name: str,
        extractor: Optional[AbstractExtractor] = None,
        sampling_freq: Optional[float] = None,
    ):
        """
        Create a new synchronizer. Synchronizer objects are used to remove
        time offsets and clock drift between sources by stretching and
        shifting signals based on reference points detected by an extractor.

        :param sources: a SourceDict describing the input data
        :param reference_source_name: name of the sensor to be used as the
            reference. Other sensors will be made synchronous to this sensor,
            and data from this sensor will not be modified.
        :param extractor: used to find synchronization points in the source
            data. If None, defaults to a ShakeExtractor instance.
        :param sampling_freq: override the frequency used to resample the
            input data. If None, defaults to the maximum input frequency.
        """
        self.sources = sources
        self.ref_source_name = reference_source_name
        self._check_sources()
        self.extractor = extractor if extractor is not None else ShakeExtractor()
        self.ref_signals = self._prepare_ref_signals()
        self.sampling_freq = (
            sampling_freq
            if sampling_freq is not None
            else helpers.get_max_ref_frequency(self.ref_signals)
        )

    def _check_sources(self):
        """Verify that the source dict adheres to the required format and
        that the reference source is available."""
        for source_name, source in self.sources.items():
            if "data" not in source or "ref_column" not in source:
                raise ValueError(
                    "Each source needs to have a `data` and a `ref_column` property"
                )
            if not isinstance(source["data"], pd.DataFrame):
                raise ValueError(
                    "The `data` property of each source must contain a DataFrame"
                )
            if not isinstance(source["data"].index, pd.DatetimeIndex):
                raise ValueError(
                    "The `data` DataFrame must have a pd.DatetimeIndex for each source"
                )
            if source["data"].index.duplicated().any():
                raise ValueError(
                    "The input dataframe must not have duplicate index values, "
                    "convert the data into a normalized wide format"
                )
            if (
                not isinstance(source["ref_column"], str)
                or source["ref_column"] not in source["data"].columns
            ):
                raise ValueError(
                    "Each source must have a string specifying the reference "
                    "column, and the reference column must be available in "
                    "the source's DataFrame"
                )
        if self.ref_source_name not in self.sources.keys():
            raise ValueError(
                "The reference source name must be available in the source dict"
            )

    def _prepare_ref_signals(self) -> pd.DataFrame:
        """
        Collect the reference columns from all sources and join them into a
        single dataframe. Each reference column is named after the source it
        comes from.

        :return: normalized reference signals
        """
        reference_signals = pd.DataFrame()
        for source_name, source in self.sources.items():
            signal = source["data"][source["ref_column"]].dropna()
            reference_signals = reference_signals.join(signal, how="outer")
            reference_signals.rename(
                columns={source["ref_column"]: source_name}, inplace=True
            )
        reference_signals = reference_signals.apply(normalize)
        return reference_signals
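    # A minimal sketch of a source table that passes ``_check_sources``; the
    # 100 Hz rate and the ``acc_mag`` column name are illustrative
    # assumptions, not part of this module:
    #
    #     >>> import numpy as np
    #     >>> import pandas as pd
    #     >>> index = pd.date_range("2020-01-01", periods=1000, freq="10ms")  # 100 Hz
    #     >>> wrist_df = pd.DataFrame({"acc_mag": np.random.rand(1000)}, index=index)
    #     >>> source = {"data": wrist_df, "ref_column": "acc_mag"}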
    @staticmethod
    def _get_timeshift_pair(
        dataframe: pd.DataFrame, ref_col: str, sig_col: str, segments: SyncPairs
    ) -> SyncPairTimeshift:
        """
        Return the timeshifts that synchronize sig_col to ref_col.
        Expects equidistantly sampled signals.

        :param dataframe: reference signal dataframe
        :param ref_col: name of the reference signal in segments
        :param sig_col: name of the target signal in segments
        :param segments: all detected synchronization pairs
        :return: timeshifts to align the first and second synchronization
            points of the target signal to the reference signal
        """
        timeshifts = {}
        for segment in ["first", "second"]:
            logger.debug(
                f"Calculate timeshift of {segment} segment "
                f"for {sig_col} to {ref_col}."
            )

            # extract the segment data of the reference and target signals
            ref_start, ref_end, ref_data = helpers.get_segment_data(
                dataframe, segments, ref_col, segment
            )
            sig_start, sig_end, sig_data = helpers.get_segment_data(
                dataframe, segments, sig_col, segment
            )

            # calculate the cross-correlation of the segments
            cross_corr = correlate(ref_data, sig_data)
            shift_in_samples = np.argmax(cross_corr) - len(sig_data) + 1

            # get the timestamp at which sig_segment must start to sync the signals
            max_corr_ts = dataframe.index[
                dataframe.index.get_loc(ref_start, method="nearest")
                + shift_in_samples
            ]
            logger.debug(
                f"Highest correlation with start at "
                f"{max_corr_ts} with {np.max(cross_corr)}."
            )

            # calculate the timeshift that moves the signal to maximize correlation
            timeshifts[segment] = max_corr_ts - sig_start
            logger.debug(f"Timeshift is {timeshifts[segment]}.")

        return timeshifts

    def _calculate_stretch_factors(self) -> pd.DataFrame:
        """
        Calculate the stretch factor that aligns each reference signal to the
        reference signal of the reference source, and immediately apply these
        stretch factors to a copy of ``self.ref_signals``.

        :return: a copy of self.ref_signals with the stretch factors applied
        """
        ref_signals = self.ref_signals.copy()
        start_time = ref_signals.index.min()

        # resample the reference signals equidistantly so cross-correlation works
        df_equidistant = get_equidistant_signals(ref_signals, self.sampling_freq)
        sync_pairs = self.extractor.get_segments(df_equidistant)
        helpers.verify_segments(ref_signals.columns, sync_pairs)

        for source in df_equidistant.columns:
            if source == self.ref_source_name:
                continue

            timeshifts = Synchronizer._get_timeshift_pair(
                df_equidistant, self.ref_source_name, source, sync_pairs
            )
            logger.debug(
                f"Timedelta between shifts before stretching: "
                f"{timeshifts['first'] - timeshifts['second']}"
            )
            try:
                stretch_factor = helpers.get_stretch_factor(
                    sync_pairs[source], timeshifts
                )
            except ZeroDivisionError:
                raise StartEqualsEndError(
                    "First and last segment have been identified as exactly "
                    "the same. Bad window, maybe?"
                )
            logger.info(f"Stretch factor for {source}: {stretch_factor}")

            # stretch the signal and exchange it in the dataframe
            signal_stretched = helpers.stretch_signals(
                pd.DataFrame(ref_signals[source]).dropna(),
                stretch_factor,
                start_time,
            )
            ref_signals = (
                ref_signals.drop(source, axis="columns")
                .join(signal_stretched, how="outer")
                .astype(pd.SparseDtype("float"))
            )
            self.sources[source]["stretch_factor"] = stretch_factor

        return ref_signals
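    # How ``_get_timeshift_pair`` derives the sample shift above, as a
    # minimal sketch with toy impulse signals (the arrays are illustrative):
    #
    #     >>> import numpy as np
    #     >>> from scipy.signal import correlate
    #     >>> ref = np.array([0.0, 0.0, 1.0, 0.0, 0.0])
    #     >>> sig = np.array([0.0, 1.0, 0.0, 0.0, 0.0])  # peaks one sample early
    #     >>> int(np.argmax(correlate(ref, sig)) - len(sig) + 1)
    #     1
    #
    # A positive shift means the signal has to start later to line up with
    # the reference; converted to a timestamp difference, this becomes the
    # returned timeshift.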
    def _calculate_timeshifts(self, stretched_ref_signals: pd.DataFrame):
        """
        Calculate the shift necessary to align the stretched reference
        signals to the unstretched reference sensor.

        :param stretched_ref_signals: a copy of self.ref_signals that has
            been stretched so that the duration between its synchronization
            points matches the duration between them in the reference sensor
        """
        # resample again with the stretched signals
        df_equi = get_equidistant_signals(stretched_ref_signals, self.sampling_freq)
        segments = self.extractor.get_segments(df_equi)
        helpers.verify_segments(stretched_ref_signals.columns, segments)

        for source in df_equi.columns:
            if source == self.ref_source_name:
                continue

            timeshifts = Synchronizer._get_timeshift_pair(
                df_equi, self.ref_source_name, source, segments
            )
            timedelta = timeshifts["first"] - timeshifts["second"]
            if timedelta > pd.Timedelta(0):
                logger.warning(
                    f"Timedelta between shifts after stretching: {timedelta}. "
                    f"This should be very small: the timedelta to the "
                    f"reference signal should be equal for both start and "
                    f"end, so a simple offset aligns the signals perfectly."
                )
            logger.info(f"Timeshift for {source}: {timeshifts['first']}")
            self.sources[source]["timeshift"] = timeshifts["first"]

    def _calculate_sync_params(self):
        """
        Calculate the synchronization parameters needed to sync all signals
        to the reference signal. Stores the result in ``self.sources``, under
        the keys ``timeshift`` and ``stretch_factor``.
        """
        self.sources[self.ref_source_name]["timeshift"] = None
        self.sources[self.ref_source_name]["stretch_factor"] = 1

        # first, determine the stretch factors and get the stretched reference signals
        stretched_ref_signals = self._calculate_stretch_factors()
        # second, get the timeshifts for the stretched signals
        self._calculate_timeshifts(stretched_ref_signals)
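    # Taken together, the two parameters define an affine mapping of each
    # source's timestamps; roughly, with ``start`` being the earliest
    # reference-signal timestamp (the values below are illustrative):
    #
    #     >>> import pandas as pd
    #     >>> start = pd.Timestamp("2020-01-01 00:00:00")
    #     >>> t = pd.Timestamp("2020-01-01 00:10:00")
    #     >>> stretch_factor, timeshift = 1.001, pd.Timedelta("2s")
    #     >>> start + (t - start) * stretch_factor + timeshift
    #     Timestamp('2020-01-01 00:10:02.600000')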
    def get_sync_params(self, recalculate: bool = False):
        """
        Get the synchronization params. If they have not been calculated yet,
        they will be.

        :param recalculate: force calculation, even if it was already done before
        :return: the synchronization params for each source, i.e., each
            timeshift and stretch factor
        """
        selected_keys = ["timeshift", "stretch_factor"]
        if recalculate or "timeshift" not in self.sources[self.ref_source_name]:
            self._calculate_sync_params()
        return {
            source_name: {
                key: value for key, value in source.items() if key in selected_keys
            }
            for source_name, source in self.sources.items()
        }
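    # Shape of the returned dict, assuming the illustrative "wrist"/"chest"
    # sources from above with "chest" as the reference (values are made up):
    #
    #     {
    #         "chest": {"timeshift": None, "stretch_factor": 1},
    #         "wrist": {"timeshift": Timedelta("0 days 00:00:01.42"),
    #                   "stretch_factor": 0.9998},
    #     }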
    def get_synced_data(self, recalculate: bool = False) -> Dict[str, pd.DataFrame]:
        """
        Synchronize the input data.

        :param recalculate: force recalculating the synchronization parameters
        :return: a dictionary of the shifted and stretched source signals
        """
        self.get_sync_params(recalculate)
        synced_data = {}
        start_time = self.ref_signals.index.min()
        for source_name, source in self.sources.items():
            data = source["data"].copy()
            stretch_factor, timeshift = source["stretch_factor"], source["timeshift"]
            if stretch_factor != 1:
                data = helpers.stretch_signals(data, stretch_factor, start_time)
            if timeshift is not None:
                data = data.shift(1, freq=timeshift)
            synced_data[source_name] = data
        return synced_data
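    # Continuing the illustrative setup: the reference source comes back
    # unchanged, every other source is stretched and shifted:
    #
    #     >>> synced = synchronizer.get_synced_data()
    #     >>> sorted(synced.keys())
    #     ['chest', 'wrist']
    #     >>> synced["chest"].equals(chest_df)  # reference data is not modified
    #     True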
    def save_pickles(self, target_dir: str) -> Dict[str, pd.DataFrame]:
        """
        Save a pickled, synced dataframe for each source file. Does not save
        a total table. The sync parameters are saved as ``SYNC.csv``.

        :param target_dir: target directory for the export files
        :return: the synced data, plus a sync parameter dataframe in the
            dictionary entry with the key "SYNC"
        """
        sync_params = pd.DataFrame(self.get_sync_params())
        synced_data = self.get_synced_data()

        sync_params.to_csv(os.path.join(target_dir, "SYNC.csv"))
        for source_name, synced_df in synced_data.items():
            synced_df.to_pickle(
                os.path.join(target_dir, f"{source_name.upper()}.PICKLE")
            )

        return {**synced_data, "SYNC": sync_params}
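    # For illustrative sources named "wrist" and "chest",
    # ``save_pickles("./out")`` would write:
    #
    #     ./out/SYNC.csv
    #     ./out/WRIST.PICKLE
    #     ./out/CHEST.PICKLE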
    def save_data(
        self,
        target_dir: str,
        tables: Optional[ResultTableSpec] = None,
        save_total_table: bool = True,
    ):
        """
        Export the synchronized data. Two formats are possible: if ``tables``
        is given, a file is created for each root key, containing the columns
        of the sensors specified as the keys on the second level. This can be
        used to create a file for each sensor type, see ``ResultTableSpec``
        for an example.

        A ``SYNC.csv`` is always exported to store the synchronization
        parameters that have been calculated.

        :param target_dir: target directory for the export files
        :param tables: ResultTableSpec to specify the export format, or None
        :param save_total_table: export an outer join over all synchronized
            dataframes
        """
        if tables is not None and "SYNC" in tables.keys():
            raise ValueError(
                "SYNC must not be one of the table names. "
                "It is reserved for the synchronization parameters."
            )
        if save_total_table and tables is not None and "TOTAL" in tables.keys():
            raise ValueError(
                "TOTAL must not be one of the table names "
                "if the table with all data should be saved."
            )

        sync_params = self.get_sync_params()
        synced_data = self.get_synced_data()

        # save the sync params
        pd.DataFrame(sync_params).to_csv(os.path.join(target_dir, "SYNC.csv"))

        # save custom tables
        logger.info(tables)
        if tables is not None:
            for table_name, table_spec in tables.items():
                if len(table_spec) == 0:
                    logger.warning(
                        f"Table entry {table_name} is missing any requested columns"
                    )
                    continue

                table_df = pd.DataFrame()
                for source_name, source_columns in table_spec.items():
                    # create a dataframe for each source
                    source_df = pd.DataFrame()
                    for column in source_columns:
                        try:
                            data = synced_data[source_name][column]
                        except KeyError:
                            raise ValueError(
                                f"Requested non-existing {source_name}->{column}"
                            )
                        # join the selected signal to the source dataframe
                        source_df = source_df.join(data, how="outer")

                    if not source_df.empty:
                        # add the source's signals to the table dataframe
                        source_df = source_df.rename(
                            lambda col_name: f"{source_name}_{col_name}",
                            axis="columns",
                        )
                        table_df = table_df.join(source_df, how="outer")

                table_df.dropna(axis="index", how="all", inplace=True)
                table_df.to_csv(os.path.join(target_dir, f"{table_name}.csv"))

        # save the table with all data
        if save_total_table:
            total_table = pd.DataFrame()
            for source_name, data in synced_data.items():
                source_df = data.rename(
                    lambda col_name: f"{source_name}_{col_name}",
                    axis="columns",
                )
                total_table = total_table.join(source_df, how="outer")
            total_table.to_csv(os.path.join(target_dir, "TOTAL.csv"))
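# A minimal end-to-end sketch. It assumes two DataFrames ``wrist_df`` and
# ``chest_df`` with DatetimeIndexes and a shared shake event in an "acc_mag"
# column, and that the package re-exports these names as shown:
#
#     from jointly import ShakeExtractor, Synchronizer
#
#     sources = {
#         "wrist": {"data": wrist_df, "ref_column": "acc_mag"},
#         "chest": {"data": chest_df, "ref_column": "acc_mag"},
#     }
#     synchronizer = Synchronizer(sources, reference_source_name="chest")
#     synced = synchronizer.get_synced_data()  # dict of shifted/stretched frames
#     synchronizer.save_data("./out")          # writes SYNC.csv and TOTAL.csv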