Module data_utils.padding
Expand source code
from comet_ml import Experiment # Comet.ml can log training metrics, parameters, do version control and parameter optimization
import torch # PyTorch to create and apply deep learning models
import dask.dataframe as dd # Dask to handle big data in dataframes
import numpy as np # NumPy to handle numeric and NaN operations
import warnings # Print warnings for bad practices
from . import utils # Generic and useful methods
from . import search_explore # Methods to search and explore data
from . import embedding # Embeddings and other categorical features handling methods
# Ignore Dask's warning about `meta` not being specified (the filter matches
# the warning by its message text)
warnings.filterwarnings("ignore", message="`meta` is not specified, inferred from partial data. Please provide `meta` if the result is unexpected.")
# Methods
def get_sequence_length_dict(df, id_column='subject_id', ts_column='ts'):
    '''Creates a dictionary with the original sequence lengths of a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame or dask.DataFrame
        Dataframe holding the sequences, with one row per temporal event.
    id_column : string or int, default 'subject_id'
        Name, or positional index, of the column which corresponds to the
        subject identifier in the dataframe.
    ts_column : string or int, default 'ts'
        Name, or positional index, of the column which corresponds to the
        timestamp in the dataframe.

    Returns
    -------
    seq_len_dict : dictionary
        Dictionary containing the original sequence lengths of the dataframe.
        The keys are the sequence identifiers (the values found in the
        id_column) and the values are the number of non-missing timestamps
        of each sequence.
    '''
    column_names = list(df.columns)
    both_indices = isinstance(id_column, int) and isinstance(ts_column, int)

    def _to_name(column):
        # Two integers are always read as positions (historical behaviour).
        # A single integer is read as a position only when it is not itself
        # a column label, so that mixing a name with an index also works.
        if isinstance(column, int) and (both_indices or column not in column_names):
            return column_names[column]
        return column

    id_column = _to_name(id_column)
    ts_column = _to_name(ts_column)
    # Sequence length (number of temporal events) of each sequence (patient);
    # note that `count` leaves out rows with a missing timestamp
    seq_len = df.groupby(id_column)[ts_column].count()
    if hasattr(seq_len, 'compute'):
        # Lazy results (e.g. from Dask) must be computed before being read
        seq_len = seq_len.compute()
    return dict(zip(seq_len.index, seq_len.to_numpy()))
def dataframe_to_padded_tensor(df, seq_len_dict=None, id_column='subject_id',
                               ts_column='ts', label_column='label',
                               bool_feat=None, data_type='PyTorch',
                               padding_value=999999, total_length=None,
                               inplace=False):
    '''Converts a Pandas dataframe into a padded NumPy array or PyTorch Tensor.

    Parameters
    ----------
    df : pandas.DataFrame or dask.DataFrame
        Data in a dataframe format which will be padded and converted
        to the requested data type.
    seq_len_dict : dictionary, default None
        Dictionary containing the original sequence lengths of the dataframe.
        The keys should be the sequence identifiers (the values found in
        the id_column) and the values should be the length of each sequence.
        If not specified, it is obtained with `get_sequence_length_dict`.
    id_column : string, default 'subject_id'
        Name of the column which corresponds to the subject identifier in the
        dataframe.
    ts_column : string, default 'ts'
        Name of the column which corresponds to the timestamp in the
        dataframe.
    label_column : string, default 'label'
        Name of the label column. It is only used to keep that column out of
        the automatically detected boolean features.
    bool_feat : string or list of strings, default None
        Name(s) of the boolean feature(s) of the dataframe. In order to prevent
        confounding padding values with encodings, these features have
        their padding values replaced with 0. If not specified, the method
        will automatically look for boolean columns in the dataframe. If you
        don't want any feature to be treated as a boolean dtype, set
        `bool_feat=[]`. A list that isn't made only of strings is used as
        given (presumably column indices; confirm with callers).
    data_type : string, default 'PyTorch'
        Indication of what kind of output data type is desired. In case it's
        set as 'NumPy', the function outputs a NumPy array. If it's 'PyTorch',
        the function outputs a PyTorch tensor.
    padding_value : numeric, default 999999
        Value to use in the padding, to fill the sequences.
    total_length : int, default None
        If not None, the output will be padded to have length total_length.
        Otherwise, the largest length in seq_len_dict is used.
    inplace : bool, default False
        If set to True, the original dataframe is used directly. Otherwise,
        the function works on a copy of it.

    Returns
    -------
    arr : torch.Tensor or numpy.ndarray
        PyTorch tensor or NumPy array version of the dataframe, after being
        padded with the specified padding value to have a fixed sequence
        length. Its shape is (n_ids, total_length, n_columns) when there is
        more than one sequence and (total_length, n_columns) otherwise.

    Raises
    ------
    ValueError
        If total_length is less than the length of the longest sequence
        in the data.
    Exception
        If data_type isn't a string naming NumPy or PyTorch, or if bool_feat
        is neither None, a string nor a list.
    '''
    # Validate the requested output type up front, before any heavy work
    if not isinstance(data_type, str):
        raise Exception('ERROR: Please provide the desirable data type in a string format.')
    output_type = data_type.lower()
    if output_type not in ('numpy', 'pytorch'):
        raise Exception('ERROR: Unavailable data type. Please choose either NumPy or PyTorch.')
    # Work on a copy unless asked to use the original dataframe
    data_df = df if inplace else df.copy()
    if seq_len_dict is None:
        # Find the sequence lengths and store them in a dictionary
        seq_len_dict = get_sequence_length_dict(data_df, id_column, ts_column)
    # Fetch the number of unique sequence IDs
    n_ids = data_df[id_column].nunique()
    if hasattr(n_ids, 'compute'):
        # Lazy results (e.g. from Dask) must be computed before being used
        n_ids = n_ids.compute()
    # Get the number of columns in the dataframe
    n_inputs = len(data_df.columns)
    if total_length is None:
        # Max sequence length registered in the dictionary
        total_length = max(seq_len_dict.values())
    multiple_sequences = n_ids > 1
    if multiple_sequences:
        # All the unique identifiers (e.g. each patient in the dataset)
        unique_ids = data_df[id_column].unique()
    else:
        unique_ids = [data_df[id_column].iloc[0]]
    # Original length of each sequence, in the same order as `unique_ids`
    seq_lens = [seq_len_dict[idt] for idt in unique_ids]
    longest = max(seq_lens)
    if longest > total_length:
        raise ValueError(f'total_length ({total_length}) is smaller than the longest '
                         f'sequence in the data ({longest}).')
    if multiple_sequences:
        # Start from an array filled with padding, so that only the real
        # values of each sequence need to be written
        arr = np.full((n_ids, total_length, n_inputs), padding_value, dtype=float)
        for row, (idt, seq_len) in enumerate(zip(unique_ids, seq_lens)):
            arr[row, :seq_len, :] = data_df[data_df[id_column] == idt].to_numpy()
    else:
        arr = np.full((total_length, n_inputs), padding_value, dtype=float)
        arr[:seq_lens[0], :] = data_df.to_numpy()
    if bool_feat is None:
        # Find the boolean columns in the dataframe
        bool_feat = search_explore.list_boolean_columns(data_df)
        # Make sure that none of the ID columns are considered boolean
        bool_feat = list(set(bool_feat) - {id_column, ts_column, label_column})
        # Get the indices of the boolean features
        bool_feat = [search_explore.find_col_idx(data_df, feature) for feature in bool_feat]
    elif isinstance(bool_feat, str):
        # Get the index of the single boolean feature, in a list format
        bool_feat = [search_explore.find_col_idx(data_df, bool_feat)]
    elif not isinstance(bool_feat, list):
        raise Exception(f'ERROR: The `bool_feat` argument must be specified as either a single string or a list of strings. Received input with type {type(bool_feat)}.')
    elif all(isinstance(feat, str) for feat in bool_feat):
        # Convert from the feature's name to its index
        bool_feat = [search_explore.find_col_idx(data_df, feat) for feat in bool_feat]
    if len(bool_feat) > 0:
        # Replace each padding value in the boolean features with zero
        if multiple_sequences:
            for row, seq_len in enumerate(seq_lens):
                arr[row, seq_len:, bool_feat] = 0
        else:
            arr[seq_lens[0]:, bool_feat] = 0
    if output_type == 'numpy':
        return arr
    return torch.from_numpy(arr)
def sort_by_seq_len(data, seq_len_dict, labels=None, id_column=0):
    '''Sort the data by descending sequence length in order to correctly apply
    it to a PyTorch neural network.

    Parameters
    ----------
    data : torch.Tensor
        Data tensor on which sorting by sequence length will be applied. It is
        indexed as (sequence, time step, feature).
    seq_len_dict : dict
        Dictionary containing the sequence lengths for each sequence
        identifier of the original dataframe. This allows to ignore the
        padding done in the fixed sequence length tensor.
    labels : torch.Tensor, default None
        Labels corresponding to the data used. Its first dimension must match
        the sequences in `data`; any number of further dimensions is accepted.
    id_column : int, default 0
        Number of the column which corresponds to the subject identifier in
        the data tensor.

    Returns
    -------
    sorted_data : torch.Tensor
        Data tensor already sorted by sequence length. It is the input tensor
        itself when the data was already sorted.
    sorted_labels : torch.Tensor
        Labels tensor already sorted by sequence length. Only output if the
        labels data is specified in the input.
    x_lengths : list of int
        Sorted list of sequence lengths, relative to the input data.
    '''
    # The identifier is read from the first time step of each sequence;
    # `tolist` also works on tensors that track gradients or live on a GPU,
    # which `.numpy()` refuses to convert
    seq_ids = data[:, 0, id_column].tolist()
    # Get the original lengths of the sequences, for the input data
    x_lengths = [seq_len_dict[seq_id] for seq_id in seq_ids]
    already_sorted = all(cur >= nxt for cur, nxt in zip(x_lengths, x_lengths[1:]))
    if already_sorted:
        # Do nothing if it's already sorted
        sorted_data, sorted_labels = data, labels
    else:
        # Indices that put the sequences in descending order of length
        order = np.argsort(x_lengths)[::-1].tolist()
        x_lengths = [x_lengths[idx] for idx in order]
        # Indexing only the first dimension keeps every remaining dimension,
        # so labels with any number of dimensions are supported
        sorted_data = data[order]
        sorted_labels = labels[order] if labels is not None else None
    if labels is None:
        return sorted_data, x_lengths
    return sorted_data, sorted_labels, x_lengths
def pad_list(x_list, length, padding_value=999999):
    '''Extend a list with a filler value so that it reaches a target length.

    Parameters
    ----------
    x_list : list
        List to pad. It is left untouched; a new list is returned.
    length : int
        Target length of the padded list.
    padding_value : numeric, default 999999
        Value appended as many times as needed to reach `length`.

    Returns
    -------
    x_list : list
        New list made of the original items followed by the padding. When
        `length` doesn't exceed the current length, no padding is added.
    '''
    # A non-positive repeat count gives an empty filler, so lists that are
    # already long enough come out with the same items
    n_missing = length - len(x_list)
    filler = [padding_value] * n_missing
    return x_list + filler
Functions
def dataframe_to_padded_tensor(df, seq_len_dict=None, id_column='subject_id', ts_column='ts', label_column='label', bool_feat=None, data_type='PyTorch', padding_value=999999, total_length=None, inplace=False)
-
Converts a Pandas dataframe into a padded NumPy array or PyTorch Tensor.
Parameters
df : pandas.DataFrame or dask.DataFrame
- Data in a Pandas dataframe format which will be padded and converted to the requested data type.
seq_len_dict : dictionary, default None
- Dictionary containing the original sequence lengths of the dataframe. The keys should be the sequence identifiers (the numbers obtained from the id_column) and the values should be the length of each sequence.
id_column : string, default 'subject_id'
- Name of the column which corresponds to the subject identifier in the dataframe.
ts_column : string, default 'ts'
- Name of the column which corresponds to the timestamp in the dataframe.
bool_feat : string or list of strings, default None
- Name(s) of the boolean feature(s) of the dataframe. In order to prevent
confounding padding values with encodings, these features must have
their padding values replaced with 0. If not specified, the method
will automatically look for boolean columns in the dataframe. If you
don't want any feature to be treated as a boolean dtype, set
bool_feat=[]
data_type : string, default 'PyTorch'
- Indication of what kind of output data type is desired. In case it's set as 'NumPy', the function outputs a NumPy array. If it's 'PyTorch', the function outputs a PyTorch tensor.
padding_value : numeric
- Value to use in the padding, to fill the sequences.
total_length : int, default None
- If not None, the output will be padded to have length total_length. This method will throw ValueError if total_length is less than the max sequence length in sequence.
inplace : bool, default False
- If set to True, the original dataframe will be used and modified directly. Otherwise, a copy will be created and returned, without changing the original dataframe.
Returns
arr : torch.Tensor or numpy.ndarray
- PyTorch tensor or NumPy array version of the dataframe, after being padded with the specified padding value to have a fixed sequence length.
Expand source code
def dataframe_to_padded_tensor(df, seq_len_dict=None, id_column='subject_id',
                               ts_column='ts', label_column='label',
                               bool_feat=None, data_type='PyTorch',
                               padding_value=999999, total_length=None,
                               inplace=False):
    '''Converts a Pandas dataframe into a padded NumPy array or PyTorch Tensor.

    Parameters
    ----------
    df : pandas.DataFrame or dask.DataFrame
        Data in a dataframe format which will be padded and converted
        to the requested data type.
    seq_len_dict : dictionary, default None
        Dictionary containing the original sequence lengths of the dataframe.
        The keys should be the sequence identifiers (the values found in
        the id_column) and the values should be the length of each sequence.
        If not specified, it is obtained with `get_sequence_length_dict`.
    id_column : string, default 'subject_id'
        Name of the column which corresponds to the subject identifier in the
        dataframe.
    ts_column : string, default 'ts'
        Name of the column which corresponds to the timestamp in the
        dataframe.
    label_column : string, default 'label'
        Name of the label column. It is only used to keep that column out of
        the automatically detected boolean features.
    bool_feat : string or list of strings, default None
        Name(s) of the boolean feature(s) of the dataframe. In order to prevent
        confounding padding values with encodings, these features have
        their padding values replaced with 0. If not specified, the method
        will automatically look for boolean columns in the dataframe. If you
        don't want any feature to be treated as a boolean dtype, set
        `bool_feat=[]`. A list that isn't made only of strings is used as
        given (presumably column indices; confirm with callers).
    data_type : string, default 'PyTorch'
        Indication of what kind of output data type is desired. In case it's
        set as 'NumPy', the function outputs a NumPy array. If it's 'PyTorch',
        the function outputs a PyTorch tensor.
    padding_value : numeric, default 999999
        Value to use in the padding, to fill the sequences.
    total_length : int, default None
        If not None, the output will be padded to have length total_length.
        Otherwise, the largest length in seq_len_dict is used.
    inplace : bool, default False
        If set to True, the original dataframe is used directly. Otherwise,
        the function works on a copy of it.

    Returns
    -------
    arr : torch.Tensor or numpy.ndarray
        PyTorch tensor or NumPy array version of the dataframe, after being
        padded with the specified padding value to have a fixed sequence
        length. Its shape is (n_ids, total_length, n_columns) when there is
        more than one sequence and (total_length, n_columns) otherwise.

    Raises
    ------
    ValueError
        If total_length is less than the length of the longest sequence
        in the data.
    Exception
        If data_type isn't a string naming NumPy or PyTorch, or if bool_feat
        is neither None, a string nor a list.
    '''
    # Validate the requested output type up front, before any heavy work
    if not isinstance(data_type, str):
        raise Exception('ERROR: Please provide the desirable data type in a string format.')
    output_type = data_type.lower()
    if output_type not in ('numpy', 'pytorch'):
        raise Exception('ERROR: Unavailable data type. Please choose either NumPy or PyTorch.')
    # Work on a copy unless asked to use the original dataframe
    data_df = df if inplace else df.copy()
    if seq_len_dict is None:
        # Find the sequence lengths and store them in a dictionary
        seq_len_dict = get_sequence_length_dict(data_df, id_column, ts_column)
    # Fetch the number of unique sequence IDs
    n_ids = data_df[id_column].nunique()
    if hasattr(n_ids, 'compute'):
        # Lazy results (e.g. from Dask) must be computed before being used
        n_ids = n_ids.compute()
    # Get the number of columns in the dataframe
    n_inputs = len(data_df.columns)
    if total_length is None:
        # Max sequence length registered in the dictionary
        total_length = max(seq_len_dict.values())
    multiple_sequences = n_ids > 1
    if multiple_sequences:
        # All the unique identifiers (e.g. each patient in the dataset)
        unique_ids = data_df[id_column].unique()
    else:
        unique_ids = [data_df[id_column].iloc[0]]
    # Original length of each sequence, in the same order as `unique_ids`
    seq_lens = [seq_len_dict[idt] for idt in unique_ids]
    longest = max(seq_lens)
    if longest > total_length:
        raise ValueError(f'total_length ({total_length}) is smaller than the longest '
                         f'sequence in the data ({longest}).')
    if multiple_sequences:
        # Start from an array filled with padding, so that only the real
        # values of each sequence need to be written
        arr = np.full((n_ids, total_length, n_inputs), padding_value, dtype=float)
        for row, (idt, seq_len) in enumerate(zip(unique_ids, seq_lens)):
            arr[row, :seq_len, :] = data_df[data_df[id_column] == idt].to_numpy()
    else:
        arr = np.full((total_length, n_inputs), padding_value, dtype=float)
        arr[:seq_lens[0], :] = data_df.to_numpy()
    if bool_feat is None:
        # Find the boolean columns in the dataframe
        bool_feat = search_explore.list_boolean_columns(data_df)
        # Make sure that none of the ID columns are considered boolean
        bool_feat = list(set(bool_feat) - {id_column, ts_column, label_column})
        # Get the indices of the boolean features
        bool_feat = [search_explore.find_col_idx(data_df, feature) for feature in bool_feat]
    elif isinstance(bool_feat, str):
        # Get the index of the single boolean feature, in a list format
        bool_feat = [search_explore.find_col_idx(data_df, bool_feat)]
    elif not isinstance(bool_feat, list):
        raise Exception(f'ERROR: The `bool_feat` argument must be specified as either a single string or a list of strings. Received input with type {type(bool_feat)}.')
    elif all(isinstance(feat, str) for feat in bool_feat):
        # Convert from the feature's name to its index
        bool_feat = [search_explore.find_col_idx(data_df, feat) for feat in bool_feat]
    if len(bool_feat) > 0:
        # Replace each padding value in the boolean features with zero
        if multiple_sequences:
            for row, seq_len in enumerate(seq_lens):
                arr[row, seq_len:, bool_feat] = 0
        else:
            arr[seq_lens[0]:, bool_feat] = 0
    if output_type == 'numpy':
        return arr
    return torch.from_numpy(arr)
def get_sequence_length_dict(df, id_column='subject_id', ts_column='ts')
-
Creates a dictionary with the original sequence lengths of a dataframe.
Parameters
df : pandas.DataFrame or dask.DataFrame
- Data in a Pandas dataframe format which will be padded and converted to the requested data type.
id_column : string or int, default 'subject_id'
- Name of the column which corresponds to the subject identifier in the dataframe.
ts_column : string or int, default 'ts'
- Name of the column which corresponds to the timestamp in the dataframe.
Returns
seq_len_dict : dictionary, default None
- Dictionary containing the original sequence lengths of the dataframe. The keys should be the sequence identifiers (the numbers obtained from the id_column) and the values should be the length of each sequence.
Expand source code
def get_sequence_length_dict(df, id_column='subject_id', ts_column='ts'):
    '''Creates a dictionary with the original sequence lengths of a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame or dask.DataFrame
        Dataframe holding the sequences, with one row per temporal event.
    id_column : string or int, default 'subject_id'
        Name, or positional index, of the column which corresponds to the
        subject identifier in the dataframe.
    ts_column : string or int, default 'ts'
        Name, or positional index, of the column which corresponds to the
        timestamp in the dataframe.

    Returns
    -------
    seq_len_dict : dictionary
        Dictionary containing the original sequence lengths of the dataframe.
        The keys are the sequence identifiers (the values found in the
        id_column) and the values are the number of non-missing timestamps
        of each sequence.
    '''
    column_names = list(df.columns)
    both_indices = isinstance(id_column, int) and isinstance(ts_column, int)

    def _to_name(column):
        # Two integers are always read as positions (historical behaviour).
        # A single integer is read as a position only when it is not itself
        # a column label, so that mixing a name with an index also works.
        if isinstance(column, int) and (both_indices or column not in column_names):
            return column_names[column]
        return column

    id_column = _to_name(id_column)
    ts_column = _to_name(ts_column)
    # Sequence length (number of temporal events) of each sequence (patient);
    # note that `count` leaves out rows with a missing timestamp
    seq_len = df.groupby(id_column)[ts_column].count()
    if hasattr(seq_len, 'compute'):
        # Lazy results (e.g. from Dask) must be computed before being read
        seq_len = seq_len.compute()
    return dict(zip(seq_len.index, seq_len.to_numpy()))
def pad_list(x_list, length, padding_value=999999)
-
Pad a list with a specific padding value until the desired length is met.
Parameters
x_list : list
- List which will be padded.
length : int
- Desired length for the final padded list.
padding_value : numeric
- Value to use in the padding, to fill the sequences.
Returns
x_list : list
- Resulting padded list
Expand source code
def pad_list(x_list, length, padding_value=999999):
    '''Extend a list with a filler value so that it reaches a target length.

    Parameters
    ----------
    x_list : list
        List to pad. It is left untouched; a new list is returned.
    length : int
        Target length of the padded list.
    padding_value : numeric, default 999999
        Value appended as many times as needed to reach `length`.

    Returns
    -------
    x_list : list
        New list made of the original items followed by the padding. When
        `length` doesn't exceed the current length, no padding is added.
    '''
    # A non-positive repeat count gives an empty filler, so lists that are
    # already long enough come out with the same items
    n_missing = length - len(x_list)
    filler = [padding_value] * n_missing
    return x_list + filler
def sort_by_seq_len(data, seq_len_dict, labels=None, id_column=0)
-
Sort the data by sequence length in order to correctly apply it to a PyTorch neural network.
Parameters
data : torch.Tensor
- Data tensor on which sorting by sequence length will be applied.
seq_len_dict : dict
- Dictionary containing the sequence lengths for each index of the original dataframe. This allows to ignore the padding done in the fixed sequence length tensor.
labels : torch.Tensor, default None
- Labels corresponding to the data used, either specified in the input or all the data that the interpreter has.
id_column : int, default 0
- Number of the column which corresponds to the subject identifier in the data tensor.
Returns
sorted_data : torch.Tensor, default None
- Data tensor already sorted by sequence length.
sorted_labels : torch.Tensor, default None
- Labels tensor already sorted by sequence length. Only output if the labels data is specified in the input.
x_lengths : list of int
- Sorted list of sequence lengths, relative to the input data.
Expand source code
def sort_by_seq_len(data, seq_len_dict, labels=None, id_column=0):
    '''Sort the data by descending sequence length in order to correctly apply
    it to a PyTorch neural network.

    Parameters
    ----------
    data : torch.Tensor
        Data tensor on which sorting by sequence length will be applied. It is
        indexed as (sequence, time step, feature).
    seq_len_dict : dict
        Dictionary containing the sequence lengths for each sequence
        identifier of the original dataframe. This allows to ignore the
        padding done in the fixed sequence length tensor.
    labels : torch.Tensor, default None
        Labels corresponding to the data used. Its first dimension must match
        the sequences in `data`; any number of further dimensions is accepted.
    id_column : int, default 0
        Number of the column which corresponds to the subject identifier in
        the data tensor.

    Returns
    -------
    sorted_data : torch.Tensor
        Data tensor already sorted by sequence length. It is the input tensor
        itself when the data was already sorted.
    sorted_labels : torch.Tensor
        Labels tensor already sorted by sequence length. Only output if the
        labels data is specified in the input.
    x_lengths : list of int
        Sorted list of sequence lengths, relative to the input data.
    '''
    # The identifier is read from the first time step of each sequence;
    # `tolist` also works on tensors that track gradients or live on a GPU,
    # which `.numpy()` refuses to convert
    seq_ids = data[:, 0, id_column].tolist()
    # Get the original lengths of the sequences, for the input data
    x_lengths = [seq_len_dict[seq_id] for seq_id in seq_ids]
    already_sorted = all(cur >= nxt for cur, nxt in zip(x_lengths, x_lengths[1:]))
    if already_sorted:
        # Do nothing if it's already sorted
        sorted_data, sorted_labels = data, labels
    else:
        # Indices that put the sequences in descending order of length
        order = np.argsort(x_lengths)[::-1].tolist()
        x_lengths = [x_lengths[idx] for idx in order]
        # Indexing only the first dimension keeps every remaining dimension,
        # so labels with any number of dimensions are supported
        sorted_data = data[order]
        sorted_labels = labels[order] if labels is not None else None
    if labels is None:
        return sorted_data, x_lengths
    return sorted_data, sorted_labels, x_lengths