Module data_utils.datasets
Source code
from torch.utils.data import Dataset # PyTorch base dataset class
import inspect # Inspect methods and their arguments
from glob import glob # List files that follow certain path and name rules
from . import padding # Padding and variable sequence length related methods
from .embedding import embedding_bag_pipeline # Categorical embedding method
import data_utils as du
# Pandas to handle the data in dataframes
if du.use_modin is True:
import modin.pandas as pd
else:
import pandas as pd
class Tabular_Dataset(Dataset):
    '''A dataset object for tabular data, which separates features from
    labels and allows data to be loaded iteratively.
Parameters
----------
df : pandas.DataFrame or dask.DataFrame
Dataframe used to search for the label column.
arr : torch.Tensor or numpy.ndarray
        The data itself, which will be separated into features and labels,
        to be loaded by this dataset or downstream dataloader objects.
label_name : string, default None
Name of the label column.
'''
def __init__(self, df, arr, label_name=None):
        # Search the dataframe's columns for the label column, giving an
        # explicitly passed label_name priority over the 'label' substring
        for col_num, col in enumerate(df.columns):
            if col == label_name or (label_name is None and 'label' in col):
                # Column name corresponding to the label
                self.label_column_name = col
                # Column number corresponding to the label
                self.label_column_num = col_num
                break
# Column numbers corresponding to the features
self.features_columns = (list(range(self.label_column_num))
+ list(range(self.label_column_num + 1, arr.shape[1])))
# Features
self.X = arr[:, self.features_columns]
# Labels
        self.y = arr[:, self.label_column_num]
def __getitem__(self, item):
x_i = self.X[item]
y_i = self.y[item]
return x_i, y_i
def __len__(self):
return len(self.X)
class Time_Series_Dataset(Dataset):
    '''A dataset object for time series data, which separates features from
    labels and allows data to be loaded iteratively.
Parameters
----------
df : pandas.DataFrame or dask.DataFrame
Dataframe used to search for the label column.
arr : torch.Tensor or numpy.ndarray, default None
        The data itself, which will be separated into features and labels,
        to be loaded by this dataset or downstream dataloader objects.
If it is not specified, the data will be used directly from the
dataframe `df`.
label_name : string, default None
Name of the label column.
    id_column : string, default 'subject_id'
        Name of the sequence ID column.
    ts_column : string, default 'ts'
        Name of the timestamp column.
    The following parameters only apply when `arr` is None:
seq_len_dict : dictionary, default None
Dictionary containing the original sequence lengths of the dataframe.
The keys should be the sequence identifiers (the numbers obtained from
the id_column) and the values should be the length of each sequence.
embedding_layer : torch.nn.EmbeddingBag or torch.nn.ModuleList or torch.nn.ModuleDict
PyTorch layer(s) that applies the embedding bag, i.e. calculates the
average embedding based on multiple encoded values.
    embed_features : int or list of int or str or list of str or list of list of int
        Index (or indices) or name(s) of the categorical column(s) that will
        be run through its (or their) respective embedding layer(s). These
        features are removed from the data tensor after the embedding columns
        are added. In case the input data is in a dataframe format, the
        features can be specified by name.
padding_value : numeric, default 999999
Value to use in the padding, to fill the sequences.
'''
def __init__(self, df, arr=None, label_name=None, id_column='subject_id',
ts_column='ts', seq_len_dict=None, embedding_layer=None,
embed_features=None, padding_value=999999):
if arr is None:
self.data_type = 'dataframe'
self.id_column_name = id_column
self.ts_column_name = ts_column
self.padding_value = padding_value
            if embedding_layer is not None and embed_features is not None:
                self.embedding_layer = embedding_layer
                self.embed_features = embed_features
            else:
                # Ensure the attributes always exist, since __getitem__ checks them
                self.embedding_layer = None
                self.embed_features = None
else:
self.data_type = 'tensor'
        # Search the dataframe's columns for the label column, giving an
        # explicitly passed label_name priority over the 'label' substring
        for col_num, col in enumerate(df.columns):
            if col == label_name or (label_name is None and 'label' in col):
                # Column name corresponding to the label
                self.label_column_name = col
                # Column number corresponding to the label
                self.label_column_num = col_num
                break
if self.data_type == 'tensor':
# Column numbers corresponding to the features
self.features_columns_num = (list(range(self.label_column_num))
+ list(range(self.label_column_num + 1, arr.shape[2])))
# Features
self.X = arr[:, :, self.features_columns_num]
# Labels
self.y = arr[:, :, self.label_column_num]
elif self.data_type == 'dataframe':
# Column names corresponding to the features
self.features_columns = list(df.columns)
self.features_columns.remove(self.label_column_name)
# Column numbers corresponding to the features
self.features_columns_num = (list(range(self.label_column_num))
+ list(range(self.label_column_num + 1, len(df.columns))))
# Features
self.X = df[self.features_columns]
# Labels
self.y = df[self.label_column_name]
# List of items (sequences)
self.seq_items = df[id_column].unique()
# Sequence length dictionary
if seq_len_dict is None:
self.seq_len_dict = padding.get_sequence_length_dict(df, id_column=id_column,
ts_column=ts_column)
else:
self.seq_len_dict = seq_len_dict
def __getitem__(self, item):
if self.data_type == 'tensor':
# Get the data
x_t = self.X[item]
y_t = self.y[item]
elif self.data_type == 'dataframe':
# Get the sequence ID
seq = self.seq_items[item]
            # Find the rows of the dataframe belonging to the current sequence
            idx = self.X.index[self.X[self.id_column_name] == seq]
            # Get the data (idx holds index labels, so use loc)
            x_t = self.X.loc[idx]
            y_t = self.y.loc[idx]
if self.embed_features is not None:
# Run each embedding layer on each respective feature, adding the
# resulting embedding values to the tensor and removing the original,
# categorical encoded columns
x_t = embedding_bag_pipeline(x_t, self.embedding_layer, self.embed_features,
inplace=True)
# Pad the data (both X and y)
df = pd.concat([x_t, y_t], axis=1)
df = padding.dataframe_to_padded_tensor(df, seq_len_dict=self.seq_len_dict,
id_column=self.id_column_name,
ts_column=self.ts_column_name,
padding_value=self.padding_value,
inplace=True)
# Features
x_t = df[:, :-1]
# Labels
y_t = df[:, -1]
return x_t, y_t
def __len__(self):
if self.data_type == 'tensor':
return len(self.X)
elif self.data_type == 'dataframe':
return len(self.seq_items)
class Large_Dataset(Dataset):
    '''A generic dataset object for any data type, including large datasets,
    which lazily loads data from individual data files, one file per
    iteration.
Parameters
----------
    files_name : string
Core name that is shared by the data files.
process_pipeline : function
Python function that preprocesses data in each data loading
iteration.
    id_column : string
Name of the sequence or sample ID column.
    initial_analysis : function, default None
Python function that runs in the dataset initialization, which can
be used to retrieve additional parameters.
    files_path : string, default ''
Name of the directory where the data files are stored.
'''
def __init__(self, files_name, process_pipeline, id_column,
initial_analysis=None, files_path='', **kwargs):
# Load the file names
self.files = glob(f'{files_path}{files_name}_*.ftr')
# Data preprocessing pipeline function
self.process_pipeline = process_pipeline
# Other basic data information
self.id_column_name = id_column
        # Add additional data that the user might have specified
self.__dict__.update(kwargs)
# Initial analysis pipeline to fetch important, context specific information
self.initial_analysis = initial_analysis
if self.initial_analysis is not None:
# Run the initial analysis
self.initial_analysis(self)
def __getitem__(self, item):
# Load a data file
df = pd.read_feather(self.files[item])
# Run the data preprocessing pipeline, which should return the features
# and label tensors
features, labels = self.process_pipeline(self, df)
return features, labels
def __len__(self):
return len(self.files)
Classes
class Large_Dataset (files_name, process_pipeline, id_column, initial_analysis=None, files_path='', **kwargs)
A generic dataset object for any data type, including large datasets, which lazily loads data from individual data files, one file per iteration.

Parameters
----------
files_name : string
    Core name that is shared by the data files.
process_pipeline : function
    Python function that preprocesses data in each data loading iteration.
id_column : string
    Name of the sequence or sample ID column.
initial_analysis : function, default None
    Python function that runs in the dataset initialization, which can be used to retrieve additional parameters.
files_path : string, default ''
    Name of the directory where the data files are stored.
Ancestors
- torch.utils.data.dataset.Dataset
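As a usage illustration, here is a minimal sketch of wiring a Large_Dataset into a PyTorch DataLoader. The file layout (data/train_0.ftr, data/train_1.ftr, ...) and the my_pipeline function are hypothetical, invented for this example; only the constructor arguments come from the documentation above.

import torch
from torch.utils.data import DataLoader
from data_utils.datasets import Large_Dataset

# Hypothetical preprocessing pipeline: receives the dataset object and the
# dataframe loaded from one file, and must return (features, labels) tensors
def my_pipeline(dataset, df):
    labels = torch.tensor(df['label'].values)
    features = torch.tensor(df.drop(columns=['label']).values)
    return features, labels

# Assumes feather files named data/train_0.ftr, data/train_1.ftr, ...
dataset = Large_Dataset(files_name='train', process_pipeline=my_pipeline,
                        id_column='subject_id', files_path='data/')
# batch_size=None disables automatic batching: each item is one whole file
loader = DataLoader(dataset, batch_size=None)
for features, labels in loader:
    pass  # train on one file's worth of data at a time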
class Tabular_Dataset (df, arr, label_name=None)
A dataset object for tabular data, which separates features from labels and allows data to be loaded iteratively.

Parameters
----------
df : pandas.DataFrame or dask.DataFrame
    Dataframe used to search for the label column.
arr : torch.Tensor or numpy.ndarray
    The data itself, which will be separated into features and labels, to be loaded by this dataset or downstream dataloader objects.
label_name : string, default None
    Name of the label column.
Ancestors
- torch.utils.data.dataset.Dataset
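A minimal usage sketch, assuming a small hypothetical dataframe whose label column is literally named 'label', so the automatic label search finds it without passing label_name:

import pandas as pd
import torch
from torch.utils.data import DataLoader
from data_utils.datasets import Tabular_Dataset

# Hypothetical tabular data: two feature columns plus a label column
df = pd.DataFrame({'feat_a': [0.1, 0.5, 0.9],
                   'feat_b': [1.0, 2.0, 3.0],
                   'label': [0, 1, 0]})
# The dataframe is only used to locate the label column; the tensor holds the data
arr = torch.tensor(df.values)

dataset = Tabular_Dataset(df, arr)
loader = DataLoader(dataset, batch_size=2, shuffle=True)
x, y = next(iter(loader))  # x has shape [2, 2], y has shape [2]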
class Time_Series_Dataset (df, arr=None, label_name=None, id_column='subject_id', ts_column='ts', seq_len_dict=None, embedding_layer=None, embed_features=None, padding_value=999999)
A dataset object for time series data, which separates features from labels and allows data to be loaded iteratively.

Parameters
----------
df : pandas.DataFrame or dask.DataFrame
    Dataframe used to search for the label column.
arr : torch.Tensor or numpy.ndarray, default None
    The data itself, which will be separated into features and labels, to be loaded by this dataset or downstream dataloader objects. If it is not specified, the data will be used directly from the dataframe `df`.
label_name : string, default None
    Name of the label column.
id_column : string, default 'subject_id'
    Name of the sequence ID column.
ts_column : string, default 'ts'
    Name of the timestamp column.

The following parameters only apply when `arr` is None:

seq_len_dict : dictionary, default None
    Dictionary containing the original sequence lengths of the dataframe. The keys should be the sequence identifiers (the numbers obtained from the id_column) and the values should be the length of each sequence.
embedding_layer : torch.nn.EmbeddingBag or torch.nn.ModuleList or torch.nn.ModuleDict
    PyTorch layer(s) that applies the embedding bag, i.e. calculates the average embedding based on multiple encoded values.
embed_features : int or list of int or str or list of str or list of list of int
    Index (or indices) or name(s) of the categorical column(s) that will be run through its (or their) respective embedding layer(s). These features are removed from the data tensor after the embedding columns are added. In case the input data is in a dataframe format, the features can be specified by name.
padding_value : numeric, default 999999
    Value to use in the padding, to fill the sequences.
Ancestors
- torch.utils.data.dataset.Dataset
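A minimal usage sketch of the dataframe mode (arr=None), with a hypothetical two-sequence dataframe that uses the default 'subject_id' and 'ts' column names; each retrieved item is one padded sequence:

import pandas as pd
from data_utils.datasets import Time_Series_Dataset

# Hypothetical time series data: two sequences of different lengths
df = pd.DataFrame({'subject_id': [1, 1, 1, 2, 2],
                   'ts': [0, 1, 2, 0, 1],
                   'feat_a': [0.1, 0.2, 0.3, 0.4, 0.5],
                   'label': [0, 0, 1, 0, 1]})

# Sequence lengths are computed automatically from the id and timestamp
# columns, and each sequence is padded with padding_value on retrieval
dataset = Time_Series_Dataset(df, padding_value=999999)
print(len(dataset))    # 2, one item per sequence
x_t, y_t = dataset[0]  # padded features and labels of the first sequence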