Module data_utils.search_explore
Expand source code
import dask.dataframe as dd # Dask to handle big data in dataframes
import numpy as np # NumPy to handle numeric and NaN operations
import numbers # numbers allows to check if data is numeric
import warnings # Print warnings for bad practices
from . import utils # Generic and useful methods
import data_utils as du
# Pandas to handle the data in dataframes
if du.use_modin is True:
import modin.pandas as pd
else:
import pandas as pd
# Ignore Dask's 'meta' warning
warnings.filterwarnings("ignore", message="`meta` is not specified, inferred from partial data. Please provide `meta` if the result is unexpected.")
# Methods
def dataframe_missing_values(df, column=None):
    '''Returns a dataframe with the percentages of missing values of every column
    of the original dataframe.

    Parameters
    ----------
    df : pandas.DataFrame or dask.DataFrame
        Original dataframe which the user wants to analyze for missing values.
    column : string, default None
        Optional argument which, if provided, makes the function only return
        the percentage of missing values in the specified column.

    Returns
    -------
    missing_value_df : pandas.DataFrame or dask.DataFrame
        DataFrame containing the percentages of missing values for each column.
    col_percent_missing : float
        If the "column" argument is provided, the function only returns a float
        corresponding to the percentage of missing values in the specified column.
    '''
    if column is None:
        columns = df.columns
        percent_missing = df.isnull().sum() * 100 / len(df)
        if isinstance(df, dd.DataFrame):
            # Make sure that the values are computed, in case we're using Dask
            percent_missing = percent_missing.compute()
        missing_value_df = pd.DataFrame({'column_name': columns,
                                         'percent_missing': percent_missing})
        missing_value_df.sort_values('percent_missing', inplace=True)
        return missing_value_df
    else:
        col_percent_missing = df[column].isnull().sum() * 100 / len(df)
        if isinstance(df, dd.DataFrame):
            # Make sure that the value is computed, in case we're using Dask,
            # so this branch returns a float just like the branch above
            # returns computed values (previously a lazy scalar leaked out)
            col_percent_missing = col_percent_missing.compute()
        return col_percent_missing
def is_boolean_column(df, column, n_unique_values=None):
    '''Checks if a given column is one hot encoded.

    Parameters
    ----------
    df : pandas.DataFrame or dask.DataFrame
        Dataframe that will be used, which contains the specified column.
    column : string
        Name of the column that will be checked for boolean format.
    n_unique_values : int, default None
        Number of the column's unique values. If not specified, it will
        be automatically calculated.

    Returns
    -------
    bool
        Returns true if the column is in boolean format.
        Otherwise, returns false.
    '''
    # A pandas nullable boolean dtype is trivially a boolean column
    if str(df[column].dtype) == 'boolean':
        return True
    if n_unique_values is None:
        # Calculate the number of unique values
        n_unique_values = df[column].nunique()
        if isinstance(df, dd.DataFrame):
            # Make sure that the number of unique values are computed, in case we're using Dask
            n_unique_values = n_unique_values.compute()
    # Check if it only has, at most, 2 possible values
    if n_unique_values <= 2:
        unique_values = df[column].unique()
        if isinstance(df, dd.DataFrame):
            # Make sure that the unique values are computed, in case we're using Dask
            unique_values = unique_values.compute()
        # Remove NaNs from the list of unique values
        unique_values = [val for val in unique_values if not utils.is_num_nan(val)]
        # Check if the possible values are all numeric
        # (plain Python bools and NumPy bools count, since False == 0 and True == 1)
        if (all([isinstance(x, numbers.Number) for x in unique_values])
                or all([isinstance(x, bool) or isinstance(x, np.bool_) for x in unique_values])):
            # Check if the only possible values are 0 and 1 (and ignore NaN's)
            # NOTE(review): NaNs were already stripped above via utils.is_num_nan,
            # so the nan_to_num call and the str 'nan' filter below look redundant;
            # kept byte-identical to preserve behavior.
            unique_values = list(set(np.nan_to_num(unique_values)))
            unique_values.sort()
            unique_values = [val for val in unique_values if str(val).lower() != 'nan' ]
            if ((n_unique_values == 2 and unique_values == [0, 1])
                    or (n_unique_values == 1 and unique_values == [0])
                    or (n_unique_values == 1 and unique_values == [1])):
                return True
    return False
def list_boolean_columns(df, search_by_dtypes=False):
    '''Lists the columns in a dataframe which are in a boolean format.

    Parameters
    ----------
    df : pandas.DataFrame or dask.DataFrame
        Dataframe that will be checked for one hot encoded columns.
    search_by_dtypes : bool, default False
        If set to True, the method will only look for boolean columns based on
        their data type. This is only reliable if all the columns' data types
        have been properly set.

    Returns
    -------
    list of strings
        Returns a list of the column names which correspond to one hot encoded columns.
    '''
    if search_by_dtypes is True:
        # Trust the declared dtypes only; no value inspection
        bool_cols = []
        for col in df.columns:
            if str(df[col].dtype) == 'boolean' or df[col].dtype == 'UInt8':
                bool_cols.append(col)
        return bool_cols
    # Count every column's unique values in a single pass
    unique_counts = df.nunique()
    if isinstance(df, dd.DataFrame):
        # Trigger the lazy computation, in case we're using Dask
        unique_counts = unique_counts.compute()
    if unique_counts.min() > 2:
        # No column has 2 or fewer unique values, so none can be binary
        return []
    # Inspect each column's actual values, reusing the precomputed counts
    return [col for col in df.columns
            if is_boolean_column(df, col, unique_counts[col])]
def find_col_idx(df, feature):
    '''Find the index that corresponds to a given feature's column number on
    a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame or dask.DataFrame
        Dataframe on which to search for the feature idx
    feature : string
        Name of the feature whose index we want to find.

    Returns
    -------
    idx : int
        Index where the specified feature appears in the dataframe.'''
    # Delegate the lookup to the columns index itself
    feature_idx = df.columns.get_loc(feature)
    return feature_idx
def find_val_idx(data, value, column=None):
    '''Find the index that corresponds to a given unique value in a data tensor.

    Parameters
    ----------
    data : torch.Tensor
        PyTorch tensor containing the data on which the desired value will
        be searched for.
    value : numeric
        Unique value whose index on the data tensor one wants to find out.
    column : int, default None
        The number of the column in the data tensor that will be searched.

    Returns
    -------
    idx : int, numpy.ndarray or None
        Index where the specified value appears in the data tensor (a NumPy
        array of indices if there are multiple matches, None if there are
        none).'''
    if len(data.shape) == 1:
        val = (data == value).nonzero().squeeze()
    elif column is not None:
        if len(data.shape) == 2:
            val = (data[:, column] == value).nonzero().squeeze()
        elif len(data.shape) == 3:
            val = (data[:, :, column] == value).nonzero().squeeze()
        else:
            raise Exception(f'ERROR: Currently this method only supports up to three-dimensional data. User submitted data with {len(data.shape)} dimensions.')
    else:
        raise Exception('ERROR: If multidimensional data is being used, the column to search for must be specified in the `column` parameter.')
    # BUG FIX: a single match squeezes down to a 0-dimensional tensor, on which
    # both `len(val)` and `val.shape[0]` raise; use dim()/numel(), which are
    # safe for any rank, so the unique-match case returns a scalar as intended.
    if val.dim() == 0:
        # Single scalar match; convert to a Python scalar
        return val.item()
    if val.dim() > 1 or val.numel() > 1:
        # Convert to a NumPy array
        return val.numpy()
    if val.numel() == 0:
        # No results found
        return None
    # One-element, one-dimensional result; convert to a Python scalar
    return val.item()
def find_subject_idx(data, subject_id, subject_id_col=0):
    '''Find the index that corresponds to a given subject in a data tensor.

    Parameters
    ----------
    data : torch.Tensor
        PyTorch tensor containing the data on which the subject's index will be
        searched for.
    subject_id : int or string
        Unique identifier of the subject whose index on the data tensor one
        wants to find out.
    subject_id_col : int, default 0
        The number of the column in the data tensor that stores the subject
        identifiers.

    Returns
    -------
    idx : int
        Index where the specified subject appears in the data tensor.'''
    # Only the first timestamp of each sequence is inspected, presumably
    # because the subject identifier is constant along the sequence
    ids_per_sequence = data[:, 0, subject_id_col]
    match = (ids_per_sequence == subject_id).nonzero()
    # .item() assumes exactly one matching sequence (unique identifier)
    return match.item()
def find_seq_len(labels, padding_value=999999):
    '''Find the lengths of the sequences based on the padding values present in
    a labels tensor.

    Parameters
    ----------
    labels : torch.Tensor
        PyTorch tensor containing the data on which the subject's index will be
        searched for.
    padding_value : numeric, default 999999
        Value to use in the padding, to fill the sequences.

    Returns
    -------
    seq_lengths : torch.Tensor
        List of sequence lengths, relative to the input data.'''
    # Find when in each sequence the padding starts
    # (the running count of padding values equals 1 exactly at the first
    # padding position of each row; assumes padding runs contiguously to the
    # end of the row — TODO confirm with callers)
    padding_start = ((labels == padding_value).cumsum(dim=1) == 1)
    # Detect the sequence lengths
    # (the column index of the first padding value IS the sequence length;
    # rows with no padding at all contribute no entry here)
    seq_lengths = padding_start.nonzero()[:, 1]
    if len(seq_lengths) < labels.shape[0]:
        # Assign the maximum sequence length to the sequences that aren't padded
        # (unpadded rows get the full width labels.shape[1]; the remaining
        # zero slots are then filled, in row order, with the lengths found above)
        all_zero_seq_len = (padding_start.sum(dim=1) == 0) * labels.shape[1]
        all_zero_seq_len[all_zero_seq_len == 0] = seq_lengths
        seq_lengths = all_zero_seq_len
    return seq_lengths
def find_row_contains_word(df, feature, words):
    '''Find if each row in a specified dataframe string feature contains some
    word from a list.

    Parameters
    ----------
    df : pandas.DataFrame or dask.DataFrame
        Dataframe containing the feature on which to run the words search.
    feature : string
        Name of the feature through which the method will search if strings
        contain any of the specified words.
    words : list of strings or string
        List of the words to search for in the feature's rows.

    Returns
    -------
    row_contains_word : pandas.Series or dask.Series
        Boolean series indicating for each row of the dataframe if its specified
        feature contains any of the words that the user is looking for.'''
    if not df[feature].dtype == 'object':
        raise Exception(f'ERROR: The specified feature should have type "object", not type {df[feature].dtype}.')
    if isinstance(words, str):
        # Normalize a single word into a one-element list
        words = [words]
    if any([not isinstance(word, str) for word in words]):
        raise Exception('ERROR: All words in the specified words list should be strings.')

    def _contains_any(row):
        # Case-insensitive substring search of every requested word
        return any([word.lower() in row.lower() for word in words])

    if isinstance(df, dd.DataFrame):
        return df[feature].apply(_contains_any, meta=('row', bool))
    if isinstance(df, pd.DataFrame):
        return df[feature].apply(_contains_any)
    raise Exception(f'ERROR: `df` should either be a Pandas or Dask dataframe, not {type(df)}.')
def get_element(x, n, till_the_end=False):
    '''Try to get an element from a list. Useful for error-tolerant apply and
    map dataframe operations.

    Parameters
    ----------
    x : list or numpy.ndarray
        List from which to get an element.
    n : int
        Index of the element from the list that we want to retrieve.
    till_the_end : bool, default False
        If set to true, all elements from index n until the end of the list will
        be fetched. Otherwise, the method only returns the n'th element.

    Returns
    -------
    y : anything
        Returns the n'th element of the list or NaN if it's not found.
    '''
    try:
        return x[n:] if till_the_end is True else x[n]
    except Exception:
        # Out-of-range index (or unindexable input): signal a missing value
        return np.nan
def get_element_from_split(orig_string, n, separator='|', till_the_end=False):
    '''Split a string by a specified separator and return the n'th element of
    the obtained list of words.

    Parameters
    ----------
    orig_string : string
        Original string on which to apply the splitting and element retrieval.
    n : int
        The index of the element to return from the post-split list of words.
    separator : string, default '|'
        Symbol that concatenates each string's words, which will be used in the
        splitting.
    till_the_end : bool, default False
        If set to true, all elements from index n until the end of the list will
        be fetched. Otherwise, the method only returns the n'th element.

    Returns
    -------
    n_element : string
        The n'th element from the split string.
    '''
    # Break the original string into its individual words
    words = orig_string.split(separator)
    # Fetch the requested element (or tail of the list), falling back to NaN
    # when the index is out of range
    try:
        n_element = words[n:] if till_the_end is True else words[n]
    except Exception:
        n_element = np.nan
    if till_the_end is True:
        # Glue the tail back together with the original separator
        n_element = separator.join(n_element)
    return n_element
Functions
def dataframe_missing_values(df, column=None)
-
Returns a dataframe with the percentages of missing values of every column of the original dataframe.
Parameters
df
:pandas.DataFrame
ordask.DataFrame
- Original dataframe which the user wants to analyze for missing values.
column
:string
, defaultNone
- Optional argument which, if provided, makes the function only return the percentage of missing values in the specified column.
Returns
missing_value_df
:pandas.DataFrame
ordask.DataFrame
- DataFrame containing the percentages of missing values for each column.
col_percent_missing
:float
- If the "column" argument is provided, the function only returns a float corresponfing to the percentage of missing values in the specified column.
Expand source code
def dataframe_missing_values(df, column=None): '''Returns a dataframe with the percentages of missing values of every column of the original dataframe. Parameters ---------- df : pandas.DataFrame or dask.DataFrame Original dataframe which the user wants to analyze for missing values. column : string, default None Optional argument which, if provided, makes the function only return the percentage of missing values in the specified column. Returns ------- missing_value_df : pandas.DataFrame or dask.DataFrame DataFrame containing the percentages of missing values for each column. col_percent_missing : float If the "column" argument is provided, the function only returns a float corresponfing to the percentage of missing values in the specified column. ''' if column is None: columns = df.columns percent_missing = df.isnull().sum() * 100 / len(df) if isinstance(df, dd.DataFrame): # Make sure that the values are computed, in case we're using Dask percent_missing = percent_missing.compute() missing_value_df = pd.DataFrame({'column_name': columns, 'percent_missing': percent_missing}) missing_value_df.sort_values('percent_missing', inplace=True) return missing_value_df else: col_percent_missing = df[column].isnull().sum() * 100 / len(df) return col_percent_missing
def find_col_idx(df, feature)
-
Find the index that corresponds to a given feature's column number on a dataframe.
Parameters
df
:pandas.DataFrame
ordask.DataFrame
- Dataframe on which to search for the feature idx
feature
:string
- Name of the feature whose index we want to find.
Returns
idx
:int
- Index where the specified feature appears in the dataframe.
Expand source code
def find_col_idx(df, feature): '''Find the index that corresponds to a given feature's column number on a dataframe. Parameters ---------- df : pandas.DataFrame or dask.DataFrame Dataframe on which to search for the feature idx feature : string Name of the feature whose index we want to find. Returns ------- idx : int Index where the specified feature appears in the dataframe.''' return df.columns.get_loc(feature)
def find_row_contains_word(df, feature, words)
-
Find if each row in a specified dataframe string feature contains some word from a list.
Parameters
df
:pandas.DataFrame
ordask.DataFrame
- Dataframe containing the feature on which to run the words search.
feature
:string
- Name of the feature through which the method will search if strings contain any of the specified words.
words
:list
ofstrings
orstring
- List of the words to search for in the feature's rows.
Returns
row_contains_word
:pandas.Series
ordask.Series
- Boolean series indicating for each row of the dataframe if its specified feature contains any of the words that the user is looking for.
Expand source code
def find_row_contains_word(df, feature, words): '''Find if each row in a specified dataframe string feature contains some word from a list. Parameters ---------- df : pandas.DataFrame or dask.DataFrame Dataframe containing the feature on which to run the words search. feature : string Name of the feature through which the method will search if strings contain any of the specified words. words : list of strings or string List of the words to search for in the feature's rows. Returns ------- row_contains_word : pandas.Series or dask.Series Boolean series indicating for each row of the dataframe if its specified feature contains any of the words that the user is looking for.''' row_contains_word = None if not df[feature].dtype == 'object': raise Exception(f'ERROR: The specified feature should have type "object", not type {df[feature].dtype}.') if isinstance(words, str): # Make sure that the words are in a list format, even if it's just one word words = [words] if any([not isinstance(word, str) for word in words]): raise Exception('ERROR: All words in the specified words list should be strings.') if isinstance(df, dd.DataFrame): row_contains_word = df[feature].apply(lambda row: any([word.lower() in row.lower() for word in words]), meta=('row', bool)) elif isinstance(df, pd.DataFrame): row_contains_word = df[feature].apply(lambda row: any([word.lower() in row.lower() for word in words])) else: raise Exception(f'ERROR: `df` should either be a Pandas or Dask dataframe, not {type(df)}.') return row_contains_word
def find_seq_len(labels, padding_value=999999)
-
Find the lengths of the sequences based on the padding values present in a labels tensor.
Parameters
labels
:torch.Tensor
- PyTorch tensor containing the data on which the subject's index will be searched for.
padding_value
:numeric
, default999999
- Value to use in the padding, to fill the sequences.
Returns
seq_lengths
:torch.Tensor
- List of sequence lengths, relative to the input data.
Expand source code
def find_seq_len(labels, padding_value=999999): '''Find the lengths of the sequences based on the padding values present in a labels tensor. Parameters ---------- labels : torch.Tensor PyTorch tensor containing the data on which the subject's index will be searched for. padding_value : numeric, default 999999 Value to use in the padding, to fill the sequences. Returns ------- seq_lengths : torch.Tensor List of sequence lengths, relative to the input data.''' # Find when in each sequence the padding starts padding_start = ((labels == padding_value).cumsum(dim=1) == 1) # Detect the sequence lengths seq_lengths = padding_start.nonzero()[:, 1] if len(seq_lengths) < labels.shape[0]: # Assign the maximum sequence length to the sequences that aren't padded all_zero_seq_len = (padding_start.sum(dim=1) == 0) * labels.shape[1] all_zero_seq_len[all_zero_seq_len == 0] = seq_lengths seq_lengths = all_zero_seq_len return seq_lengths
def find_subject_idx(data, subject_id, subject_id_col=0)
-
Find the index that corresponds to a given subject in a data tensor.
Parameters
data
:torch.Tensor
- PyTorch tensor containing the data on which the subject's index will be searched for.
subject_id
:int
orstring
- Unique identifier of the subject whose index on the data tensor one wants to find out.
subject_id_col
:int
, default0
- The number of the column in the data tensor that stores the subject identifiers.
Returns
idx
:int
- Index where the specified subject appears in the data tensor.
Expand source code
def find_subject_idx(data, subject_id, subject_id_col=0): '''Find the index that corresponds to a given subject in a data tensor. Parameters ---------- data : torch.Tensor PyTorch tensor containing the data on which the subject's index will be searched for. subject_id : int or string Unique identifier of the subject whose index on the data tensor one wants to find out. subject_id_col : int, default 0 The number of the column in the data tensor that stores the subject identifiers. Returns ------- idx : int Index where the specified subject appears in the data tensor.''' return (data[:, 0, subject_id_col] == subject_id).nonzero().item()
def find_val_idx(data, value, column=None)
-
Find the index that corresponds to a given unique value in a data tensor.
Parameters
data
:torch.Tensor
- PyTorch tensor containing the data on which the desired value will be searched for.
value
:numeric
- Unique value whose index on the data tensor one wants to find out.
column
:int
, defaultNone
- The number of the column in the data tensor that will be searched.
Returns
idx
:int
- Index where the specified value appears in the data tensor.
Expand source code
def find_val_idx(data, value, column=None): '''Find the index that corresponds to a given unique value in a data tensor. Parameters ---------- data : torch.Tensor PyTorch tensor containing the data on which the desired value will be searched for. value : numeric Unique value whose index on the data tensor one wants to find out. column : int, default None The number of the column in the data tensor that will be searched. Returns ------- idx : int Index where the specified value appears in the data tensor.''' if len(data.shape) == 1: val = (data == value).nonzero().squeeze() elif column is not None: if len(data.shape) == 2: val = (data[:, column] == value).nonzero().squeeze() elif len(data.shape) == 3: val = (data[:, :, column] == value).nonzero().squeeze() else: raise Exception(f'ERROR: Currently this method only supports up to tree-dimensional data. User submitted data with {len(data.shape)} dimensions.') else: raise Exception('ERROR: If multidimensional data is being used, the column to search for must be specified in the `column` parameter.') if len(val.shape) > 1 or len(val) > 1: # Convert to a NumPy array return val.numpy() else: if val.shape[0] == 0: # No results found return None else: # Convert to a Python scalar return val.item()
def get_element(x, n, till_the_end=False)
-
Try to get an element from a list. Useful for error-tolerant apply and map dataframe operations.
Parameters
x
:list
ornumpy.ndarray
- List from which to get an element.
n
:int
- Index of the element from the list that we want to retrieve.
till_the_end
:bool
, defaultFalse
- If set to true, all elements from index n until the end of the list will be fetched. Otherwise, the method only returns the n'th element.
Returns
y
:anything
- Returns the n'th element of the list or NaN if it's not found.
Expand source code
def get_element(x, n, till_the_end=False): '''Try to get an element from a list. Useful for nagging apply and map dataframe operations. Parameters ---------- x : list or numpy.ndarray List from which to get an element. n : int Index of the element from the list that we want to retrieve. till_the_end : bool, default False If set to true, all elements from index n until the end of the list will be fetched. Otherwise, the method only returns the n'th element. Returns ------- y : anything Returns the n'th element of the list or NaN if it's not found. ''' try: if till_the_end is True: return x[n:] else: return x[n] except Exception: return np.nan
def get_element_from_split(orig_string, n, separator='|', till_the_end=False)
-
Split a string by a specified separator and return the n'th element of the obtained list of words.
Parameters
orig_string
:string
- Original string on which to apply the splitting and element retrieval.
n
:int
- The index of the element to return from the post-split list of words.
separator
:string
, default'|'
- Symbol that concatenates each string's words, which will be used in the splitting.
till_the_end
:bool
, defaultFalse
- If set to true, all elements from index n until the end of the list will be fetched. Otherwise, the method only returns the n'th element.
Returns
n_element
:string
- The n'th element from the split string.
Expand source code
def get_element_from_split(orig_string, n, separator='|', till_the_end=False): '''Split a string by a specified separator and return the n'th element of the obtained list of words. Parameters ---------- orig_string : string Original string on which to apply the splitting and element retrieval. n : int The index of the element to return from the post-split list of words. separator : string, default '|' Symbol that concatenates each string's words, which will be used in the splitting. till_the_end : bool, default False If set to true, all elements from index n until the end of the list will be fetched. Otherwise, the method only returns the n'th element. Returns ------- n_element : string The n'th element from the split string. ''' # Split the string, by the specified separator, to get the list of all words split_list = orig_string.split(separator) # Get the n'th element of the list n_element = get_element(split_list, n, till_the_end) if till_the_end is True: # Rejoin the elements of the list by their separator n_element = separator.join(n_element) return n_element
def is_boolean_column(df, column, n_unique_values=None)
-
Checks if a given column is one hot encoded.
Parameters
df
:pandas.DataFrame
ordask.DataFrame
- Dataframe that will be used, which contains the specified column.
column
:string
- Name of the column that will be checked for boolean format.
n_unique_values
:int
, defaultNone
- Number of the column's unique values. If not specified, it will be automatically calculated.
Returns
bool
- Returns true if the column is in boolean format. Otherwise, returns false.
Expand source code
def is_boolean_column(df, column, n_unique_values=None): '''Checks if a given column is one hot encoded. Parameters ---------- df : pandas.DataFrame or dask.DataFrame Dataframe that will be used, which contains the specified column. column : string Name of the column that will be checked for boolean format. n_unique_values : int, default None Number of the column's unique values. If not specified, it will be automatically calculated. Returns ------- bool Returns true if the column is in boolean format. Otherwise, returns false. ''' if str(df[column].dtype) == 'boolean': return True if n_unique_values is None: # Calculate the number of unique values n_unique_values = df[column].nunique() if isinstance(df, dd.DataFrame): # Make sure that the number of unique values are computed, in case we're using Dask n_unique_values = n_unique_values.compute() # Check if it only has, at most, 2 possible values if n_unique_values <= 2: unique_values = df[column].unique() if isinstance(df, dd.DataFrame): # Make sure that the unique values are computed, in case we're using Dask unique_values = unique_values.compute() # Remove NaNs from the list of unique values unique_values = [val for val in unique_values if not utils.is_num_nan(val)] # Check if the possible values are all numeric if (all([isinstance(x, numbers.Number) for x in unique_values]) or all([isinstance(x, bool) or isinstance(x, np.bool_) for x in unique_values])): # Check if the only possible values are 0 and 1 (and ignore NaN's) unique_values = list(set(np.nan_to_num(unique_values))) unique_values.sort() unique_values = [val for val in unique_values if str(val).lower() != 'nan' ] if ((n_unique_values == 2 and unique_values == [0, 1]) or (n_unique_values == 1 and unique_values == [0]) or (n_unique_values == 1 and unique_values == [1])): return True return False
def list_boolean_columns(df, search_by_dtypes=False)
-
Lists the columns in a dataframe which are in a boolean format.
Parameters
df
:pandas.DataFrame
ordask.DataFrame
- Dataframe that will be checked for one hot encoded columns.
search_by_dtypes
:bool
, defaultFalse
- If set to True, the method will only look for boolean columns based on their data type. This is only reliable if all the columns' data types have been properly set.
Returns
list
ofstrings
- Returns a list of the column names which correspond to one hot encoded columns.
Expand source code
def list_boolean_columns(df, search_by_dtypes=False): '''Lists the columns in a dataframe which are in a boolean format. Parameters ---------- df : pandas.DataFrame or dask.DataFrame Dataframe that will be used checked for one hot encoded columns. search_by_dtypes : bool, default False If set to True, the method will only look for boolean columns based on their data type. This is only reliable if all the columns' data types have been properly set. Returns ------- list of strings Returns a list of the column names which correspond to one hot encoded columns. ''' if search_by_dtypes is True: return [col for col in df.columns if str(df[col].dtype) == 'boolean' or df[col].dtype == 'UInt8'] else: # Calculate the columns' number of unique values n_unique_values = df.nunique() if isinstance(df, dd.DataFrame): # Make sure that the number of unique values are computed, in case we're using Dask n_unique_values = n_unique_values.compute() if n_unique_values.min() > 2: # If there are no columns with just 2 unique values, then there are no binary columns return [] else: return [col for col in df.columns if is_boolean_column(df, col, n_unique_values[col])]