Module data_utils.utils
Expand source code
from comet_ml import Experiment # Comet.ml can log training metrics, parameters, do version control and parameter optimization
import torch # PyTorch to create and apply deep learning models
import numpy as np # NumPy to handle numeric and NaN operations
from tqdm.auto import tqdm # tqdm allows to track code execution progress
import numbers # numbers allows to check if data is numeric
import warnings # Print warnings for bad practices
import itertools # Flatten lists
import data_utils as du
# Pandas to handle the data in dataframes
if du.use_modin is True:
import modin.pandas as pd
else:
import pandas as pd
# Methods
def apply_dict_convertion(x, conv_dict, nan_value=0):
    '''Safely apply a conversion through a dictionary.

    Missing numeric values (as judged by `is_num_nan`) are mapped to
    `nan_value`; every other input is looked up in `conv_dict`.

    Parameters
    ----------
    x : anything
        Object that will be converted through the dictionary.
    conv_dict : dict
        Dictionary used to convert the input object.
    nan_value : anything, default 0
        Value or object that represents missingness.

    Returns
    -------
    x : anything
        Converted object.

    Raises
    ------
    KeyError
        If `x` is not a missing value and is not a key of `conv_dict`.
    '''
    # Only numeric objects are checked for missingness (NaN)
    if isinstance(x, numbers.Number) and is_num_nan(x):
        return nan_value
    # Must be a convertible value. This includes valid (non-NaN) numbers,
    # which previously fell through both branches and returned None.
    return conv_dict[x]
def invert_dict(x):
    '''Swap the keys and the values of a dictionary.

    Parameters
    ----------
    x : dict
        Dictionary to be inverted. Its values must be hashable, as they
        become the keys of the result.

    Returns
    -------
    x : dict
        Inverted dictionary. When several keys share the same value, the
        key that appears last in `x` is the one kept.
    '''
    # Pair each value with its key; later pairs overwrite earlier ones
    return dict(zip(x.values(), x.keys()))
def reverse(data):
    '''Reverse the order of a tensor or list.

    Parameters
    ----------
    data : torch.Tensor or list
        PyTorch tensor or list to revert. Tensors are reversed along their
        first dimension.

    Returns
    -------
    data : torch.Tensor or list
        Reversed tensor or list. The input object is not modified.
    '''
    if isinstance(data, torch.Tensor):
        # PyTorch tensors reject negative slice steps (`data[::-1]` raises),
        # so the first dimension is flipped explicitly instead
        return torch.flip(data, dims=[0])
    return data[::-1]
def replace_dict_strings(dct, str_to_replace='0', new_str='_', replace_keys=True,
                         replace_vals=True, inplace=False):
    '''Replace strings in a dictionary, in keys and/or values, with a new,
    desired string.

    Parameters
    ----------
    dct : dict
        Dictionary that will have its keys and/or values modified.
    str_to_replace : str, default '0'
        String to replace with a new one.
    new_str : str, default '_'
        String to replace the old one.
    replace_keys : bool, default True
        If set to True, the dictionary's keys will have their strings edited
        according to the string replacement set by the user.
    replace_vals : bool, default True
        If set to True, the dictionary's values will have their strings edited
        according to the string replacement set by the user.
    inplace : bool, default False
        If set to True, the original dictionary will be used and modified
        directly. Otherwise, a copy will be created and returned, without
        changing the original dictionary.

    Returns
    -------
    data_dct : dict
        Dictionary with the string replacements applied. Keys and values are
        compared through their `str()` form; the ones that contain
        `str_to_replace` are replaced by the edited string, all the others
        are kept untouched (including their original type).
    '''
    def _swap(item):
        # Only replace items whose text actually changes, so that untouched
        # non-string keys/values are not turned into strings
        text = str(item)
        replaced = text.replace(str_to_replace, new_str)
        return replaced if replaced != text else item

    # Build the edited content separately: renaming keys of a dictionary
    # while iterating over it raises a RuntimeError
    new_items = {}
    for key, val in dct.items():
        new_key = _swap(key) if replace_keys is True else key
        new_val = _swap(val) if replace_vals is True else val
        new_items[new_key] = new_val
    # Write into the original dictionary or into a copy of it, depending on
    # `inplace` (the copy keeps the type of the original dictionary)
    data_dct = dct if inplace else dct.copy()
    data_dct.clear()
    data_dct.update(new_items)
    return data_dct
def _merge_dict_pair(left, right):
    '''Merge two dictionaries into a new one, without modifying the inputs.'''
    # Values from the dictionary on the right win by default
    merged = {**left, **right}
    for key in left.keys() & right.keys():
        left_val, right_val = left[key], right[key]
        if isinstance(left_val, (list, set)) or isinstance(right_val, (list, set)):
            if isinstance(right_val, list):
                # Union of both collections, without duplicates, keeping
                # the order in which the elements first appear
                merged[key] = list(dict.fromkeys([*left_val, *right_val]))
            else:
                merged[key] = set(left_val) | set(right_val)
        else:
            warnings.warn(f"Found an overlapping key {key} when merging two dictionaries which, as it doesn't point to a list or a set, can't be merged. As such, the value from the dictionary on the right will be kept.")
    return merged


def merge_dicts(dict1, dict2=None):
    '''Merge two or more dictionaries into one. The second dictionary can
    overwrite the first one if there are overlapping keys.

    When an overlapping key points to a list or a set in at least one of the
    dictionaries, both values are combined into their union: a list if the
    value on the right is a list, a set otherwise. Any other overlapping key
    triggers a warning and keeps the value from the dictionary on the right.
    The input dictionaries are never modified.

    Parameters
    ----------
    dict1 : dict or list of dicts
        Dictionary 1 that will be merged with dictionary 2 or list of
        dictionaries that will be merged.
    dict2 : dict, default None
        Dictionary 2 that will be merged with dictionary 1. If not specified,
        the user must define a list of dictionaries in parameter `dict1` to merge.

    Returns
    -------
    dict3 : dict
        New dictionary resulting from the merge. An empty list of
        dictionaries results in an empty dictionary.

    Raises
    ------
    Exception
        If the arguments are not two dictionaries or a single list of
        dictionaries.
    '''
    if isinstance(dict1, dict):
        if dict2 is None:
            raise Exception('ERROR: When `dict1` is specified as a single dictionary, the second argument `dict2` must also be set.')
        if not isinstance(dict2, dict):
            raise Exception(f'ERROR: When `dict1` is specified as a single dictionary, the second argument `dict2` must also be a dictionary. Instead, received `dict2` of type {type(dict2)}.')
        return _merge_dict_pair(dict1, dict2)
    if isinstance(dict1, list) and dict2 is None:
        # Successively merge every dictionary on the list, from left to right
        new_dict = {}
        for position, current in enumerate(dict1):
            if not isinstance(current, dict):
                raise Exception(f'ERROR: When `dict1` is specified as a list, all of its elements must be dictionaries. Instead, received an element of type {type(current)} in position {position}.')
            new_dict = _merge_dict_pair(new_dict, current)
        return new_dict
    # The original code returned this exception object instead of raising it
    raise Exception(f'ERROR: The first parameter `dict1` must be set as either a dictionary or a list of dictionaries. Instead, received `dict1` of type {type(dict1)}.')
def merge_lists(lists):
    '''Flatten a collection of lists into a single list.

    Parameters
    ----------
    lists : list of lists
        List containing all the lists that we want to merge.

    Returns
    -------
    lists : list
        New list with the elements of all the input lists, in order. Only
        one level of nesting is flattened.
    '''
    return [item for sublist in lists for item in sublist]
def remove_from_list(data, to_remove, update_idx=False):
    '''Remove values from a list, optionally shifting the remaining values
    every time one is removed.

    Parameters
    ----------
    data : list
        Data list to update by removing specified values. It is modified
        directly.
    to_remove : list or int or float or str
        Values to remove from the list.
    update_idx : bool, default False
        If set to True, and if all the values to remove are integers, every
        remaining element that is larger than a removed value is decreased
        by 1 right after that removal (useful when the elements are indices).

    Returns
    -------
    data : list
        Updated data list (the same object received as input).
    '''
    # Wrap a single value in a list, so that it's handled like multiple ones
    if isinstance(to_remove, (int, float, str)):
        to_remove = [to_remove]
    # Shifting only makes sense when every removed value is an integer
    shift_values = all([isinstance(item, int) for item in to_remove]) and update_idx is True
    for target in to_remove:
        data.remove(target)
        if shift_values is True:
            for position, current in enumerate(data):
                if current > target:
                    # Decrease the value (which could be an index) by one
                    data[position] = current - 1
    return data
def is_definitely_string(x):
    '''Report whether a value is a string that can't be read as a number.

    Parameters
    ----------
    x
        Any value which will be judged to be either a real string or numeric.

    Returns
    -------
    boolean
        True if `x` is a string that `float()` fails to parse. False if it's
        numeric data, a string that represents a number (e.g. '1.5') or an
        object of any other type.
    '''
    if isinstance(x, (int, float)):
        return False
    try:
        float(x)
    except Exception:
        # Not parseable as a number: it's a real string only if it's a str
        return isinstance(x, str)
    else:
        return False
def is_num_nan(x):
    '''Indicate whether a value is a missing value representation.

    Parameters
    ----------
    x : int or float or string
        A value that will be compared with possible missing value
        representations.

    Returns
    -------
    boolean
        True if the lowercase text form of `x` is 'nan' or '<na>', False
        otherwise.
    '''
    return str(x).lower() in ('nan', '<na>')
def is_integer(x):
    '''Indicates if a number is an integer.

    Parameters
    ----------
    x : int or float or string
        A numeric value that will be checked if it's an integer.

    Returns
    -------
    boolean
        Returns a boolean, being it True if the number corresponds to an integer
        or False if it doesn't. Values that can't be read as a number
        (e.g. None or non numeric strings) result in False.
    '''
    if isinstance(x, int):
        # Native integers need no conversion; very large ones would even
        # overflow the conversion to float
        return True
    try:
        return float(x).is_integer()
    except (ValueError, TypeError):
        # Not a number at all
        return False
def is_string_nan(x, specific_nan_strings=None):
    '''Indicates if a string corresponds to a missing value.

    Parameters
    ----------
    x : string
        A string that will be compared with possible missing value
        representations.
    specific_nan_strings : list of strings, default None
        Parameter where the user can specify additional strings that
        should correspond to missing values. These are compared exactly
        (case sensitive). None means that there are no additional strings.

    Returns
    -------
    boolean
        Returns a boolean, being it True if the string corresponds to a missing
        value representation or False if it doesn't. Non string inputs
        trigger a warning and are considered missing values.
    '''
    # Only considering strings for the missing values search
    if not isinstance(x, str):
        warnings.warn(f'Found a non string value of type {type(x)}. As we\'re '
                      'expecting a string, any other format will be considered '
                      'a missing value.')
        return True
    if specific_nan_strings is None:
        specific_nan_strings = ()
    lowered = x.lower()
    # Short strings built around a NaN-like keyword, allowing for only a
    # few extra characters
    if (('other' in lowered and len(x) < 9)
            or ('null' in lowered and len(x) < 7)
            or ('discrepancy' in lowered and len(x) < 14)):
        return True
    if lowered == 'nan':
        return True
    # Empty strings or strings made exclusively of one filler character
    if any(not x.strip(filler) for filler in (' ', '_', '.')):
        return True
    # Expressions that signal missingness wherever they show up
    nan_expressions = ('unknown', 'not obtainable', 'not obtained',
                       'not applicable', 'not available', 'not evaluated')
    if any(expression in lowered for expression in nan_expressions):
        return True
    return x in specific_nan_strings
def get_full_number_string(x, decimal_digits=0):
    '''Get the full, fixed-point representation of a number in a string.

    Particularly useful for very large float values, which would otherwise
    be displayed in scientific notation.

    Parameters
    ----------
    x : float or double or int
        A numeric value that one wants to represent in a string,
        with all its digits visible.
    decimal_digits : int, default 0
        Number of decimal digits to include in the string. By default, the
        value is shown without decimals.

    Returns
    -------
    x : string
        The number in fixed-point notation, with `decimal_digits` decimals.
    '''
    format_spec = f'.{decimal_digits}f'
    return format(x, format_spec)
def in_ipynb():
    '''Detect if code is running in an IPython notebook, such as in Jupyter Lab.

    Returns
    -------
    boolean
        True if `get_ipython()` returns a ZMQ interactive shell, False if it
        returns anything else or if calling it fails.
    '''
    try:
        shell_type = str(type(get_ipython()))
    except Exception:
        # Not on IPython if get_ipython fails
        return False
    return shell_type == "<class 'ipykernel.zmqshell.ZMQInteractiveShell'>"
def iterations_loop(x, see_progress=True, desc=None, leave=True):
    '''Optionally wrap an iterable in a tqdm progress bar.

    Parameters
    ----------
    x : iterable
        Object that will be iterated over.
    see_progress : bool, default True
        If set to True, `x` is wrapped in a tqdm progress bar. Otherwise,
        `x` is returned as it is.
    desc : str, default None
        Passed on to tqdm as its `desc` argument.
    leave : bool, default True
        Passed on to tqdm as its `leave` argument.

    Returns
    -------
    x : iterable
        Either the tqdm wrapped iterable or the original one.
    '''
    if see_progress is not True:
        # Don't show any progress bar
        return x
    return tqdm(x, desc=desc, leave=leave)
def convert_dtypes(df, dtypes=None, inplace=False):
    '''Converts a dataframe's data types to the desired ones.

    Parameters
    ----------
    df : pandas.DataFrame or dask.DataFrame or modin.pandas.DataFrame
        Original dataframe which will be converted.
    dtypes : dict, default None
        Dictionary that indicates the desired dtype for each column.
        e.g. {'Var1': 'float64', 'Var2': 'UInt8', 'Var3': str}
        Keys that don't match a column name, neither directly nor in
        lowercase, are ignored. If None or empty, no conversion is done.
    inplace : bool, default False
        If set to False, a copy of the dataframe is made before converting.
        Note that the conversion itself always creates a new dataframe, so
        the result must be read from the returned value in either case.

    Returns
    -------
    df : pandas.DataFrame or dask.DataFrame or modin.pandas.dataframe.DataFrame
        Converted dataframe, in the desired data type.
    '''
    # Avoid the copy only when the user explicitly allows it
    data_df = df if inplace else df.copy()
    if not dtypes:
        # Nothing to convert
        return data_df
    # Only use the dictionary keys that correspond to column names in the current dataframe
    df_columns = set(data_df.columns)
    dtype_dict = dict()
    for key, val in dtypes.items():
        if key in df_columns:
            dtype_dict[key] = val
        elif isinstance(key, str) and key.lower() in df_columns:
            dtype_dict[key.lower()] = val
    try:
        # Set the desired dtypes
        data_df = data_df.astype(dtype_dict)
    except Exception:
        warnings.warn('Failed to assign the desired data types. Replacing all <NA> values with Numpy NaN and trying again.')
        # Replace the '<NA>' objects with NumPy's NaN, one column at a time
        # (elementwise `Series.map`, as `DataFrame.applymap` is deprecated)
        data_df = data_df.apply(
            lambda column: column.map(lambda x: x if not is_num_nan(x) else np.nan)
        )
        # Set the desired dtypes
        data_df = data_df.astype(dtype_dict)
    return data_df
def convert_dataframe(df, to='pandas', return_library=True, dtypes=None):
    '''Convert a dataframe to the desired dataframe library format.

    The new dataframe is built from the values (`df.to_numpy()`) and the
    column names of the original one, and the chosen library is registered
    through `du.set_pandas_library`.

    Parameters
    ----------
    df : pandas.DataFrame or dask.DataFrame or modin.pandas.DataFrame
        Original dataframe which will be converted.
    to : string, default 'pandas'
        The data library to which format the dataframe will be converted to.
        Either 'pandas' or 'modin' (case insensitive).
    return_library : bool, default True
        If set to True, the new dataframe library is also returned as an output.
    dtypes : dict, default None
        Dictionary that indicates the desired dtype for each column.
        e.g. {'Var1': 'float64', 'Var2': 'UInt8', 'Var3': str}
        If None, the dtypes are inferred with `infer_objects`.

    Returns
    -------
    df : pandas.DataFrame or modin.pandas.dataframe.DataFrame
        Converted dataframe, in the desired type.

    If return_library == True:

    new_pd : pandas or modin.pandas
        The dataframe library to which the input dataframe is converted to.

    Raises
    ------
    Exception
        If `to` is neither 'pandas' nor 'modin'.
    '''
    lib = str(to).lower()
    # Reject unsupported libraries before importing anything
    if lib not in ('pandas', 'modin'):
        raise Exception(f'ERROR: Currently, convertion to a dataframe of type {to} is not supported. Availabale options are "pandas" and "modin".')
    if lib == 'modin':
        import modin.pandas as new_pd
    else:
        import pandas as new_pd
    converted_df = new_pd.DataFrame(data=df.to_numpy(), columns=df.columns)
    du.set_pandas_library(lib)
    if dtypes is not None:
        # Set the desired dtypes
        converted_df = convert_dtypes(converted_df, dtypes=dtypes, inplace=True)
    else:
        # Infer adequate dtypes for the dataframe's columns
        converted_df = converted_df.infer_objects()
    return (converted_df, new_pd) if return_library is True else converted_df
def convert_pyarrow_dtypes(df, inplace=False):
    '''Convert a dataframe's data types to a pyarrow supported version.

    Columns with the dtypes 'UInt8', 'UInt16', 'UInt32', 'Int8', 'Int16',
    'Int32' or 'boolean' are converted to 'float32'; 'UInt64' and 'Int64'
    to 'float64'; 'string' to `str`. All other columns keep their dtype.

    Parameters
    ----------
    df : pandas.DataFrame or dask.DataFrame or modin.pandas.DataFrame
        Original dataframe which will have its data types converted.
    inplace : bool, default False
        If set to False, a copy of the dataframe is made before converting.
        The converted dataframe is always the returned one.

    Returns
    -------
    df : pandas.DataFrame or dask.DataFrame or modin.pandas.dataframe.DataFrame
        Converted dataframe, in pyarrow compatible data types.
    '''
    data_df = df if inplace else df.copy()
    # Lookup table from the dtype names being replaced to the similar
    # dtypes that are used instead
    replacements = dict.fromkeys(
        ('UInt8', 'UInt16', 'UInt32', 'Int8', 'Int16', 'Int32', 'boolean'),
        'float32',
    )
    replacements.update(dict.fromkeys(('UInt64', 'Int64'), 'float64'))
    replacements['string'] = str
    # Columns whose dtype name isn't in the table keep their current dtype
    dtype_dict = {
        column: replacements.get(str(current), current)
        for column, current in dict(data_df.dtypes).items()
    }
    # Apply the new data types
    return data_df.astype(dtype_dict)
Functions
def apply_dict_convertion(x, conv_dict, nan_value=0)
-
Safely apply a conversion through a dictionary.
Parameters
x
:anything
- Object that will be converted through the dictionary.
conv_dict
:dict
- Dictionary used to convert the input object.
nan_value
:anything
- Value or object that represents missingness.
Returns
x
:anything
- Converted object.
Expand source code
def apply_dict_convertion(x, conv_dict, nan_value=0): '''Safely apply a convertion through a dictionary. Parameters ---------- x : anything Object that will be converted through the dictionary. conv_dict : dict Dictionary used to convert the input object. nan_value: anything Value or object that repressents missingness. Returns ------- x : anything Converted object. ''' # Check if it's a missing value (NaN) if isinstance(x, numbers.Number): if is_num_nan(x): return nan_value # Must be a convertable value else: return conv_dict[x]
def convert_dataframe(df, to='pandas', return_library=True, dtypes=None)
-
Converts a dataframe to the desired dataframe library format.
Parameters
df
: pandas.DataFrame or dask.DataFrame or modin.pandas.DataFrame
- Original dataframe which will be converted.
to
:string
, default'pandas'
- The data library to which format the dataframe will be converted to.
return_library
:bool
, defaultTrue
- If set to True, the new dataframe library is also returned as an output.
dtypes
:dict
, defaultNone
- Dictionary that indicates the desired dtype for each column. e.g. {'Var1': 'float64', 'Var2': 'UInt8', 'Var3': str}
Returns
df
: pandas.DataFrame or dask.DataFrame or modin.pandas.dataframe.DataFrame
- Converted dataframe, in the desired type.
If return_library == True:
new_pd
: pandas or modin.pandas
- The dataframe library to which the input dataframe is converted.
Expand source code
def convert_dataframe(df, to='pandas', return_library=True, dtypes=None): '''Converts a dataframe to the desired dataframe library format. Parameters ---------- df : pandas.DataFrame or dask.DataFrame or modin.pandas.DataFrame Original dataframe which will be converted. to : string, default 'pandas' The data library to which format the dataframe will be converted to. return_library : bool, default True If set to True, the new dataframe library is also returned as an output. dtypes : dict, default None Dictionary that indicates the desired dtype for each column. e.g. {'Var1': 'float64', 'Var2': 'UInt8', 'Var3': str} Returns ------- df : pandas.DataFrame or dask.DataFrame or modin.pandas.dataframe.DataFrame Converted dataframe, in the desired type. If return_library == True: new_pd : pandas or modin.pandas The dataframe library to which the input dataframe is converted to. ''' lib = str(to).lower() if lib == 'pandas': import pandas as new_pd elif lib == 'modin': import modin.pandas as new_pd else: raise Exception(f'ERROR: Currently, convertion to a dataframe of type {to} is not supported. Availabale options are "pandas" and "modin".') converted_df = new_pd.DataFrame(data=df.to_numpy(), columns=df.columns) du.set_pandas_library(lib) if dtypes is None: # Infer adequate dtypes for the dataframe's columns converted_df = converted_df.infer_objects() else: # Set the desired dtypes converted_df = convert_dtypes(converted_df, dtypes=dtypes, inplace=True) if return_library is True: return converted_df, new_pd else: return converted_df
def convert_dtypes(df, dtypes=None, inplace=False)
-
Converts a dataframe's data types to the desired ones.
Parameters
df
: pandas.DataFrame or dask.DataFrame or modin.pandas.DataFrame
- Original dataframe which will be converted.
dtypes
:dict
, defaultNone
- Dictionary that indicates the desired dtype for each column. e.g. {'Var1': 'float64', 'Var2': 'UInt8', 'Var3': str}
inplace
:bool
, defaultFalse
- If set to True, the original dataframe will be used and modified directly. Otherwise, a copy will be created and returned, without changing the original dataframe.
Returns
df
: pandas.DataFrame or dask.DataFrame or modin.pandas.dataframe.DataFrame
- Converted dataframe, in the desired data type.
Expand source code
def convert_dtypes(df, dtypes=None, inplace=False): '''Converts a dataframe's data types to the desired ones. Parameters ---------- df : pandas.DataFrame or dask.DataFrame or modin.pandas.DataFrame Original dataframe which will be converted. dtypes : dict, default None Dictionary that indicates the desired dtype for each column. e.g. {'Var1': 'float64', 'Var2': 'UInt8', 'Var3': str} inplace : bool, default False If set to True, the original dataframe will be used and modified directly. Otherwise, a copy will be created and returned, without changing the original dataframe. Returns ------- df : pandas.DataFrame or dask.DataFrame or modin.pandas.dataframe.DataFrame Converted dataframe, in the desired data type. ''' if not inplace: # Make a copy of the data to avoid potentially unwanted changes to the original dataframe data_df = df.copy() else: # Use the original dataframes data_df = df # Only use the dictionary keys that correspond to column names in the current dataframe dtype_dict = dict() df_columns = list(data_df.columns) for key, val in dtypes.items(): if key in df_columns: dtype_dict[key] = dtypes[key] elif key.lower() in df_columns: dtype_dict[key.lower()] = dtypes[key] try: # Set the desired dtypes data_df = data_df.astype(dtype_dict) except: warnings.warn('Failed to assign the desired data types. Replacing all <NA> values with Numpy NaN and trying again.') # Replace the '<NA>' objects with NumPy's NaN data_df = data_df.applymap(lambda x: x if not is_num_nan(x) else np.nan) # Set the desired dtypes data_df = data_df.astype(dtype_dict) return data_df
def convert_pyarrow_dtypes(df, inplace=False)
-
Converts a dataframe's data types to a pyarrow supported version.
Parameters
df
: pandas.DataFrame or dask.DataFrame or modin.pandas.DataFrame
- Original dataframe which will have its data types converted.
inplace
:bool
, defaultFalse
- If set to True, the original dataframe will be used and modified directly. Otherwise, a copy will be created and returned, without changing the original dataframe.
Returns
df
: pandas.DataFrame or dask.DataFrame or modin.pandas.dataframe.DataFrame
- Converted dataframe, in pyarrow compatible data types.
Expand source code
def convert_pyarrow_dtypes(df, inplace=False): '''Converts a dataframe's data types to a pyarrow supported version. Parameters ---------- df : pandas.DataFrame or dask.DataFrame or modin.pandas.DataFrame Original dataframe which will have its data types converted. inplace : bool, default False If set to True, the original dataframe will be used and modified directly. Otherwise, a copy will be created and returned, without changing the original dataframe. Returns ------- df : pandas.DataFrame or dask.DataFrame or modin.pandas.dataframe.DataFrame Converted dataframe, in pyarrow compatible data types. ''' if not inplace: # Make a copy of the data to avoid potentially unwanted changes to the original dataframe data_df = df.copy() else: # Use the original dataframes data_df = df # Create a columns data type dictionary dtype_dict = dict(data_df.dtypes) # Replace the pyarrow incompatible data types with similar, compatible ones for key, val in dtype_dict.items(): val = str(val) if (val == 'UInt8' or val == 'UInt16' or val == 'UInt32' or val == 'Int8' or val == 'Int16' or val == 'Int32' or val == 'boolean'): dtype_dict[key] = 'float32' elif val == 'UInt64' or val == 'Int64': dtype_dict[key] = 'float64' elif val == 'string': dtype_dict[key] = str # Apply the new data types data_df = data_df.astype(dtype_dict) return data_df
def get_full_number_string(x, decimal_digits=0)
-
Gets a full number's representation in a string. Particularly useful when one has very large float values, possibly too big to be represented as an integer.
Parameters
x
: float or double or int
- A numeric value that one wants to represent in a string, with all its digits visible.
decimal_digits
:int
, default0
- Number of decimal digits to account for in the number. Considering the value as a natural number, without decimals, by default.
Returns
x
: string
- The numeric value represented as a string, with all its digits visible.
Expand source code
def get_full_number_string(x, decimal_digits=0): '''Gets a full number's representation in a string. Particularly useful when one has very large float values, possibly too big to be represented as an integer. Parameters ---------- x : float or double or int A numeric value that one wants to represent in a string, with all it's numbers visible. decimal_digits : int, default 0 Number of decimal digits to account for in the number. Considering the value as a natural number, without decimals, by default. Returns ------- x : string A numeric value that one wants to represent in a string, with all it's numbers visible. ''' return f'{x:.{decimal_digits}f}'
def in_ipynb()
-
Detect if code is running in an IPython notebook, such as in Jupyter Lab.
Expand source code
def in_ipynb(): '''Detect if code is running in a IPython notebook, such as in Jupyter Lab.''' try: return str(type(get_ipython())) == "<class 'ipykernel.zmqshell.ZMQInteractiveShell'>" except Exception: # Not on IPython if get_ipython fails return False
def invert_dict(x)
-
Invert a dictionary, switching its keys with its values.
Parameters
x
:dict
- Dictionary to be inverted
Returns
x
: dict
- Inverted dictionary
Expand source code
def invert_dict(x): '''Invert a dictionary, switching its keys with its values. Parameters ---------- x : dict Dictionary to be inverted Returns ------- x : dict: Inverted dictionary ''' return {v: k for k, v in x.items()}
def is_definitely_string(x)
-
Reports if a value is actually a real string or if it has some number in it.
Parameters
x
- Any value which will be judged to be either a real string or numeric.
Returns
boolean
- Returns a boolean, being it True if it really is a string or False if it's either numeric data or a string with a number inside.
Expand source code
def is_definitely_string(x): '''Reports if a value is actually a real string or if it has some number in it. Parameters ---------- x Any value which will be judged to be either a real string or numeric. Returns ------- boolean Returns a boolean, being it True if it really is a string or False if it's either numeric data or a string with a number inside. ''' if isinstance(x, int) or isinstance(x, float): return False try: float(x) return False except Exception: return isinstance(x, str)
def is_integer(x)
-
Indicates if a number is an integer.
Parameters
x
: int or float or string
- A numeric value that will be checked if it's an integer.
Returns
boolean
- Returns a boolean, being it True if the number corresponds to an integer or False if it doesn't.
Expand source code
def is_integer(x): '''Indicates if a number is an integer. Parameters ---------- x : int or float or string A numeric value that will be checked if it's an integer. Returns ------- boolean Returns a boolean, being it True if the number corresponds to an integer or False if it doesn't. ''' try: float(x) except ValueError: return False return float(x).is_integer()
def is_num_nan(x)
-
Indicates if a number corresponds to a missing value.
Parameters
x
: int or float or string
- A numeric value that will be compared with possible missing value representations.
Returns
boolean
- Returns a boolean, being it True if the number corresponds to a missing value representation or False if it doesn't.
Expand source code
def is_num_nan(x): '''Indicates if a number corresponds to a missing value. Parameters ---------- x : int or float or string A numeric value that will be compared with possible missing value representations. Returns ------- boolean Returns a boolean, being it True if the number corresponds to a missing value representation or False if it doesn't. ''' str_val = str(x).lower() if str_val == 'nan' or str_val == '<na>': return True else: return False
def is_string_nan(x, specific_nan_strings=[])
-
Indicates if a string corresponds to a missing value.
Parameters
x
: string
- A string that will be compared with possible missing value representations.
specific_nan_strings
: list of strings, default []
- Parameter where the user can specify additional strings that should correspond to missing values.
Returns
boolean
- Returns a boolean, being it True if the string corresponds to a missing value representation or False if it doesn't.
Expand source code
def is_string_nan(x, specific_nan_strings=[]): '''Indicates if a string corresponds to a missing value. Parameters ---------- x : string A string that will be compared with possible missing value representations. specific_nan_strings : list of strings, default [] Parameter where the user can specify additional strings that should correspond to missing values. Returns ------- boolean Returns a boolean, being it True if the string corresponds to a missing value representation or False if it doesn't. ''' # Only considering strings for the missing values search if isinstance(x, str): # Considering the possibility of just 3 more random extra characters # in NaN-like strings if (('other' in x.lower() and len(x) < 9) or ('null' in x.lower() and len(x) < 7) or (x.lower() == 'nan') or ('discrepancy' in x.lower() and len(x) < 14) or all([char == ' ' for char in x]) or all([char == '_' for char in x]) or all([char == '.' for char in x]) or ('unknown' in x.lower()) or ('not obtainable' in x.lower()) or ('not obtained' in x.lower()) or ('not applicable' in x.lower()) or ('not available' in x.lower()) or ('not evaluated' in x.lower()) or (x in specific_nan_strings)): return True else: return False else: warnings.warn(f'Found a non string value of type {type(x)}. As we\'re \ expecting a string, any other format will be considered \ a missing value.') return True
def iterations_loop(x, see_progress=True, desc=None, leave=True)
-
Determine if a progress bar is shown or not.
Expand source code
def iterations_loop(x, see_progress=True, desc=None, leave=True): '''Determine if a progress bar is shown or not.''' if see_progress is True: # Use a progress bar return tqdm(x, desc=desc, leave=leave) else: # Don't show any progress bar if see_progress is False return x
def merge_dicts(dict1, dict2=None)
-
Merge two or more dictionaries into one. The second dictionary can overwrite the first one if there are overlapping keys.
Parameters
dict1
: dict or list of dicts
- Dictionary 1 that will be merged with dictionary 2 or list of dictionaries that will be merged.
dict2
:dict
, defaultNone
- Dictionary 2 that will be merged with dictionary 1. If not specified,
the user must define a list of dictionaries in parameter
dict1
to merge.
Returns
dict3
:dict
- New dictionary resulting from the merge.
Expand source code
def _merge_two_dicts(left, right):
    '''Merge two dictionaries into a new one, leaving both inputs untouched.'''
    # Values from the dictionary on the right win by default
    merged = {**left, **right}
    # Find if there are any overlapping keys
    for key in left.keys() & right.keys():
        left_val, right_val = left[key], right[key]
        if isinstance(left_val, (list, set)) or isinstance(right_val, (list, set)):
            # Merge the collections, without repeating elements
            union = set(left_val) | set(right_val)
            # The merged value follows the type of the value on the right
            merged[key] = list(union) if isinstance(right_val, list) else union
        else:
            warnings.warn(f"Found an overlapping key {key} when merging two "
                          "dictionaries which, as it doesn't point to a list "
                          "or a set, can't be merged. As such, the value from "
                          "the dictionary on the right will be kept.")
    return merged


def merge_dicts(dict1, dict2=None):
    '''Merge two or more dictionaries into one. The second dictionary can
    overwrite the first one if there are overlapping keys.

    Overlapping keys that point to a list or a set, in at least one of the
    dictionaries, have their values merged (union without repeated elements,
    so the order of the elements is not guaranteed). For any other
    overlapping key, a warning is issued and the value from the dictionary on
    the right is kept. The input dictionaries are never modified.

    Parameters
    ----------
    dict1 : dict or list of dicts
        Dictionary 1 that will be merged with dictionary 2 or list of
        dictionaries that will be merged.
    dict2 : dict, default None
        Dictionary 2 that will be merged with dictionary 1. If not specified,
        the user must define a list of dictionaries in parameter `dict1` to
        merge.

    Returns
    -------
    dict3 : dict
        New dictionary resulting from the merge. An empty list of
        dictionaries results in an empty dictionary.

    Raises
    ------
    ValueError
        If `dict1` is a single dictionary and `dict2` isn't specified.
    TypeError
        If `dict1` is a single dictionary and `dict2` isn't a dictionary, or
        if the arguments aren't either two dictionaries or a single list of
        dictionaries.
    '''
    if isinstance(dict1, dict):
        if dict2 is None:
            raise ValueError('ERROR: When `dict1` is specified as a single dictionary, the second argument `dict2` must also be set.')
        if not isinstance(dict2, dict):
            raise TypeError(f'ERROR: When `dict1` is specified as a single dictionary, the second argument `dict2` must also be a dictionary. Instead, received `dict2` of type {type(dict2)}.')
        # Merge the two input dictionaries
        return _merge_two_dicts(dict1, dict2)
    if isinstance(dict1, list) and dict2 is None:
        if not dict1:
            # Nothing to merge
            return {}
        # Initialize the new dictionary with a copy of the first one on the list
        new_dict = dict(dict1[0])
        for next_dict in dict1[1:]:
            # Merge with each of the following dictionaries, in order
            new_dict = _merge_two_dicts(new_dict, next_dict)
        return new_dict
    # Raise the error, instead of returning the exception object to the caller
    raise TypeError(f'ERROR: The first parameter `dict1` must be set as either a dictionary or a list of dictionaries. Instead, received `dict1` of type {type(dict1)}.')
def merge_lists(lists)
-
Merge two or more lists into one.
Parameters
lists : list of lists
- List containing all the lists that we want to merge.
Returns
lists : list
- New list with all the input lists flattened into a single list.
Expand source code
def merge_lists(lists):
    '''Flatten a list of lists into one single list.

    Parameters
    ----------
    lists : list of lists
        Collection of the lists that should be joined together.

    Returns
    -------
    lists : list
        New list holding the elements of every input list, in their
        original order.
    '''
    # Go through each inner list, in order, gathering all of its elements
    return [element for inner_list in lists for element in inner_list]
def remove_from_list(data, to_remove, update_idx=False)
-
Remove values from a list, with the option to update the remaining values every time one is removed.
Parameters
data
:list
- Data list to update by removing specified values.
to_remove : list or int or float or str
- Values to remove from the list.
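update_idx : bool, default False
- If set to True, and all the values to remove are integers, every remaining value in the list that is greater than a removed value is decreased by 1 each time a value is removed (useful when the values are used as indices).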
Returns
data
:list
- Updated data list.
Expand source code
def remove_from_list(data, to_remove, update_idx=False):
    '''Delete the given values from a list, optionally shifting down the
    values that are left after each removal.

    Parameters
    ----------
    data : list
        List from which the values are removed. It is modified directly.
    to_remove : list or int or float or str
        Value or values that should be taken out of the list.
    update_idx : bool, default False
        When exactly True, and every value to remove is an integer, each
        remaining value greater than the one just removed is decreased by 1.

    Returns
    -------
    data : list
        The same list object, after the removals.
    '''
    if isinstance(to_remove, (int, float, str)):
        # A single value was given, so wrap it in a list
        to_remove = [to_remove]
    # Shifting only makes sense if the values can be used as indices
    int_flags = [isinstance(target, int) for target in to_remove]
    shift_values = all(int_flags) and update_idx is True
    for target in to_remove:
        data.remove(target)
        if shift_values is not True:
            continue
        for position, item in enumerate(data):
            if item > target:
                # Decrease the value (which could be an index) by one
                data[position] = item - 1
    return data
def replace_dict_strings(dct, str_to_replace='0', new_str='_', replace_keys=True, replace_vals=True, inplace=False)
-
Replace strings in a dictionary, in keys and/or values, with a new, desired string.
Parameters
dct
:dict
- Dictionary that will have its keys and/or values modified.
str_to_replace
:str
, default'0'
- String to replace with a new one.
new_str
:str
, default'_'
- String to replace the old one.
replace_keys
:bool
, defaultTrue
- If set to True, the dictionary's keys will have their strings edited according to the string replacement set by the user.
replace_vals : bool, default True
- If set to True, the dictionary's values will have their strings edited according to the string replacement set by the user.
inplace
:bool
, defaultFalse
- If set to True, the original dictionary will be used and modified directly. Otherwise, a copy will be created and returned, without changing the original dictionary.
Returns
data_dct : dict
- Dictionary with the requested string replacements applied to its keys and/or values.
Expand source code
def replace_dict_strings(dct, str_to_replace='0', new_str='_', replace_keys=True,
                         replace_vals=True, inplace=False):
    '''Replace strings in a dictionary, in keys and/or values, with a new,
    desired string.

    Parameters
    ----------
    dct : dict
        Dictionary that will have its keys and/or values modified.
    str_to_replace : str, default '0'
        String to replace with a new one.
    new_str : str, default '_'
        String to replace the old one.
    replace_keys : bool, default True
        If set to True, the dictionary's keys will have their strings edited
        according to the string replacement set by the user.
    replace_vals : bool, default True
        If set to True, the dictionary's values will have their strings edited
        according to the string replacement set by the user.
    inplace : bool, default False
        If set to True, the original dictionary will be used and modified
        directly. Otherwise, a copy will be created and returned, without
        changing the original dictionary.

    Returns
    -------
    data_dct : dict
        Dictionary with the string replacements applied.

    Notes
    -----
    Keys and values are compared through their string representation, so
    a non string key or value ends up converted into a string.
    '''
    if not inplace:
        # Make a copy of the data to avoid potentially unwanted changes to
        # the original dictionary
        data_dct = dct.copy()
    else:
        # Use the original dictionary
        data_dct = dct
    if replace_keys is True:
        # Iterate over a snapshot of the keys, as a dictionary can't have its
        # keys changed while it's being iterated
        for key in list(data_dct):
            # Replace undesired string with the new one
            new_key = str(key).replace(str_to_replace, new_str)
            if new_key != key:
                # Remove the old key and replace with the new one
                data_dct[new_key] = data_dct.pop(key)
    if replace_vals is True:
        # Only existing keys are reassigned, which is safe during iteration
        for key, val in data_dct.items():
            # Replace undesired string with the new one
            new_val = str(val).replace(str_to_replace, new_str)
            if new_val != val:
                # Replace the old value with the new one, in the same key
                data_dct[key] = new_val
    return data_dct
def reverse(data)
-
Reverse the order of a tensor or list.
Parameters
data : torch.Tensor or list
- PyTorch tensor or list to reverse.
Returns
data : torch.Tensor or list
- Reversed tensor or list.
Expand source code
def reverse(data):
    '''Reverse the order of a tensor or list.

    Parameters
    ----------
    data : torch.Tensor or list
        PyTorch tensor or list to revert. Tensors are reversed along their
        first dimension.

    Returns
    -------
    data : torch.Tensor or list
        Reversed tensor or list.
    '''
    if isinstance(data, torch.Tensor):
        # PyTorch doesn't support slicing with a negative step, so the
        # tensor needs to be flipped along its first dimension instead
        return torch.flip(data, dims=[0])
    return data[::-1]