Package data_utils

Expand source code
__version__ = '0.1.0'

from comet_ml import Experiment                         # Comet.ml can log training metrics, parameters, do version control and parameter optimization
import torch                                            # PyTorch to create and apply deep learning models
import numpy as np                                      # NumPy to handle numeric and NaN operations
from importlib import reload                            # Allows to reload (import again) modules, which make them rerun their initialization
import random

# Random seed used in PyTorch and NumPy's random operations (such as weight initialization)
# Automatic seed
random_seed = np.random.get_state()
random.seed(random_seed[1][0])
np.random.set_state(random_seed)
torch.manual_seed(random_seed[1][0])
torch.cuda.manual_seed_all(random_seed[1][0])
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Boolean that sets whether to use the original Pandas library or the Modin distributed version
use_modin = False

from . import utils                                     # Generic and useful methods
from . import datasets                                  # PyTorch dataset classes
from . import search_explore                            # Methods to search and explore data
from . import visualization                             # Visualization and plotting tools
from . import data_processing                           # Data processing and dataframe operations
from . import padding                                   # Padding and variable sequence length related methods
from . import embedding                                 # Embeddings and other categorical features handling methods
from . import deep_learning                             # Common and generic deep learning related methods
from . import machine_learning                          # Machine learning focused pipeline methods

# Methods

def set_random_seed(num):
    '''Set a user specified seed to use in stochastic (i.e. random) processes.
    This method should be called before importing packages which use a
    random seed.

    Parameters
    ----------
    num : int
        Number that will serve as the random seed.

    Returns
    -------
    None
    '''
    global random_seed
    random_seed = num
    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)
    return


def set_pandas_library(lib='modin'):
    global use_modin
    if lib.lower() == 'modin':
        use_modin = True
    elif lib.lower() == 'pandas':
        use_modin = False
    else:
        raise Exception(f'ERROR: {lib} is an invalid pandas library. Must either use `pandas` or `modin`.')
    # Reload the modules, to update their pandas package
    reload(utils)
    reload(datasets)
    reload(search_explore)
    reload(visualization)
    reload(data_processing)
    reload(padding)
    reload(embedding)
    reload(deep_learning)
    reload(machine_learning)

Sub-modules

data_utils.data_processing
data_utils.datasets
data_utils.deep_learning
data_utils.embedding
data_utils.machine_learning
data_utils.padding
data_utils.search_explore
data_utils.utils
data_utils.visualization

Functions

def set_pandas_library(lib='modin')
Expand source code
def set_pandas_library(lib='modin'):
    global use_modin
    if lib.lower() == 'modin':
        use_modin = True
    elif lib.lower() == 'pandas':
        use_modin = False
    else:
        raise Exception(f'ERROR: {lib} is an invalid pandas library. Must either use `pandas` or `modin`.')
    # Reload the modules, to update their pandas package
    reload(utils)
    reload(datasets)
    reload(search_explore)
    reload(visualization)
    reload(data_processing)
    reload(padding)
    reload(embedding)
    reload(deep_learning)
    reload(machine_learning)
def set_random_seed(num)

Set a user specified seed to use in stochastic (i.e. random) processes. This method should be called before importing packages which use a random seed.

Parameters

num : int
Number that will serve as the random seed.

Returns

None
 
Expand source code
def set_random_seed(num):
    '''Set a user specified seed to use in stochastic (i.e. random) processes.
    This method should be called before importing packages which use a
    random seed.

    Parameters
    ----------
    num : int
        Number that will serve as the random seed.

    Returns
    -------
    None
    '''
    global random_seed
    random_seed = num
    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)
    return