# Source code for antinex_utils.build_scaler_train_and_test_datasets

import pandas as pd
from spylunking.log.setup_logging import build_colorized_logger
from antinex_utils.consts import SUCCESS
from antinex_utils.consts import ERR
from antinex_utils.consts import FAILED
from antinex_utils.consts import NOTRUN
from antinex_utils.build_scaler_dataset_from_records import \
    build_scaler_dataset_from_records
from sklearn.model_selection import train_test_split


# Module-level logger shared by every call in this file; the name matches
# the module so log lines can be traced back to it.
log = build_colorized_logger(name="build_scaler_train_and_test_datasets")


def build_scaler_train_and_test_datasets(
        label,
        train_features,
        test_feature,
        df,
        test_size,
        seed,
        scaler_cast_to_type="float32",
        min_feature_range=-1,
        max_feature_range=1):
    """build_scaler_train_and_test_datasets

    Build one scaled dataset from the ``train_features`` columns of ``df``
    and one from the ``test_feature`` column (each through
    ``build_scaler_dataset_from_records``), then split both with
    ``train_test_split`` into ``x_train``/``x_test``/``y_train``/``y_test``.

    :param label: log label
    :param train_features: features to train
    :param test_feature: target feature name
    :param df: dataframe to build scalers and test and train datasets
    :param test_size: percent of test to train rows (passed to
                      ``train_test_split`` as ``test_size``)
    :param seed: passed to ``train_test_split`` as ``random_state``
    :param scaler_cast_to_type: passed to
                                ``build_scaler_dataset_from_records``
                                as ``cast_to_type``
    :param min_feature_range: min scaler range
    :param max_feature_range: max scaler range
    :return: dict with keys ``status``, ``err`` and ``data``.
             ``status`` is ``SUCCESS`` when both scaled datasets were
             built and split, ``FAILED`` when either scaler build did not
             report ``SUCCESS``, and ``ERR`` when an exception was raised.
             ``err`` holds the last step description (empty on success).
             ``data`` holds the dataframes, scaled datasets, scalers and
             splits; those entries stay ``None`` unless the status is
             ``SUCCESS``.
    """

    status = NOTRUN
    last_step = "not-run"

    # Everything except the ranges stays None unless the full pipeline
    # succeeds, so callers never see a half-populated result.
    data = {
        "scaled_train_df": None,
        "scaled_test_df": None,
        "scaled_train_dataset": None,
        "scaled_test_dataset": None,
        "scaler_train": None,
        "scaler_test": None,
        "x_train": None,
        "y_train": None,
        "x_test": None,
        "y_test": None,
        "min_range": min_feature_range,
        "max_range": max_feature_range
    }

    try:

        last_step = ("building scalers df.rows={} columns={} "
                     "train_features={} test_feature={}").format(
                        len(df.index),
                        list(df.columns.values),
                        train_features,
                        test_feature)
        log.info(("{} - {}")
                 .format(
                    label,
                    last_step))

        scaled_train_df = df[train_features]
        scaled_test_df = pd.DataFrame(
            {test_feature: df[test_feature]})

        last_step = ("building scaled TRAIN dataset [{},{}]").format(
            min_feature_range,
            max_feature_range)
        log.info(("{} - {}")
                 .format(
                    label,
                    last_step))
        scaled_train_res = \
            build_scaler_dataset_from_records(
                label=label,
                record_list=scaled_train_df.to_json(),
                min_feature=min_feature_range,
                max_feature=max_feature_range,
                cast_to_type=scaler_cast_to_type)

        last_step = ("building scaled TEST dataset [{},{}]").format(
            min_feature_range,
            max_feature_range)
        log.info(("{} - {}")
                 .format(
                    label,
                    last_step))
        scaled_test_res = \
            build_scaler_dataset_from_records(
                label=label,
                record_list=scaled_test_df.to_json(),
                min_feature=min_feature_range,
                max_feature=max_feature_range,
                cast_to_type=scaler_cast_to_type)

        train_ok = scaled_train_res["status"] == SUCCESS
        test_ok = scaled_test_res["status"] == SUCCESS

        last_step = ("scaled dataset transform "
                     "train_status={} test_status={}").format(
                        train_ok,
                        test_ok)
        log.info(("{} - {}")
                 .format(
                    label,
                    last_step))

        # Guard clause: stop before splitting if either scaler build failed.
        if not (train_ok and test_ok):
            last_step = ("failed dataset transform "
                         "train_status={} test_status={}").format(
                            scaled_train_res["status"],
                            scaled_test_res["status"])
            log.error(("{} - {}")
                      .format(
                        label,
                        last_step))
            status = FAILED
            return {
                "status": status,
                "err": last_step,
                "data": data
            }

        last_step = ("scaled train_rows={} "
                     "test_rows={}").format(
                        len(scaled_train_res["dataset"]),
                        len(scaled_test_res["dataset"]))
        log.info(("{} - {}")
                 .format(
                    label,
                    last_step))

        scaler_train = scaled_train_res["scaler"]
        scaler_test = scaled_test_res["scaler"]
        scaled_train_dataset = scaled_train_res["dataset"]
        scaled_test_dataset = scaled_test_res["dataset"]
        (x_train,
         x_test,
         y_train,
         y_test) = train_test_split(
                        scaled_train_dataset,
                        scaled_test_dataset,
                        test_size=test_size,
                        random_state=seed)

        # Report the sizes of the actual splits. The previous version
        # formatted the unsplit row counts under the x_/y_ labels.
        last_step = ("train_rows={} test_rows={} "
                     "x_train={} x_test={} "
                     "y_train={} y_test={}").format(
                        len(scaled_train_df.index),
                        len(scaled_test_df),
                        len(x_train),
                        len(x_test),
                        len(y_train),
                        len(y_test))
        log.info(("{} - {}")
                 .format(
                    label,
                    last_step))

        data["scaled_train_df"] = scaled_train_df
        data["scaled_test_df"] = scaled_test_df
        data["scaled_train_dataset"] = scaled_train_dataset
        data["scaled_test_dataset"] = scaled_test_dataset
        data["scaler_train"] = scaler_train
        data["scaler_test"] = scaler_test
        data["x_train"] = x_train
        data["y_train"] = y_train
        data["x_test"] = x_test
        data["y_test"] = y_test

        status = SUCCESS
        last_step = ""
        log.info(("{} - done")
                 .format(
                    label))

    except Exception as e:
        # df may itself be the cause of the failure (e.g. None or not a
        # dataframe). Describing it must not raise from inside the
        # handler, otherwise the caller gets an exception instead of ERR.
        try:
            df_rows = len(df.index)
            df_columns = list(df.columns.values)
        except Exception:
            df_rows = None
            df_columns = None

        last_step = ("failed during last_step='{}' with ex={} "
                     "building scalers df.rows={} columns={} "
                     "train_features={} test_feature={}").format(
                        last_step,
                        e,
                        df_rows,
                        df_columns,
                        train_features,
                        test_feature)
        log.error(("{} - {}")
                  .format(
                    label,
                    last_step))
        status = ERR
    # end of try/ex

    res = {
        "status": status,
        "err": last_step,
        "data": data
    }
    return res
# end of build_scaler_train_and_test_datasets