In [1]:
!pip install --upgrade pip
!pip install imblearn
!pip install xgboost
import warnings
warnings.filterwarnings('ignore')

import s3fs
import pickle as pkl
import tarfile
import xgboost
import io
import os
import matplotlib.pyplot as plt
import seaborn as sn
import numpy as np 
import pandas as pd 
import boto3
import sagemaker
from scipy.stats                       import ks_2samp
from sklearn.metrics                   import confusion_matrix, auc, roc_curve, recall_score,accuracy_score, precision_score, roc_auc_score, precision_recall_curve, average_precision_score, f1_score
from sagemaker.amazon.amazon_estimator import get_image_uri
from time                              import gmtime, strftime, sleep
from sagemaker                         import get_execution_role
from sklearn.model_selection           import train_test_split
from imblearn.over_sampling            import SMOTE
%matplotlib inline
Requirement already satisfied: pip in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (21.0.1)
Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.8.0-py3-none-any.whl (206 kB)
     |████████████████████████████████| 206 kB 17.7 MB/s eta 0:00:01
Requirement already satisfied: joblib>=0.11 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from imbalanced-learn->imblearn) (1.0.1)
Requirement already satisfied: numpy>=1.13.3 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from imbalanced-learn->imblearn) (1.19.5)
Requirement already satisfied: scipy>=0.19.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from imbalanced-learn->imblearn) (1.5.3)
Requirement already satisfied: scikit-learn>=0.24 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from imbalanced-learn->imblearn) (0.24.1)
Requirement already satisfied: threadpoolctl>=2.0.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from scikit-learn>=0.24->imbalanced-learn->imblearn) (2.1.0)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.8.0 imblearn-0.0
Collecting xgboost
  Downloading xgboost-1.4.0-py3-none-manylinux2010_x86_64.whl (166.7 MB)
     |████████████████████████████████| 166.7 MB 23 kB/s s eta 0:00:01
Requirement already satisfied: numpy in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from xgboost) (1.19.5)
Requirement already satisfied: scipy in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from xgboost) (1.5.3)
Installing collected packages: xgboost
Successfully installed xgboost-1.4.0
In [2]:
# in this cell is all the import information for the all rest of code.
# all informations about Sagemaker.
sagemaker_session = sagemaker.Session()
region            = boto3.Session().region_name
smclient          = boto3.Session().client('sagemaker')
role              = sagemaker.get_execution_role()
bucket            = sagemaker_session.default_bucket()
In [3]:
# import dataset only
base_raiz = pd.DataFrame(pd.read_csv('TB_TBN_SISCAN_HISTO_MAMA_PACNT.csv',
                                    sep = ';'))
In [4]:
base_raiz.columns
Out[4]:
Index(['CO_SEQ_SISCAN_HISTO_MAMA_PACNT', 'CO_UF_RESIDENCIA',
       'CO_MUN_RESIDENCIA', 'NU_ANO_COMPETENCIA', 'NU_ANO_MES_COMPETENCIA',
       'CO_RACA_COR', 'CO_IDADE_PACIENTE', 'CO_ESCOLARIDADE',
       'CO_INTERVALO_COLETA', 'CO_INTERVALO_EXAME', 'CO_TEMPO_EXAME',
       'TP_RISCO_ELEVADO', 'TP_EXAME_HISTOPATOLOGICO', 'TP_DETECCAO_LESAO',
       'TP_LATERALIDADE_LESAO', 'TP_TAMANHO_LESAO',
       'TP_LINFONODO_AXILAR_PALPAVEL', 'TP_MATER_ENVIA_PROCEDENTE',
       'TP_PROCEDIMENTO_CIRURGICO', 'TP_ADEQUABILIDAD_MATERIAL',
       'ST_MICROCALCIFICACAO', 'TP_LESAO', 'TP_LESAO_CARAT_NEOPL_MALI',
       'TP_GRAU_HISTOLOGICO', 'TP_MARGEM_CIRURGICA', 'CO_PACIENTE', 'SG_SEXO',
       'TP_DIAGNOSTICO_IMAGEM', 'TP_TAMANHO_TUMOR', 'CO_ANO_RESULTADO',
       'TP_LAUDO_HISTOPATOLOGICO', 'SG_UF_RESIDENCIA'],
      dtype='object')
In [5]:
# I make a correlation matrix only for visual help.
corrMatrix = base_raiz.corr()
sn.heatmap(corrMatrix, annot=True)
plt.show()
In [6]:
corr_matrix = base_raiz.corr().abs()
upper       = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop     = [column for column in upper.columns if any(upper[column] >= 0.95)]
to_drop
Out[6]:
['CO_MUN_RESIDENCIA', 'NU_ANO_MES_COMPETENCIA', 'TP_LAUDO_HISTOPATOLOGICO']
In [7]:
# drop time.
# this column I not use on the problem.
base_raiz.drop(columns ={'NU_ANO_COMPETENCIA', 
                         'NU_ANO_MES_COMPETENCIA', 
                         'CO_TEMPO_EXAME', 
                         'TP_EXAME_HISTOPATOLOGICO', 
                         'TP_MATER_ENVIA_PROCEDENTE', 
                         'TP_PROCEDIMENTO_CIRURGICO',
                         'TP_ADEQUABILIDAD_MATERIAL',
                         'CO_ANO_RESULTADO', 
                         'TP_LESAO_CARAT_NEOPL_MALI', 
                         'TP_GRAU_HISTOLOGICO', 
                         'TP_MARGEM_CIRURGICA', 
                         'TP_DIAGNOSTICO_IMAGEM', 
                         'TP_LESAO',
                         'CO_PACIENTE'},
               inplace = True)
base_raiz = base_raiz.select_dtypes(exclude=['object'])
base_raiz.columns
Out[7]:
Index(['CO_SEQ_SISCAN_HISTO_MAMA_PACNT', 'CO_UF_RESIDENCIA',
       'CO_MUN_RESIDENCIA', 'CO_RACA_COR', 'CO_IDADE_PACIENTE',
       'CO_ESCOLARIDADE', 'CO_INTERVALO_COLETA', 'CO_INTERVALO_EXAME',
       'TP_RISCO_ELEVADO', 'TP_DETECCAO_LESAO', 'TP_LATERALIDADE_LESAO',
       'TP_TAMANHO_LESAO', 'TP_LINFONODO_AXILAR_PALPAVEL', 'TP_TAMANHO_TUMOR',
       'TP_LAUDO_HISTOPATOLOGICO'],
      dtype='object')
In [8]:
base_raiz.groupby('TP_LAUDO_HISTOPATOLOGICO').count()
Out[8]:
CO_SEQ_SISCAN_HISTO_MAMA_PACNT CO_UF_RESIDENCIA CO_MUN_RESIDENCIA CO_RACA_COR CO_IDADE_PACIENTE CO_ESCOLARIDADE CO_INTERVALO_COLETA CO_INTERVALO_EXAME TP_RISCO_ELEVADO TP_DETECCAO_LESAO TP_LATERALIDADE_LESAO TP_TAMANHO_LESAO TP_LINFONODO_AXILAR_PALPAVEL TP_TAMANHO_TUMOR
TP_LAUDO_HISTOPATOLOGICO
1 50127 50127 50127 50127 50127 420 50126 50127 50127 50127 50127 50127 50127 50127
2 1922 1922 1922 1922 1922 2 1922 1922 1922 1922 1922 1922 1922 1922
3 665 665 665 665 665 2 665 665 665 665 665 665 665 665
4 76821 76821 76821 76821 76821 223 76819 76821 76821 76821 76821 76821 76821 76821
6 1062 1062 1062 1062 1062 7 1062 1062 1062 1062 1062 1062 1062 1062
In [9]:
#shuffle dataframe
base_raiz = base_raiz.sample(frac = 1)
Y_column  = base_raiz['TP_LAUDO_HISTOPATOLOGICO']
X_columns = base_raiz.drop(columns = 'TP_LAUDO_HISTOPATOLOGICO')

#prepare the test out-of-time base:
X_train_val, X_test, Y_train_val, Y_test = train_test_split(X_columns,
                                                            Y_column, 
                                                            test_size    = 0.15, 
                                                            random_state = 666)
base_treino_val = pd.concat([X_train_val, Y_train_val], axis = 1)
base_treino_val.columns
Out[9]:
Index(['CO_SEQ_SISCAN_HISTO_MAMA_PACNT', 'CO_UF_RESIDENCIA',
       'CO_MUN_RESIDENCIA', 'CO_RACA_COR', 'CO_IDADE_PACIENTE',
       'CO_ESCOLARIDADE', 'CO_INTERVALO_COLETA', 'CO_INTERVALO_EXAME',
       'TP_RISCO_ELEVADO', 'TP_DETECCAO_LESAO', 'TP_LATERALIDADE_LESAO',
       'TP_TAMANHO_LESAO', 'TP_LINFONODO_AXILAR_PALPAVEL', 'TP_TAMANHO_TUMOR',
       'TP_LAUDO_HISTOPATOLOGICO'],
      dtype='object')
In [10]:
X_test.sample(1).to_csv('teste_lambda.csv')

CORRIGE NAN NA BASE DE TREINO & VALIDAÇÃO

In [11]:
column_means    = base_treino_val.mean()
base_treino_val = base_treino_val.fillna(column_means)
base_treino_val.sample(1)
Out[11]:
CO_SEQ_SISCAN_HISTO_MAMA_PACNT CO_UF_RESIDENCIA CO_MUN_RESIDENCIA CO_RACA_COR CO_IDADE_PACIENTE CO_ESCOLARIDADE CO_INTERVALO_COLETA CO_INTERVALO_EXAME TP_RISCO_ELEVADO TP_DETECCAO_LESAO TP_LATERALIDADE_LESAO TP_TAMANHO_LESAO TP_LINFONODO_AXILAR_PALPAVEL TP_TAMANHO_TUMOR TP_LAUDO_HISTOPATOLOGICO
20391 61241 26 260290 1 30 2.802893 1.0 2 2 2 2 3 2 5 4

CORRIGE NAN NA BASE DE TESTE

In [12]:
base_teste              = pd.concat([X_test, Y_test], axis = 1)
base_teste              = base_teste.sample(frac = 1)
column_means            = base_teste.mean()
base_teste              = base_teste.fillna(column_means)
base_teste['flag_raiz'] = np.where(base_teste['TP_LAUDO_HISTOPATOLOGICO'] == 1, 0,
                          np.where(base_teste['TP_LAUDO_HISTOPATOLOGICO'] == 2, 1,
                          np.where(base_teste['TP_LAUDO_HISTOPATOLOGICO'] == 3, 2,
                          np.where(base_teste['TP_LAUDO_HISTOPATOLOGICO'] == 4, 3, 4))))
base_teste.drop(columns = 'TP_LAUDO_HISTOPATOLOGICO',
                inplace = True)

X_test = pd.DataFrame(base_teste.drop(columns = 'flag_raiz'))
Y_test = pd.DataFrame(base_teste['flag_raiz'])
In [13]:
X_test.sample(2)
Out[13]:
CO_SEQ_SISCAN_HISTO_MAMA_PACNT CO_UF_RESIDENCIA CO_MUN_RESIDENCIA CO_RACA_COR CO_IDADE_PACIENTE CO_ESCOLARIDADE CO_INTERVALO_COLETA CO_INTERVALO_EXAME TP_RISCO_ELEVADO TP_DETECCAO_LESAO TP_LATERALIDADE_LESAO TP_TAMANHO_LESAO TP_LINFONODO_AXILAR_PALPAVEL TP_TAMANHO_TUMOR
108077 152425 26 260790 99 29 2.811881 1.0 1 2 1 1 1 2 5
94449 351741 31 312770 99 74 2.811881 1.0 2 1 1 1 2 2 4
In [14]:
Y_test
Out[14]:
flag_raiz
63906 3
49264 0
16897 3
38712 3
125490 3
... ...
39986 3
85524 0
5621 3
96706 3
62064 3

19590 rows × 1 columns

PREPARA BASE DE TREINO E VALIDAÇÃO

In [15]:
base_treino_val.groupby('TP_LAUDO_HISTOPATOLOGICO').count()
Out[15]:
CO_SEQ_SISCAN_HISTO_MAMA_PACNT CO_UF_RESIDENCIA CO_MUN_RESIDENCIA CO_RACA_COR CO_IDADE_PACIENTE CO_ESCOLARIDADE CO_INTERVALO_COLETA CO_INTERVALO_EXAME TP_RISCO_ELEVADO TP_DETECCAO_LESAO TP_LATERALIDADE_LESAO TP_TAMANHO_LESAO TP_LINFONODO_AXILAR_PALPAVEL TP_TAMANHO_TUMOR
TP_LAUDO_HISTOPATOLOGICO
1 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540
2 1613 1613 1613 1613 1613 1613 1613 1613 1613 1613 1613 1613 1613 1613
3 551 551 551 551 551 551 551 551 551 551 551 551 551 551
4 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390
6 913 913 913 913 913 913 913 913 913 913 913 913 913 913
In [16]:
base_treino_val['flag_2'] = np.where(base_treino_val['TP_LAUDO_HISTOPATOLOGICO'] == 2, 1, 0)
base_treino_val['flag_3'] = np.where(base_treino_val['TP_LAUDO_HISTOPATOLOGICO'] == 3, 1, 0)
base_treino_val['flag_6'] = np.where(base_treino_val['TP_LAUDO_HISTOPATOLOGICO'] == 6, 1, 0)
base_treino_val.columns
Out[16]:
Index(['CO_SEQ_SISCAN_HISTO_MAMA_PACNT', 'CO_UF_RESIDENCIA',
       'CO_MUN_RESIDENCIA', 'CO_RACA_COR', 'CO_IDADE_PACIENTE',
       'CO_ESCOLARIDADE', 'CO_INTERVALO_COLETA', 'CO_INTERVALO_EXAME',
       'TP_RISCO_ELEVADO', 'TP_DETECCAO_LESAO', 'TP_LATERALIDADE_LESAO',
       'TP_TAMANHO_LESAO', 'TP_LINFONODO_AXILAR_PALPAVEL', 'TP_TAMANHO_TUMOR',
       'TP_LAUDO_HISTOPATOLOGICO', 'flag_2', 'flag_3', 'flag_6'],
      dtype='object')
In [17]:
from collections import Counter
smote = SMOTE(sampling_strategy = 0.05,
              random_state = 666)
X_raiz, Y_raiz = smote.fit_resample(base_treino_val.drop(columns='flag_2'), base_treino_val['flag_2'])
base_treino_val = X_raiz
base_treino_val.groupby('TP_LAUDO_HISTOPATOLOGICO').count()
Out[17]:
CO_SEQ_SISCAN_HISTO_MAMA_PACNT CO_UF_RESIDENCIA CO_MUN_RESIDENCIA CO_RACA_COR CO_IDADE_PACIENTE CO_ESCOLARIDADE CO_INTERVALO_COLETA CO_INTERVALO_EXAME TP_RISCO_ELEVADO TP_DETECCAO_LESAO TP_LATERALIDADE_LESAO TP_TAMANHO_LESAO TP_LINFONODO_AXILAR_PALPAVEL TP_TAMANHO_TUMOR flag_3 flag_6
TP_LAUDO_HISTOPATOLOGICO
1 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540
2 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469
3 551 551 551 551 551 551 551 551 551 551 551 551 551 551 551 551
4 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390
6 913 913 913 913 913 913 913 913 913 913 913 913 913 913 913 913
In [18]:
from collections import Counter
smote = SMOTE(sampling_strategy = 0.03,
              random_state = 666)
X_raiz, Y_raiz = smote.fit_resample(base_treino_val.drop(columns='flag_3'), base_treino_val['flag_3'])
base_treino_val = X_raiz
base_treino_val.groupby('TP_LAUDO_HISTOPATOLOGICO').count()
Out[18]:
CO_SEQ_SISCAN_HISTO_MAMA_PACNT CO_UF_RESIDENCIA CO_MUN_RESIDENCIA CO_RACA_COR CO_IDADE_PACIENTE CO_ESCOLARIDADE CO_INTERVALO_COLETA CO_INTERVALO_EXAME TP_RISCO_ELEVADO TP_DETECCAO_LESAO TP_LATERALIDADE_LESAO TP_TAMANHO_LESAO TP_LINFONODO_AXILAR_PALPAVEL TP_TAMANHO_TUMOR flag_6
TP_LAUDO_HISTOPATOLOGICO
1 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540
2 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469
3 3429 3429 3429 3429 3429 3429 3429 3429 3429 3429 3429 3429 3429 3429 3429
4 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390
6 913 913 913 913 913 913 913 913 913 913 913 913 913 913 913
In [19]:
from collections import Counter
smote = SMOTE(sampling_strategy = 0.05,
              random_state = 666)
X_raiz, Y_raiz = smote.fit_resample(base_treino_val.drop(columns='flag_6'), base_treino_val['flag_6'])
base_treino_val = X_raiz
base_treino_val.groupby('TP_LAUDO_HISTOPATOLOGICO').count()
Out[19]:
CO_SEQ_SISCAN_HISTO_MAMA_PACNT CO_UF_RESIDENCIA CO_MUN_RESIDENCIA CO_RACA_COR CO_IDADE_PACIENTE CO_ESCOLARIDADE CO_INTERVALO_COLETA CO_INTERVALO_EXAME TP_RISCO_ELEVADO TP_DETECCAO_LESAO TP_LATERALIDADE_LESAO TP_TAMANHO_LESAO TP_LINFONODO_AXILAR_PALPAVEL TP_TAMANHO_TUMOR
TP_LAUDO_HISTOPATOLOGICO
1 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540
2 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469
3 3429 3429 3429 3429 3429 3429 3429 3429 3429 3429 3429 3429 3429 3429
4 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390
6 5841 5841 5841 5841 5841 5841 5841 5841 5841 5841 5841 5841 5841 5841
In [20]:
base_treino_val['flag_raiz'] = np.where(base_treino_val['TP_LAUDO_HISTOPATOLOGICO'] == 1, 0,
                               np.where(base_treino_val['TP_LAUDO_HISTOPATOLOGICO'] == 2, 1,
                               np.where(base_treino_val['TP_LAUDO_HISTOPATOLOGICO'] == 3, 2,
                               np.where(base_treino_val['TP_LAUDO_HISTOPATOLOGICO'] == 4, 3, 4))))
base_treino_val.drop(columns = 'TP_LAUDO_HISTOPATOLOGICO',
               inplace = True)
base_treino_val.groupby('flag_raiz').count()
Out[20]:
CO_SEQ_SISCAN_HISTO_MAMA_PACNT CO_UF_RESIDENCIA CO_MUN_RESIDENCIA CO_RACA_COR CO_IDADE_PACIENTE CO_ESCOLARIDADE CO_INTERVALO_COLETA CO_INTERVALO_EXAME TP_RISCO_ELEVADO TP_DETECCAO_LESAO TP_LATERALIDADE_LESAO TP_TAMANHO_LESAO TP_LINFONODO_AXILAR_PALPAVEL TP_TAMANHO_TUMOR
flag_raiz
0 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540 42540
1 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469 5469
2 3429 3429 3429 3429 3429 3429 3429 3429 3429 3429 3429 3429 3429 3429
3 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390 65390
4 5841 5841 5841 5841 5841 5841 5841 5841 5841 5841 5841 5841 5841 5841
In [21]:
base_treino_val = base_treino_val.sample(frac = 1)
Y_column        = base_treino_val['flag_raiz']
X_columns       = base_treino_val.drop(columns = 'flag_raiz')

X_train, X_val, Y_train, Y_val           = train_test_split(X_columns,
                                                            Y_column, 
                                                            test_size    = 0.3, 
                                                            random_state = 666)
Y_train = pd.DataFrame(Y_train)
X_train = pd.DataFrame(X_train)
Y_val   = pd.DataFrame(Y_val)
X_val   = pd.DataFrame(X_val)

INICIA PROCESSO DE TREINAMENTO

In [24]:
data_directory = 'histo_detection'
prefix_model   = 'model'
prefix_other   = 'predict'
prefix_tuning  = 'SGM - Output Hyper.Tun'

if not os.path.exists(data_directory):
        os.makedirs(data_directory)
        
pd.DataFrame(X_test).to_csv(os.path.join(data_directory, 'test_X.csv'), header=False, index=False)
pd.DataFrame(Y_test).to_csv(os.path.join(data_directory, 'test_Y.csv'), header=False, index=False)

pd.DataFrame(X_train).to_csv(os.path.join(data_directory, 'train_X.csv'), header=False, index=False)
pd.DataFrame(X_val).to_csv(os.path.join(data_directory, 'val_X.csv'), header=False, index=False)

pd.concat([Y_val  , X_val]  , axis=1).to_csv(os.path.join(data_directory, 'valid.csv'), header=False, index=False)
pd.concat([Y_train, X_train], axis=1).to_csv(os.path.join(data_directory, 'train.csv'), header=False, index=False)
In [40]:
# upload all data to S3
train_loc   = sagemaker_session.upload_data(os.path.join(data_directory, 'train.csv'), key_prefix=prefix_model)  #Y(variavel resposta)
train_X_loc = sagemaker_session.upload_data(os.path.join(data_directory, 'train_X.csv'), key_prefix=prefix_other)#X(features)

valid_loc   = sagemaker_session.upload_data(os.path.join(data_directory, 'valid.csv'), key_prefix=prefix_model)
valid_X_loc = sagemaker_session.upload_data(os.path.join(data_directory, 'val_X.csv'), key_prefix=prefix_other)

test_X_loc  = sagemaker_session.upload_data(os.path.join(data_directory, 'test_X.csv'), key_prefix=prefix_other)
test_Y_loc  = sagemaker_session.upload_data(os.path.join(data_directory, 'test_Y.csv'), key_prefix=prefix_other)
In [43]:
tuning_job_name = 'xgboost-tuningjob-' + strftime("%d-%H-%M-%S", gmtime())
print (tuning_job_name)
tuning_job_config = {
    "ParameterRanges": {
      "CategoricalParameterRanges": [],
      "ContinuousParameterRanges": [
        {
          "MaxValue": "0.5",
          "MinValue": "0",
          "Name": "eta",
        },
        {
          "MaxValue": "5",
          "MinValue": "0",
          "Name": "gamma",
        },
        {
          "MaxValue": "120",
          "MinValue": "0",
          "Name": "min_child_weight",
        },
        {
          "MaxValue": "1000",
          "MinValue": "0",
          "Name": "alpha",
        },
        {
          "MaxValue": "1",
          "MinValue": "0.5",
          "Name": "subsample",            
        },
        {
          "MaxValue": "1",
          "MinValue": "0.5",
          "Name": "colsample_bylevel",            
        },
        {
          "MaxValue": "1",
          "MinValue": "0.5",
          "Name": "colsample_bytree",            
        },
        {
          "MaxValue": "1000",
          "MinValue": "0",
          "Name": "lambda",            
        }
      ],
      "IntegerParameterRanges": [
        {
          "MaxValue": "10",
          "MinValue": "1",
          "Name": "max_depth",
        },
        {
          "MaxValue": "4000",
          "MinValue": "100",
          "Name": "num_round",
        },
        {
          "MaxValue": "10",
          "MinValue": "0",
          "Name": "max_delta_step",
        },
      ]
    },
    "ResourceLimits": {
      "MaxNumberOfTrainingJobs": 6,
      "MaxParallelTrainingJobs": 2
    },
    "Strategy": "Bayesian",
    "HyperParameterTuningJobObjective": {
      "MetricName": "validation:merror",
      "Type": "Minimize"
    }
  }
xgboost-tuningjob-31-00-43-18
In [44]:
training_image = get_image_uri(boto3.Session().region_name, 'xgboost', repo_version='latest')
     
s3_input_train = 's3://{}/{}/train'.format(bucket, prefix_model)
s3_input_validation ='s3://{}/{}/valid'.format(bucket, prefix_model)

training_job_definition = {
    "AlgorithmSpecification": {
      "TrainingImage": training_image,
      "TrainingInputMode": "File"
    },
    "InputDataConfig": [
      {
        "ChannelName": "train",
        "CompressionType": "None",
        "ContentType": "csv",
        "DataSource": {
          "S3DataSource": {
            "S3DataDistributionType": "FullyReplicated",
            "S3DataType": "S3Prefix",
            "S3Uri": s3_input_train
          }
        }
      },
      {
        "ChannelName": "validation",
        "CompressionType": "None",
        "ContentType": "csv",
        "DataSource": {
          "S3DataSource": {
            "S3DataDistributionType": "FullyReplicated",
            "S3DataType": "S3Prefix",
            "S3Uri": s3_input_validation
          }
        }
      }
    ],
    "OutputDataConfig": {
      "S3OutputPath": "s3://{}/{}/output".format(bucket,prefix_tuning)
    },
    "ResourceConfig": {
      "InstanceCount": 1,
      "InstanceType": "ml.m4.xlarge",
      "VolumeSizeInGB": 10
    },
    "RoleArn": role,
    "StaticHyperParameters": {
      "eval_metric": "merror",
      "objective": "multi:softprob",
      "num_class" : '5',
      "seed" : "42"
    },
    "StoppingCondition": {
      "MaxRuntimeInSeconds": 43200
    }
}
The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
In [45]:
smclient.create_hyper_parameter_tuning_job(HyperParameterTuningJobName = tuning_job_name,
                                           HyperParameterTuningJobConfig = tuning_job_config,
                                           TrainingJobDefinition = training_job_definition)
Out[45]:
{'HyperParameterTuningJobArn': 'arn:aws:sagemaker:us-east-1:771330539858:hyper-parameter-tuning-job/xgboost-tuningjob-31-00-43-18',
 'ResponseMetadata': {'RequestId': '7f2ca6de-30d9-43ce-8a09-75ed50b7aba8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '7f2ca6de-30d9-43ce-8a09-75ed50b7aba8',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '130',
   'date': 'Wed, 31 Mar 2021 00:45:34 GMT'},
  'RetryAttempts': 0}}
In [47]:
smclient.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name)['HyperParameterTuningJobStatus']
Out[47]:
'InProgress'
In [22]:
container = get_image_uri(boto3.Session().region_name, 'xgboost', repo_version='latest')

s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket, prefix_model), content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/valid'.format(bucket, prefix_model), content_type='csv')
The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-22-1a0846a4636c> in <module>
      1 container = get_image_uri(boto3.Session().region_name, 'xgboost', repo_version='latest')
      2 
----> 3 s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket, prefix_model), content_type='csv')
      4 s3_input_validation = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/valid'.format(bucket, prefix_model), content_type='csv')

NameError: name 'prefix_model' is not defined
In [38]:
xgb = sagemaker.estimator.Estimator(container,
                                    role, 
                                    train_instance_count=1, 
                                    train_instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, prefix_model),
                                    sagemaker_session=sagemaker_session)
#after run the hiperparameter tunning I catch the best model from AWS Sagemaker View

xgb.set_hyperparameters(max_depth        = 6,
                        alpha            = 1.3590152328600276,
                        max_delta_step   = 6,
                        min_child_weight = 1.8323029919604323,
                        subsample        = 0.9891388904541443,
                        eta              = 0.4258935309521509,
                        gamma            = 0.3573669255797674,
                        num_round        = 27,#194
                        seed             = 666,
                        silent           = 0,
                        num_class        = 5,
                        objective        = 'multi:softprob',
                        eval_metric      = 'merror')

xgb.fit({'train': s3_input_train, 
         'validation': s3_input_validation}) 
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
2021-03-30 01:05:09 Starting - Starting the training job...
2021-03-30 01:05:22 Starting - Launching requested ML instancesProfilerReport-1617066308: InProgress
......
2021-03-30 01:06:35 Starting - Preparing the instances for training.........
2021-03-30 01:08:04 Downloading - Downloading input data...
2021-03-30 01:08:33 Training - Training image download completed. Training in progress.Arguments: train
[2021-03-30:01:08:34:INFO] Running standalone xgboost training.
[2021-03-30:01:08:34:INFO] File size need to be processed in the node: 7.0mb. Available memory size in the node: 8426.58mb
[2021-03-30:01:08:34:INFO] Determined delimiter of CSV input is ','
[01:08:34] S3DistributionType set as FullyReplicated
[01:08:34] 85853x14 matrix with 1201942 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,
[2021-03-30:01:08:34:INFO] Determined delimiter of CSV input is ','
[01:08:34] S3DistributionType set as FullyReplicated
[01:08:34] 36795x14 matrix with 515130 entries loaded from /opt/ml/input/data/validation?format=csv&label_column=0&delimiter=,
[01:08:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 108 extra nodes, 4 pruned nodes, max_depth=6
[01:08:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 0 pruned nodes, max_depth=6
[01:08:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 72 extra nodes, 4 pruned nodes, max_depth=6
[01:08:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 112 extra nodes, 2 pruned nodes, max_depth=6
[01:08:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 80 extra nodes, 0 pruned nodes, max_depth=6
[0]#011train-merror:0.29667#011validation-merror:0.294632
[01:08:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 114 extra nodes, 2 pruned nodes, max_depth=6
[01:08:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 4 pruned nodes, max_depth=6
[01:08:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 76 extra nodes, 0 pruned nodes, max_depth=6
[01:08:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 118 extra nodes, 6 pruned nodes, max_depth=6
[01:08:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 84 extra nodes, 0 pruned nodes, max_depth=6
[1]#011train-merror:0.290823#011validation-merror:0.291127
[01:08:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 108 extra nodes, 8 pruned nodes, max_depth=6
[01:08:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=6
[01:08:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 64 extra nodes, 6 pruned nodes, max_depth=6
[01:08:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 118 extra nodes, 0 pruned nodes, max_depth=6
[01:08:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=6
[2]#011train-merror:0.285779#011validation-merror:0.286805
[01:08:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 112 extra nodes, 10 pruned nodes, max_depth=6
[01:08:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 2 pruned nodes, max_depth=6
[01:08:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 68 extra nodes, 2 pruned nodes, max_depth=6
[01:08:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 120 extra nodes, 0 pruned nodes, max_depth=6
[01:08:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=6
[3]#011train-merror:0.28451#011validation-merror:0.286153
[01:08:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 106 extra nodes, 14 pruned nodes, max_depth=6
[01:08:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 2 pruned nodes, max_depth=6
[01:08:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 76 extra nodes, 0 pruned nodes, max_depth=6
[01:08:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 108 extra nodes, 6 pruned nodes, max_depth=6
[01:08:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=6
[4]#011train-merror:0.282693#011validation-merror:0.285881
[01:08:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 104 extra nodes, 12 pruned nodes, max_depth=6
[01:08:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 6 pruned nodes, max_depth=6
[01:08:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 4 pruned nodes, max_depth=6
[01:08:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 114 extra nodes, 2 pruned nodes, max_depth=6
[01:08:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=6
[5]#011train-merror:0.280142#011validation-merror:0.28387
[01:08:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 90 extra nodes, 10 pruned nodes, max_depth=6
[01:08:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=6
[01:08:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=6
[01:08:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 116 extra nodes, 2 pruned nodes, max_depth=6
[01:08:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 2 pruned nodes, max_depth=6
[6]#011train-merror:0.278744#011validation-merror:0.281451
[01:08:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 92 extra nodes, 4 pruned nodes, max_depth=6
[01:08:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 64 extra nodes, 0 pruned nodes, max_depth=6
[01:08:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=6
[01:08:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 112 extra nodes, 4 pruned nodes, max_depth=6
[01:08:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 4 pruned nodes, max_depth=6
[7]#011train-merror:0.277043#011validation-merror:0.27944
[01:08:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 124 extra nodes, 0 pruned nodes, max_depth=6
[01:08:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 0 pruned nodes, max_depth=6
[01:08:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 0 pruned nodes, max_depth=6
[01:08:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 104 extra nodes, 6 pruned nodes, max_depth=6
[01:08:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 8 pruned nodes, max_depth=6
[8]#011train-merror:0.27405#011validation-merror:0.278706
[01:08:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 84 extra nodes, 0 pruned nodes, max_depth=6
[01:08:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=6
[01:08:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 4 pruned nodes, max_depth=6
[01:08:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 94 extra nodes, 14 pruned nodes, max_depth=6
[01:08:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 6 pruned nodes, max_depth=6
[9]#011train-merror:0.271184#011validation-merror:0.275391
[01:08:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 90 extra nodes, 4 pruned nodes, max_depth=6
[01:08:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 64 extra nodes, 0 pruned nodes, max_depth=6
[01:08:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 6 pruned nodes, max_depth=6
[01:08:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 86 extra nodes, 16 pruned nodes, max_depth=6
[01:08:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=6
[10]#011train-merror:0.26974#011validation-merror:0.274494
[01:08:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 102 extra nodes, 4 pruned nodes, max_depth=6
[01:08:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 64 extra nodes, 0 pruned nodes, max_depth=6
[01:08:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 2 pruned nodes, max_depth=6
[01:08:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 124 extra nodes, 2 pruned nodes, max_depth=6
[01:08:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 6 pruned nodes, max_depth=6
[11]#011train-merror:0.267061#011validation-merror:0.27164
[01:08:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 66 extra nodes, 4 pruned nodes, max_depth=6
[01:08:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 4 pruned nodes, max_depth=6
[01:08:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 4 pruned nodes, max_depth=6
[01:08:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 98 extra nodes, 0 pruned nodes, max_depth=6
[01:08:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 64 extra nodes, 0 pruned nodes, max_depth=6
[12]#011train-merror:0.266153#011validation-merror:0.271015
[01:08:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 14 pruned nodes, max_depth=6
[01:08:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 2 pruned nodes, max_depth=6
[01:08:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 0 pruned nodes, max_depth=6
[01:08:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 80 extra nodes, 2 pruned nodes, max_depth=6
[01:08:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 2 pruned nodes, max_depth=6
[13]#011train-merror:0.26486#011validation-merror:0.270037
[01:08:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 12 pruned nodes, max_depth=6
[01:08:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 64 extra nodes, 0 pruned nodes, max_depth=6
[01:08:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 4 pruned nodes, max_depth=6
[01:08:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 116 extra nodes, 0 pruned nodes, max_depth=6
[01:08:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=6
[14]#011train-merror:0.263474#011validation-merror:0.26895
[01:08:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 118 extra nodes, 2 pruned nodes, max_depth=6
[01:08:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 2 pruned nodes, max_depth=6
[01:08:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 4 pruned nodes, max_depth=6
[01:08:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 104 extra nodes, 12 pruned nodes, max_depth=6
[01:08:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 6 pruned nodes, max_depth=6
[15]#011train-merror:0.261598#011validation-merror:0.26808
[01:08:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 100 extra nodes, 8 pruned nodes, max_depth=6
[01:08:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=6
[01:08:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 6 pruned nodes, max_depth=6
[01:08:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 100 extra nodes, 6 pruned nodes, max_depth=6
[01:08:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 6 pruned nodes, max_depth=6
[16]#011train-merror:0.260562#011validation-merror:0.267754
[01:08:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 102 extra nodes, 14 pruned nodes, max_depth=6
[01:08:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 4 pruned nodes, max_depth=6
[01:08:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 4 pruned nodes, max_depth=6
[01:08:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 78 extra nodes, 2 pruned nodes, max_depth=6
[01:08:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 2 pruned nodes, max_depth=6
[17]#011train-merror:0.259187#011validation-merror:0.267047
[01:08:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 80 extra nodes, 10 pruned nodes, max_depth=6
[01:08:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=6
[01:08:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 6 pruned nodes, max_depth=6
[01:08:40] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 96 extra nodes, 8 pruned nodes, max_depth=6
[01:08:40] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 64 extra nodes, 0 pruned nodes, max_depth=6
[18]#011train-merror:0.258477#011validation-merror:0.266422
[01:08:40] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 74 extra nodes, 10 pruned nodes, max_depth=6
[01:08:40] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 2 pruned nodes, max_depth=6
[01:08:40] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 2 pruned nodes, max_depth=6
[01:08:40] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 80 extra nodes, 0 pruned nodes, max_depth=6
[01:08:40] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 4 pruned nodes, max_depth=6
[19]#011train-merror:0.256566#011validation-merror:0.26558
[01:08:40] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 76 extra nodes, 4 pruned nodes, max_depth=6
[01:08:40] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 4 pruned nodes, max_depth=6
[01:08:40] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 2 pruned nodes, max_depth=6
[01:08:40] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 122 extra nodes, 2 pruned nodes, max_depth=6
[01:08:40] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 6 pruned nodes, max_depth=6
[20]#011train-merror:0.255052#011validation-merror:0.263541
[01:08:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 114 extra nodes, 4 pruned nodes, max_depth=6
[01:08:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 2 pruned nodes, max_depth=6
[01:08:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 8 pruned nodes, max_depth=6
[01:08:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 64 extra nodes, 2 pruned nodes, max_depth=6
[01:08:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 4 pruned nodes, max_depth=6
[21]#011train-merror:0.254109#011validation-merror:0.262536
[01:08:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 12 pruned nodes, max_depth=6
[01:08:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 2 pruned nodes, max_depth=6
[01:08:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 42 extra nodes, 10 pruned nodes, max_depth=6
[01:08:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 72 extra nodes, 14 pruned nodes, max_depth=6
[01:08:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 2 pruned nodes, max_depth=6
[22]#011train-merror:0.253655#011validation-merror:0.26221
[01:08:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 100 extra nodes, 4 pruned nodes, max_depth=6
[01:08:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=6
[01:08:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 6 pruned nodes, max_depth=6
[01:08:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 88 extra nodes, 6 pruned nodes, max_depth=6
[01:08:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 4 pruned nodes, max_depth=6
[23]#011train-merror:0.25256#011validation-merror:0.260987
[01:08:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 122 extra nodes, 2 pruned nodes, max_depth=6
[01:08:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 4 pruned nodes, max_depth=6
[01:08:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 4 pruned nodes, max_depth=6
[01:08:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 84 extra nodes, 2 pruned nodes, max_depth=6
[01:08:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 2 pruned nodes, max_depth=6
[24]#011train-merror:0.25108#011validation-merror:0.260117
[01:08:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 102 extra nodes, 4 pruned nodes, max_depth=6
[01:08:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 2 pruned nodes, max_depth=6
[01:08:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 2 pruned nodes, max_depth=6
[01:08:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 86 extra nodes, 0 pruned nodes, max_depth=6
[01:08:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 34 extra nodes, 0 pruned nodes, max_depth=6
[25]#011train-merror:0.250475#011validation-merror:0.259437
[01:08:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 100 extra nodes, 2 pruned nodes, max_depth=6
[01:08:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 102 extra nodes, 12 pruned nodes, max_depth=6
[01:08:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 76 extra nodes, 14 pruned nodes, max_depth=6
[01:08:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 8 pruned nodes, max_depth=6
[01:08:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 122 extra nodes, 2 pruned nodes, max_depth=6
[26]#011train-merror:0.248914#011validation-merror:0.258079

2021-03-30 01:09:04 Uploading - Uploading generated training model
2021-03-30 01:09:04 Completed - Training job completed
Training seconds: 69
Billable seconds: 69

PREDICTION AND RESULTS

BATCH TRANSFORM

To analyse if the model is good I use an endpoint on batch transform mode in the three datasets:

1) training dataset (endpoint batch transform mode - after put the information of prediction on local directory)

2) validation dataset (endpoint batch transform mode - after put the information of prediction on local directory)

3) test dataset (endpoint batch transform mode - after put the information of prediction on local directory)

In [39]:
xgb_transformer = xgb.transformer(instance_count = 1, 
                                  instance_type = 'ml.m4.xlarge')
xgb_transformer.transform(train_X_loc, 
                          content_type='text/csv', 
                          split_type='Line')
xgb_transformer.wait()
...............................Arguments: serve
[2021-03-30 01:14:36 +0000] [1] [INFO] Starting gunicorn 19.9.0
[2021-03-30 01:14:36 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)
[2021-03-30 01:14:36 +0000] [1] [INFO] Using worker: gevent
[2021-03-30 01:14:36 +0000] [20] [INFO] Booting worker with pid: 20
[2021-03-30 01:14:36 +0000] [21] [INFO] Booting worker with pid: 21
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:14:36:INFO] Model loaded successfully for worker : 21
[2021-03-30:01:14:36:INFO] Model loaded successfully for worker : 20
[2021-03-30 01:14:36 +0000] [22] [INFO] Booting worker with pid: 22
[2021-03-30 01:14:36 +0000] [23] [INFO] Booting worker with pid: 23
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:14:36:INFO] Model loaded successfully for worker : 22
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:14:36:INFO] Model loaded successfully for worker : 23
[2021-03-30:01:14:41:INFO] Sniff delimiter as ','
[2021-03-30:01:14:41:INFO] Determined delimiter of CSV input is ','
2021-03-30T01:14:40.905:[sagemaker logs]: MaxConcurrentTransforms=4, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD

Arguments: serve
Arguments: serve
[2021-03-30 01:14:36 +0000] [1] [INFO] Starting gunicorn 19.9.0
[2021-03-30 01:14:36 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)
[2021-03-30 01:14:36 +0000] [1] [INFO] Using worker: gevent
[2021-03-30 01:14:36 +0000] [20] [INFO] Booting worker with pid: 20
[2021-03-30 01:14:36 +0000] [21] [INFO] Booting worker with pid: 21
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:14:36:INFO] Model loaded successfully for worker : 21
[2021-03-30:01:14:36:INFO] Model loaded successfully for worker : 20
[2021-03-30 01:14:36 +0000] [22] [INFO] Booting worker with pid: 22
[2021-03-30 01:14:36 +0000] [23] [INFO] Booting worker with pid: 23
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:14:36:INFO] Model loaded successfully for worker : 22
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30 01:14:36 +0000] [1] [INFO] Starting gunicorn 19.9.0
[2021-03-30 01:14:36 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)
[2021-03-30 01:14:36 +0000] [1] [INFO] Using worker: gevent
[2021-03-30 01:14:36 +0000] [20] [INFO] Booting worker with pid: 20
[2021-03-30 01:14:36 +0000] [21] [INFO] Booting worker with pid: 21
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:14:36:INFO] Model loaded successfully for worker : 21
[2021-03-30:01:14:36:INFO] Model loaded successfully for worker : 20
[2021-03-30 01:14:36 +0000] [22] [INFO] Booting worker with pid: 22
[2021-03-30 01:14:36 +0000] [23] [INFO] Booting worker with pid: 23
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:14:36:INFO] Model loaded successfully for worker : 22
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:14:36:INFO] Model loaded successfully for worker : 23
[2021-03-30:01:14:36:INFO] Model loaded successfully for worker : 23
[2021-03-30:01:14:41:INFO] Sniff delimiter as ','
[2021-03-30:01:14:41:INFO] Determined delimiter of CSV input is ','
[2021-03-30:01:14:41:INFO] Sniff delimiter as ','
[2021-03-30:01:14:41:INFO] Determined delimiter of CSV input is ','
2021-03-30T01:14:40.905:[sagemaker logs]: MaxConcurrentTransforms=4, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD
In [40]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_directory
download: s3://sagemaker-us-east-1-771330539858/xgboost-2021-03-30-01-09-33-523/train_X.csv.out to histo_detection/train_X.csv.out
In [41]:
xgb_transformer.transform(valid_X_loc, 
                          content_type='text/csv', 
                          split_type='Line')
xgb_transformer.wait()
.............................Arguments: serve
[2021-03-30 01:21:24 +0000] [1] [INFO] Starting gunicorn 19.9.0
[2021-03-30 01:21:24 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)
[2021-03-30 01:21:24 +0000] [1] [INFO] Using worker: gevent
[2021-03-30 01:21:24 +0000] [20] [INFO] Booting worker with pid: 20
[2021-03-30 01:21:24 +0000] [21] [INFO] Booting worker with pid: 21
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)', 'requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:21:24:INFO] Model loaded successfully for worker : 20
[2021-03-30 01:21:24 +0000] [22] [INFO] Booting worker with pid: 22
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)', 'requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:21:24:INFO] Model loaded successfully for worker : 21
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)', 'requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:21:24:INFO] Model loaded successfully for worker : 22
[2021-03-30 01:21:24 +0000] [23] [INFO] Booting worker with pid: 23
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)', 'requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:21:24:INFO] Model loaded successfully for worker : 23
[2021-03-30:01:21:29:INFO] Sniff delimiter as ','
[2021-03-30:01:21:29:INFO] Determined delimiter of CSV input is ','

2021-03-30T01:21:28.617:[sagemaker logs]: MaxConcurrentTransforms=4, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD
Arguments: serve
Arguments: serve
[2021-03-30 01:21:24 +0000] [1] [INFO] Starting gunicorn 19.9.0
[2021-03-30 01:21:24 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)
[2021-03-30 01:21:24 +0000] [1] [INFO] Using worker: gevent
[2021-03-30 01:21:24 +0000] [1] [INFO] Starting gunicorn 19.9.0
[2021-03-30 01:21:24 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)
[2021-03-30 01:21:24 +0000] [1] [INFO] Using worker: gevent
[2021-03-30 01:21:24 +0000] [20] [INFO] Booting worker with pid: 20
[2021-03-30 01:21:24 +0000] [21] [INFO] Booting worker with pid: 21
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)', 'requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:21:24:INFO] Model loaded successfully for worker : 20
[2021-03-30 01:21:24 +0000] [22] [INFO] Booting worker with pid: 22
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)', 'requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:21:24:INFO] Model loaded successfully for worker : 21
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)', 'requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:21:24:INFO] Model loaded successfully for worker : 22
[2021-03-30 01:21:24 +0000] [23] [INFO] Booting worker with pid: 23
[2021-03-30 01:21:24 +0000] [20] [INFO] Booting worker with pid: 20
[2021-03-30 01:21:24 +0000] [21] [INFO] Booting worker with pid: 21
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)', 'requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:21:24:INFO] Model loaded successfully for worker : 20
[2021-03-30 01:21:24 +0000] [22] [INFO] Booting worker with pid: 22
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)', 'requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:21:24:INFO] Model loaded successfully for worker : 21
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)', 'requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:21:24:INFO] Model loaded successfully for worker : 22
[2021-03-30 01:21:24 +0000] [23] [INFO] Booting worker with pid: 23
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)', 'requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:21:24:INFO] Model loaded successfully for worker : 23
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)', 'requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:21:24:INFO] Model loaded successfully for worker : 23
[2021-03-30:01:21:29:INFO] Sniff delimiter as ','
[2021-03-30:01:21:29:INFO] Sniff delimiter as ','
[2021-03-30:01:21:29:INFO] Determined delimiter of CSV input is ','
[2021-03-30:01:21:29:INFO] Determined delimiter of CSV input is ','
2021-03-30T01:21:28.617:[sagemaker logs]: MaxConcurrentTransforms=4, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD
In [42]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_directory
download: s3://sagemaker-us-east-1-771330539858/xgboost-2021-03-30-01-16-45-584/val_X.csv.out to histo_detection/val_X.csv.out
In [43]:
xgb_transformer.transform(test_X_loc, 
                          content_type='text/csv', 
                          split_type='Line')
xgb_transformer.wait()
..............................
Arguments: serve
[2021-03-30 01:27:03 +0000] [1] [INFO] Starting gunicorn 19.9.0
[2021-03-30 01:27:03 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)
[2021-03-30 01:27:03 +0000] [1] [INFO] Using worker: gevent
[2021-03-30 01:27:03 +0000] [21] [INFO] Booting worker with pid: 21
[2021-03-30 01:27:04 +0000] [22] [INFO] Booting worker with pid: 22
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:27:04:INFO] Model loaded successfully for worker : 21
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:27:04:INFO] Model loaded successfully for worker : 22
[2021-03-30 01:27:04 +0000] [23] [INFO] Booting worker with pid: 23
[2021-03-30 01:27:04 +0000] [24] [INFO] Booting worker with pid: 24
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:27:04:INFO] Model loaded successfully for worker : 23
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:27:04:INFO] Model loaded successfully for worker : 24
[2021-03-30:01:27:08:INFO] Sniff delimiter as ','
[2021-03-30:01:27:08:INFO] Determined delimiter of CSV input is ','
2021-03-30T01:27:08.151:[sagemaker logs]: MaxConcurrentTransforms=4, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD
Arguments: serve
Arguments: serve
[2021-03-30 01:27:03 +0000] [1] [INFO] Starting gunicorn 19.9.0
[2021-03-30 01:27:03 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)
[2021-03-30 01:27:03 +0000] [1] [INFO] Using worker: gevent
[2021-03-30 01:27:03 +0000] [1] [INFO] Starting gunicorn 19.9.0
[2021-03-30 01:27:03 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)
[2021-03-30 01:27:03 +0000] [1] [INFO] Using worker: gevent
[2021-03-30 01:27:03 +0000] [21] [INFO] Booting worker with pid: 21
[2021-03-30 01:27:04 +0000] [22] [INFO] Booting worker with pid: 22
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:27:04:INFO] Model loaded successfully for worker : 21
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:27:04:INFO] Model loaded successfully for worker : 22
[2021-03-30 01:27:04 +0000] [23] [INFO] Booting worker with pid: 23
[2021-03-30 01:27:04 +0000] [24] [INFO] Booting worker with pid: 24
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:27:04:INFO] Model loaded successfully for worker : 23
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:27:04:INFO] Model loaded successfully for worker : 24
[2021-03-30 01:27:03 +0000] [21] [INFO] Booting worker with pid: 21
[2021-03-30 01:27:04 +0000] [22] [INFO] Booting worker with pid: 22
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:27:04:INFO] Model loaded successfully for worker : 21
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:27:04:INFO] Model loaded successfully for worker : 22
[2021-03-30 01:27:04 +0000] [23] [INFO] Booting worker with pid: 23
[2021-03-30 01:27:04 +0000] [24] [INFO] Booting worker with pid: 24
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:27:04:INFO] Model loaded successfully for worker : 23
/opt/amazon/lib/python3.7/site-packages/gunicorn/workers/ggevent.py:65: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['requests.packages.urllib3.util (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/__init__.py)', 'requests.packages.urllib3.util.ssl_ (/opt/amazon/lib/python3.7/site-packages/requests/packages/urllib3/util/ssl_.py)']. 
  monkey.patch_all(subprocess=True)
[2021-03-30:01:27:04:INFO] Model loaded successfully for worker : 24
[2021-03-30:01:27:08:INFO] Sniff delimiter as ','
[2021-03-30:01:27:08:INFO] Determined delimiter of CSV input is ','
[2021-03-30:01:27:08:INFO] Sniff delimiter as ','
[2021-03-30:01:27:08:INFO] Determined delimiter of CSV input is ','
2021-03-30T01:27:08.151:[sagemaker logs]: MaxConcurrentTransforms=4, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD
In [44]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_directory
download: s3://sagemaker-us-east-1-771330539858/xgboost-2021-03-30-01-22-14-909/test_X.csv.out to histo_detection/test_X.csv.out

FINAL PREDICTIONS

In the cell above I make the predictions and add a column for the real class

In [29]:
pred_train         = pd.DataFrame(pd.read_csv(os.path.join(data_directory, 'train_X.csv.out'), header=None))
pred_train[0]      = pred_train[0].map(lambda x: str(x).replace('[',''))
pred_train[4]      = pred_train[4].map(lambda x: str(x).replace(']',''))

pred_val           = pd.DataFrame(pd.read_csv(os.path.join(data_directory, 'val_X.csv.out'), header=None))
pred_val[0]        = pred_val[0].map(lambda x: str(x).replace('[',''))
pred_val[4]        = pred_val[4].map(lambda x: str(x).replace(']',''))


pred_test          = pd.DataFrame(pd.read_csv(os.path.join(data_directory, 'test_X.csv.out'), header=None))
pred_test[0]       = pred_test[0].map(lambda x: str(x).replace('[',''))
pred_test[4]       = pred_test[4].map(lambda x: str(x).replace(']',''))
In [30]:
pred_train              = pred_train.astype(float)
pred_val                = pred_val.astype(float)
pred_test               = pred_test.astype(float)

pred_train['Max']       = pred_train.idxmax(axis = 1)
pred_train.drop(columns = {0, 1, 2, 3, 4}, inplace = True)

pred_val['Max']         = pred_val.idxmax(axis = 1)
pred_val.drop(columns   = {0, 1, 2, 3, 4}, inplace = True)

pred_test['Max']        = pred_test.idxmax(axis = 1)
pred_test.drop(columns  = {0, 1, 2, 3, 4}, inplace = True)
In [31]:
pred_train['True'] = pd.DataFrame(pd.read_csv(os.path.join(data_directory, 
                                                           'train.csv'), 
                                              header=None)[0]).rename(columns = {0 : 'True'})
pred_val['True']   = pd.DataFrame(pd.read_csv(os.path.join(data_directory, 
                                                         'valid.csv'), 
                                            header=None)[0]).rename(columns = {0 : 'True'})
pred_test['True']  = pd.DataFrame(pd.read_csv(os.path.join(data_directory,
                                                          'test_Y.csv'), header = None))
In [32]:
confusion_matrix(pred_train['True'], pred_train['Max'])
Out[32]:
array([[ 9759,   783,   351, 17898,   979],
       [ 1302,    86,    56,  2306,   110],
       [  710,    62,    37,  1465,    91],
       [14942,  1184,   525, 27626,  1511],
       [ 1328,   106,    39,  2454,   143]])
In [33]:
confusion_matrix(pred_val['True'], pred_val['Max'])
Out[33]:
array([[ 4156,   306,   171,  7694,   436],
       [  521,    46,    23,   967,    52],
       [  348,    33,    13,   627,    42],
       [ 6525,   466,   234, 11748,   618],
       [  573,    50,    19,  1073,    54]])
In [34]:
confusion_matrix(pred_test['True'], pred_test['Max'])
Out[34]:
array([[2607,   60,    7, 4856,   57],
       [  97,    4,    0,  205,    3],
       [  40,    0,    0,   73,    1],
       [3971,  102,   16, 7261,   81],
       [  55,    0,    0,   93,    1]])
In [35]:
table_metrics = pd.DataFrame(columns = {'Data',
                                        'Acc'})
table_metrics.at[0, 'Data'] = 'Train'
table_metrics.at[0, 'Acc']  = round(accuracy_score(pred_train['True'], pred_train['Max'])*100,2)

table_metrics.at[1, 'Data'] = 'Valid'
table_metrics.at[1, 'Acc']  = round(accuracy_score(pred_val['True'], pred_val['Max'])*100,2)

table_metrics.at[2, 'Data'] = 'Test'
table_metrics.at[2, 'Acc']  = round(accuracy_score(pred_test['True'], pred_test['Max'])*100,2)
table_metrics
Out[35]:
Acc Data
0 43.86 Train
1 43.53 Valid
2 50.4 Test
In [36]:
pred_train.rename(columns = {'Max'  : 'Laudo Modelo',
                             'True' : 'Laudo Real'},
                 inplace  = True)
pred_val.rename(columns   = {'Max'  : 'Laudo Modelo',
                             'True' : 'Laudo Real'},
                 inplace  = True)
pred_test.rename(columns  = {'Max'  : 'Laudo Modelo',
                             'True' : 'Laudo Real'},
                 inplace  = True)
In [37]:
pred_train.to_csv('base_treino_predicao.csv',
                  sep = ';')
pred_val.to_csv('base_valid_predicao.csv',
                sep = ';')
pred_test.to_csv('base_teste_predicao.csv',
                 sep = ';')

MÉTRICAS

TABELAS COM MÉTRICAS

1) KS

2) ROC

3) F1

4) Recall

5) Precision

6) Accurácia

In [40]:
pred_train         = pd.DataFrame(pd.read_csv(os.path.join(data_directory, 'train_X.csv.out'), header=None))
pred_train[0]      = pred_train[0].map(lambda x: str(x).replace('[',''))
pred_train[4]      = pred_train[4].map(lambda x: str(x).replace(']',''))

pred_val           = pd.DataFrame(pd.read_csv(os.path.join(data_directory, 'val_X.csv.out'), header=None))
pred_val[0]        = pred_val[0].map(lambda x: str(x).replace('[',''))
pred_val[4]        = pred_val[4].map(lambda x: str(x).replace(']',''))


pred_test          = pd.DataFrame(pd.read_csv(os.path.join(data_directory, 'test_X.csv.out'), header=None))
pred_test[0]       = pred_test[0].map(lambda x: str(x).replace('[',''))
pred_test[4]       = pred_test[4].map(lambda x: str(x).replace(']',''))

pred_train['True'] = pd.DataFrame(pd.read_csv(os.path.join(data_directory, 
                                                           'train.csv'), 
                                              header=None)[0]).rename(columns = {0 : 'True'})
pred_val['True']   = pd.DataFrame(pd.read_csv(os.path.join(data_directory, 
                                                         'valid.csv'), 
                                            header=None)[0]).rename(columns = {0 : 'True'})
pred_test['True']  = pd.DataFrame(pd.read_csv(os.path.join(data_directory,
                                                          'test_Y.csv'), header = None))
pred_train.head(5)
Out[40]:
0 1 2 3 4 True
0 0.44944697618484497 0.003712 0.005239 0.526942 0.014658913016319275 3
1 0.11719241738319397 0.007295 0.028363 0.837371 0.009779603220522404 0
2 0.7194904685020447 0.000478 0.000439 0.279143 0.000448645994765684 0
3 0.7211182117462158 0.000424 0.000435 0.277550 0.00047271294170059264 3
4 0.7815982103347778 0.004299 0.008939 0.186499 0.018665609881281853 1
In [ ]:
pred_train['True 0'] = np.where(pred_train['True'] == 0, 1, 0).astype(int)
pred_train['True 1'] = np.where(pred_train['True'] == 1, 1, 0).astype(int)
pred_train['True 2'] = np.where(pred_train['True'] == 2, 1, 0).astype(int)
pred_train['True 3'] = np.where(pred_train['True'] == 3, 1, 0).astype(int)
pred_train['True 4'] = np.where(pred_train['True'] == 4, 1, 0).astype(int)
pred_train[0]        = pred_train[0].astype(float)
pred_train[1]        = pred_train[1].astype(float)
pred_train[2]        = pred_train[2].astype(float)
pred_train[3]        = pred_train[3].astype(float)
pred_train[4]        = pred_train[4].astype(float)
pred_train.drop(columns = 'True', inplace = True)
pred_train.rename(columns = {0 : 'Mod 0',
                             1 : 'Mod 1',
                             2 : 'Mod 2',
                             3 : 'Mod 3',
                             4 : 'Mod 4'},
                 inplace = True)

pred_val['True 0'] = np.where(pred_val['True'] == 0, 1, 0).astype(int)
pred_val['True 1'] = np.where(pred_val['True'] == 1, 1, 0).astype(int)
pred_val['True 2'] = np.where(pred_val['True'] == 2, 1, 0).astype(int)
pred_val['True 3'] = np.where(pred_val['True'] == 3, 1, 0).astype(int)
pred_val['True 4'] = np.where(pred_val['True'] == 4, 1, 0).astype(int)
pred_val[0]        = pred_val[0].astype(float)
pred_val[1]        = pred_val[1].astype(float)
pred_val[2]        = pred_val[2].astype(float)
pred_val[3]        = pred_val[3].astype(float)
pred_val[4]        = pred_val[4].astype(float)
pred_val.drop(columns = 'True', inplace = True)
pred_val.rename(columns = {0 : 'Mod 0',
                             1 : 'Mod 1',
                             2 : 'Mod 2',
                             3 : 'Mod 3',
                             4 : 'Mod 4'},
                 inplace = True)

pred_test['True 0'] = np.where(pred_test['True'] == 0, 1, 0).astype(int)
pred_test['True 1'] = np.where(pred_test['True'] == 1, 1, 0).astype(int)
pred_test['True 2'] = np.where(pred_test['True'] == 2, 1, 0).astype(int)
pred_test['True 3'] = np.where(pred_test['True'] == 3, 1, 0).astype(int)
pred_test['True 4'] = np.where(pred_test['True'] == 4, 1, 0).astype(int)
pred_test[0]        = pred_test[0].astype(float)
pred_test[1]        = pred_test[1].astype(float)
pred_test[2]        = pred_test[2].astype(float)
pred_test[3]        = pred_test[3].astype(float)
pred_test[4]        = pred_test[4].astype(float)
pred_test.drop(columns = 'True', inplace = True)
pred_test.rename(columns = {0 : 'Mod 0',
                             1 : 'Mod 1',
                             2 : 'Mod 2',
                             3 : 'Mod 3',
                             4 : 'Mod 4'},
                 inplace = True)
In [56]:
metricas_train = pd.DataFrame(columns = {'Data',
                                         'ROC',
                                         'Rec',
                                         'Prec',
                                         'Acc',
                                         'KS',
                                         'F1'})
metricas_train.at[0, 'Data'] = 'Treino - Faixa 1'
metricas_train.at[0, 'ROC']  = round(roc_auc_score(np.asarray(pred_train['True 0']), np.asarray(pred_train['Mod 0']))*100, 2)
metricas_train.at[0, 'KS']   = round(ks_2samp(pred_train.loc[pred_train['True 0'] == 0, 'Mod 0'], pred_train.loc[pred_train['True 0'] == 1, 'Mod 0'])[0]*100, 2)
metricas_train.at[0, 'F1']   = round(f1_score(np.asarray(pred_train['True 0']), np.asarray(pred_train['Mod 0'].round()), average = 'binary')*100,2)
metricas_train.at[0, 'Rec']  = round(recall_score(np.asarray(pred_train['True 0']), np.asarray(pred_train['Mod 0'].round()))*100,2)
metricas_train.at[0, 'Prec'] = round(precision_score(np.asarray(pred_train['True 0']), np.asarray(pred_train['Mod 0'].round()))*100,2)
metricas_train.at[0, 'Acc']  = round(accuracy_score(np.asarray(pred_train['True 0']), np.asarray(pred_train['Mod 0'].round()), normalize=True)*100,2)

metricas_train.at[1, 'Data'] = 'Treino - Faixa 2'
metricas_train.at[1, 'ROC']  = round(roc_auc_score(np.asarray(pred_train['True 1']), np.asarray(pred_train['Mod 1']))*100, 2)
metricas_train.at[1, 'KS']   = round(ks_2samp(pred_train.loc[pred_train['True 1'] == 0, 'Mod 1'], pred_train.loc[pred_train['True 1'] == 1, 'Mod 1'])[0]*100, 2)
metricas_train.at[1, 'F1']   = round(f1_score(np.asarray(pred_train['True 1']), np.asarray(pred_train['Mod 1'].round()), average = 'binary')*100,2)
metricas_train.at[1, 'Rec']  = round(recall_score(np.asarray(pred_train['True 1']), np.asarray(pred_train['Mod 1'].round()))*100,2)
metricas_train.at[1, 'Prec'] = round(precision_score(np.asarray(pred_train['True 1']), np.asarray(pred_train['Mod 1'].round()))*100,2)
metricas_train.at[1, 'Acc']  = round(accuracy_score(np.asarray(pred_train['True 1']), np.asarray(pred_train['Mod 1'].round()), normalize=True)*100,2)

metricas_train.at[2, 'Data'] = 'Treino - Faixa 3'
metricas_train.at[2, 'ROC']  = round(roc_auc_score(np.asarray(pred_train['True 2']), np.asarray(pred_train['Mod 2']))*100, 2)
metricas_train.at[2, 'KS']   = round(ks_2samp(pred_train.loc[pred_train['True 2'] == 0, 'Mod 2'], pred_train.loc[pred_train['True 2'] == 1, 'Mod 2'])[0]*100, 2)
metricas_train.at[2, 'F1']   = round(f1_score(np.asarray(pred_train['True 2']), np.asarray(pred_train['Mod 2'].round()), average = 'binary')*100,2)
metricas_train.at[2, 'Rec']  = round(recall_score(np.asarray(pred_train['True 2']), np.asarray(pred_train['Mod 2'].round()))*100,2)
metricas_train.at[2, 'Prec'] = round(precision_score(np.asarray(pred_train['True 2']), np.asarray(pred_train['Mod 2'].round()))*100,2)
metricas_train.at[2, 'Acc']  = round(accuracy_score(np.asarray(pred_train['True 2']), np.asarray(pred_train['Mod 2'].round()), normalize=True)*100,2)

metricas_train.at[3, 'Data'] = 'Treino - Faixa 4'
metricas_train.at[3, 'ROC']  = round(roc_auc_score(np.asarray(pred_train['True 3']), np.asarray(pred_train['Mod 3']))*100, 2)
metricas_train.at[3, 'KS']   = round(ks_2samp(pred_train.loc[pred_train['True 3'] == 0, 'Mod 3'], pred_train.loc[pred_train['True 3'] == 1, 'Mod 3'])[0]*100, 2)
metricas_train.at[3, 'F1']   = round(f1_score(np.asarray(pred_train['True 3']), np.asarray(pred_train['Mod 3'].round()), average = 'binary')*100,2)
metricas_train.at[3, 'Rec']  = round(recall_score(np.asarray(pred_train['True 3']), np.asarray(pred_train['Mod 3'].round()))*100,2)
metricas_train.at[3, 'Prec'] = round(precision_score(np.asarray(pred_train['True 3']), np.asarray(pred_train['Mod 3'].round()))*100,2)
metricas_train.at[3, 'Acc']  = round(accuracy_score(np.asarray(pred_train['True 3']), np.asarray(pred_train['Mod 3'].round()), normalize=True)*100,2)

metricas_train.at[4, 'Data'] = 'Treino - Faixa 6'
metricas_train.at[4, 'ROC']  = round(roc_auc_score(np.asarray(pred_train['True 4']), np.asarray(pred_train['Mod 4']))*100, 2)
metricas_train.at[4, 'KS']   = round(ks_2samp(pred_train.loc[pred_train['True 4'] == 0, 'Mod 4'], pred_train.loc[pred_train['True 4'] == 1, 'Mod 4'])[0]*100, 2)
metricas_train.at[4, 'F1']   = round(f1_score(np.asarray(pred_train['True 4']), np.asarray(pred_train['Mod 4'].round()), average = 'binary')*100,2)
metricas_train.at[4, 'Rec']  = round(recall_score(np.asarray(pred_train['True 4']), np.asarray(pred_train['Mod 4'].round()))*100,2)
metricas_train.at[4, 'Prec'] = round(precision_score(np.asarray(pred_train['True 4']), np.asarray(pred_train['Mod 4'].round()))*100,2)
metricas_train.at[4, 'Acc']  = round(accuracy_score(np.asarray(pred_train['True 4']), np.asarray(pred_train['Mod 4'].round()), normalize=True)*100,2)
In [57]:
metricas_valid = pd.DataFrame(columns = {'Data',
                                         'ROC',
                                         'Rec',
                                         'Prec',
                                         'Acc',
                                         'KS',
                                         'F1'})
metricas_valid.at[0, 'Data'] = 'Valid - Faixa 1'
metricas_valid.at[0, 'ROC']  = round(roc_auc_score(np.asarray(pred_val['True 0']), np.asarray(pred_val['Mod 0']))*100, 2)
metricas_valid.at[0, 'KS']   = round(ks_2samp(pred_val.loc[pred_val['True 0'] == 0, 'Mod 0'], pred_val.loc[pred_val['True 0'] == 1, 'Mod 0'])[0]*100, 2)
metricas_valid.at[0, 'F1']   = round(f1_score(np.asarray(pred_val['True 0']), np.asarray(pred_val['Mod 0'].round()), average = 'binary')*100,2)
metricas_valid.at[0, 'Rec']  = round(recall_score(np.asarray(pred_val['True 0']), np.asarray(pred_val['Mod 0'].round()))*100,2)
metricas_valid.at[0, 'Prec'] = round(precision_score(np.asarray(pred_val['True 0']), np.asarray(pred_val['Mod 0'].round()))*100,2)
metricas_valid.at[0, 'Acc']  = round(accuracy_score(np.asarray(pred_val['True 0']), np.asarray(pred_val['Mod 0'].round()), normalize=True)*100,2)

metricas_valid.at[1, 'Data'] = 'Valid - Faixa 2'
metricas_valid.at[1, 'ROC']  = round(roc_auc_score(np.asarray(pred_val['True 1']), np.asarray(pred_val['Mod 1']))*100, 2)
metricas_valid.at[1, 'KS']   = round(ks_2samp(pred_val.loc[pred_val['True 1'] == 0, 'Mod 1'], pred_val.loc[pred_val['True 1'] == 1, 'Mod 1'])[0]*100, 2)
metricas_valid.at[1, 'F1']   = round(f1_score(np.asarray(pred_val['True 1']), np.asarray(pred_val['Mod 1'].round()), average = 'binary')*100,2)
metricas_valid.at[1, 'Rec']  = round(recall_score(np.asarray(pred_val['True 1']), np.asarray(pred_val['Mod 1'].round()))*100,2)
metricas_valid.at[1, 'Prec'] = round(precision_score(np.asarray(pred_val['True 1']), np.asarray(pred_val['Mod 1'].round()))*100,2)
metricas_valid.at[1, 'Acc']  = round(accuracy_score(np.asarray(pred_val['True 1']), np.asarray(pred_val['Mod 1'].round()), normalize=True)*100,2)

metricas_valid.at[2, 'Data'] = 'Valid - Faixa 3'
metricas_valid.at[2, 'ROC']  = round(roc_auc_score(np.asarray(pred_val['True 2']), np.asarray(pred_val['Mod 2']))*100, 2)
metricas_valid.at[2, 'KS']   = round(ks_2samp(pred_val.loc[pred_val['True 2'] == 0, 'Mod 2'], pred_val.loc[pred_val['True 2'] == 1, 'Mod 2'])[0]*100, 2)
metricas_valid.at[2, 'F1']   = round(f1_score(np.asarray(pred_val['True 2']), np.asarray(pred_val['Mod 2'].round()), average = 'binary')*100,2)
metricas_valid.at[2, 'Rec']  = round(recall_score(np.asarray(pred_val['True 2']), np.asarray(pred_val['Mod 2'].round()))*100,2)
metricas_valid.at[2, 'Prec'] = round(precision_score(np.asarray(pred_val['True 2']), np.asarray(pred_val['Mod 2'].round()))*100,2)
metricas_valid.at[2, 'Acc']  = round(accuracy_score(np.asarray(pred_val['True 2']), np.asarray(pred_val['Mod 2'].round()), normalize=True)*100,2)

metricas_valid.at[3, 'Data'] = 'Valid - Faixa 4'
metricas_valid.at[3, 'ROC']  = round(roc_auc_score(np.asarray(pred_val['True 3']), np.asarray(pred_val['Mod 3']))*100, 2)
metricas_valid.at[3, 'KS']   = round(ks_2samp(pred_val.loc[pred_val['True 3'] == 0, 'Mod 3'], pred_val.loc[pred_val['True 3'] == 1, 'Mod 3'])[0]*100, 2)
metricas_valid.at[3, 'F1']   = round(f1_score(np.asarray(pred_val['True 3']), np.asarray(pred_val['Mod 3'].round()), average = 'binary')*100,2)
metricas_valid.at[3, 'Rec']  = round(recall_score(np.asarray(pred_val['True 3']), np.asarray(pred_val['Mod 3'].round()))*100,2)
metricas_valid.at[3, 'Prec'] = round(precision_score(np.asarray(pred_val['True 3']), np.asarray(pred_val['Mod 3'].round()))*100,2)
metricas_valid.at[3, 'Acc']  = round(accuracy_score(np.asarray(pred_val['True 3']), np.asarray(pred_val['Mod 3'].round()), normalize=True)*100,2)

metricas_valid.at[4, 'Data'] = 'Valid - Faixa 6'
metricas_valid.at[4, 'ROC']  = round(roc_auc_score(np.asarray(pred_val['True 4']), np.asarray(pred_val['Mod 4']))*100, 2)
metricas_valid.at[4, 'KS']   = round(ks_2samp(pred_val.loc[pred_val['True 4'] == 0, 'Mod 4'], pred_val.loc[pred_val['True 4'] == 1, 'Mod 4'])[0]*100, 2)
metricas_valid.at[4, 'F1']   = round(f1_score(np.asarray(pred_val['True 4']), np.asarray(pred_val['Mod 4'].round()), average = 'binary')*100,2)
metricas_valid.at[4, 'Rec']  = round(recall_score(np.asarray(pred_val['True 4']), np.asarray(pred_val['Mod 4'].round()))*100,2)
metricas_valid.at[4, 'Prec'] = round(precision_score(np.asarray(pred_val['True 4']), np.asarray(pred_val['Mod 4'].round()))*100,2)
metricas_valid.at[4, 'Acc']  = round(accuracy_score(np.asarray(pred_val['True 4']), np.asarray(pred_val['Mod 4'].round()), normalize=True)*100,2)
In [58]:
metricas_test = pd.DataFrame(columns = {'Data',
                                         'ROC',
                                         'Rec',
                                         'Prec',
                                         'Acc',
                                         'KS',
                                         'F1'})
metricas_test.at[0, 'Data'] = 'Teste - Faixa 1'
metricas_test.at[0, 'ROC']  = round(roc_auc_score(np.asarray(pred_test['True 0']), np.asarray(pred_test['Mod 0']))*100, 2)
metricas_test.at[0, 'KS']   = round(ks_2samp(pred_test.loc[pred_test['True 0'] == 0, 'Mod 0'], pred_test.loc[pred_test['True 0'] == 1, 'Mod 0'])[0]*100, 2)
metricas_test.at[0, 'F1']   = round(f1_score(np.asarray(pred_test['True 0']), np.asarray(pred_test['Mod 0'].round()), average = 'binary')*100,2)
metricas_test.at[0, 'Rec']  = round(recall_score(np.asarray(pred_test['True 0']), np.asarray(pred_test['Mod 0'].round()))*100,2)
metricas_test.at[0, 'Prec'] = round(precision_score(np.asarray(pred_test['True 0']), np.asarray(pred_test['Mod 0'].round()))*100,2)
metricas_test.at[0, 'Acc']  = round(accuracy_score(np.asarray(pred_test['True 0']), np.asarray(pred_test['Mod 0'].round()), normalize=True)*100,2)

metricas_test.at[1, 'Data'] = 'Teste - Faixa 2'
metricas_test.at[1, 'ROC']  = round(roc_auc_score(np.asarray(pred_test['True 1']), np.asarray(pred_test['Mod 1']))*100, 2)
metricas_test.at[1, 'KS']   = round(ks_2samp(pred_test.loc[pred_test['True 1'] == 0, 'Mod 1'], pred_test.loc[pred_test['True 1'] == 1, 'Mod 1'])[0]*100, 2)
metricas_test.at[1, 'F1']   = round(f1_score(np.asarray(pred_test['True 1']), np.asarray(pred_test['Mod 1'].round()), average = 'binary')*100,2)
metricas_test.at[1, 'Rec']  = round(recall_score(np.asarray(pred_test['True 1']), np.asarray(pred_test['Mod 1'].round()))*100,2)
metricas_test.at[1, 'Prec'] = round(precision_score(np.asarray(pred_test['True 1']), np.asarray(pred_test['Mod 1'].round()))*100,2)
metricas_test.at[1, 'Acc']  = round(accuracy_score(np.asarray(pred_test['True 1']), np.asarray(pred_test['Mod 1'].round()), normalize=True)*100,2)

metricas_test.at[2, 'Data'] = 'Teste - Faixa 3'
metricas_test.at[2, 'ROC']  = round(roc_auc_score(np.asarray(pred_test['True 2']), np.asarray(pred_test['Mod 2']))*100, 2)
metricas_test.at[2, 'KS']   = round(ks_2samp(pred_test.loc[pred_test['True 2'] == 0, 'Mod 2'], pred_test.loc[pred_test['True 2'] == 1, 'Mod 2'])[0]*100, 2)
metricas_test.at[2, 'F1']   = round(f1_score(np.asarray(pred_test['True 2']), np.asarray(pred_test['Mod 2'].round()), average = 'binary')*100,2)
metricas_test.at[2, 'Rec']  = round(recall_score(np.asarray(pred_test['True 2']), np.asarray(pred_test['Mod 2'].round()))*100,2)
metricas_test.at[2, 'Prec'] = round(precision_score(np.asarray(pred_test['True 2']), np.asarray(pred_test['Mod 2'].round()))*100,2)
metricas_test.at[2, 'Acc']  = round(accuracy_score(np.asarray(pred_test['True 2']), np.asarray(pred_test['Mod 2'].round()), normalize=True)*100,2)

metricas_test.at[3, 'Data'] = 'Teste - Faixa 4'
metricas_test.at[3, 'ROC']  = round(roc_auc_score(np.asarray(pred_test['True 3']), np.asarray(pred_test['Mod 3']))*100, 2)
metricas_test.at[3, 'KS']   = round(ks_2samp(pred_test.loc[pred_test['True 3'] == 0, 'Mod 3'], pred_test.loc[pred_test['True 3'] == 1, 'Mod 3'])[0]*100, 2)
metricas_test.at[3, 'F1']   = round(f1_score(np.asarray(pred_test['True 3']), np.asarray(pred_test['Mod 3'].round()), average = 'binary')*100,2)
metricas_test.at[3, 'Rec']  = round(recall_score(np.asarray(pred_test['True 3']), np.asarray(pred_test['Mod 3'].round()))*100,2)
metricas_test.at[3, 'Prec'] = round(precision_score(np.asarray(pred_test['True 3']), np.asarray(pred_test['Mod 3'].round()))*100,2)
metricas_test.at[3, 'Acc']  = round(accuracy_score(np.asarray(pred_test['True 3']), np.asarray(pred_test['Mod 3'].round()), normalize=True)*100,2)

metricas_test.at[4, 'Data'] = 'Teste - Faixa 6'
metricas_test.at[4, 'ROC']  = round(roc_auc_score(np.asarray(pred_test['True 4']), np.asarray(pred_test['Mod 4']))*100, 2)
metricas_test.at[4, 'KS']   = round(ks_2samp(pred_test.loc[pred_test['True 4'] == 0, 'Mod 4'], pred_test.loc[pred_test['True 4'] == 1, 'Mod 4'])[0]*100, 2)
metricas_test.at[4, 'F1']   = round(f1_score(np.asarray(pred_test['True 4']), np.asarray(pred_test['Mod 4'].round()), average = 'binary')*100,2)
metricas_test.at[4, 'Rec']  = round(recall_score(np.asarray(pred_test['True 4']), np.asarray(pred_test['Mod 4'].round()))*100,2)
metricas_test.at[4, 'Prec'] = round(precision_score(np.asarray(pred_test['True 4']), np.asarray(pred_test['Mod 4'].round()))*100,2)
metricas_test.at[4, 'Acc']  = round(accuracy_score(np.asarray(pred_test['True 4']), np.asarray(pred_test['Mod 4'].round()), normalize=True)*100,2)
In [59]:
#metricas_train = metricas_train[['Data', 'KS', 'ROC', 'F1', 'Rec', 'Prec', 'Acc']].set_index('Data')
metricas_train = metricas_train[['Data', 'KS', 'ROC', 'Acc']].set_index('Data')
metricas_train
Out[59]:
KS ROC Acc
Data
Treino - Faixa 1 56.7 86.97 79.87
Treino - Faixa 2 75.43 95.36 97.15
Treino - Faixa 3 75.89 95.04 97.69
Treino - Faixa 4 58.19 87.75 79.17
Treino - Faixa 6 80.81 96.86 97.3
In [60]:
#metricas_valid = metricas_valid[['Data', 'KS', 'ROC', 'F1', 'Rec', 'Prec', 'Acc']].set_index('Data')
metricas_valid = metricas_valid[['Data', 'KS', 'ROC', 'Acc']].set_index('Data')
metricas_valid
Out[60]:
KS ROC Acc
Data
Valid - Faixa 1 54.92 85.95 79.16
Valid - Faixa 2 70.86 93.69 96.87
Valid - Faixa 3 73.63 94.22 97.59
Valid - Faixa 4 56.99 86.92 78.54
Valid - Faixa 6 77.87 95.68 97.23
In [61]:
#metricas_test = metricas_test[['Data', 'KS', 'ROC', 'F1', 'Rec', 'Prec', 'Acc']].set_index('Data')
metricas_test = metricas_test[['Data', 'KS', 'ROC','Acc']].set_index('Data')
metricas_test
Out[61]:
KS ROC Acc
Data
Teste - Faixa 1 53.84 85.27 77.75
Teste - Faixa 2 52.83 85.77 98.66
Teste - Faixa 3 42.9 79.17 99.44
Teste - Faixa 4 53.35 84.87 77.14
Teste - Faixa 6 54.36 84.98 99.18

CURVAS ROC

In [62]:
title_font = {'fontname' : 'Arial',
              'size'     : '17',
              'weight'   : 'bold'}
axis_font  = {'fontname' : 'Arial',
              'size'     : '12'}
In [63]:
lr_fpr_0, lr_tpr_0, _ = roc_curve(np.asarray(pred_train['True 0']), np.asarray(pred_train['Mod 0']))
lr_fpr_1, lr_tpr_1, _ = roc_curve(np.asarray(pred_train['True 1']), np.asarray(pred_train['Mod 1']))
lr_fpr_2, lr_tpr_2, _ = roc_curve(np.asarray(pred_train['True 2']), np.asarray(pred_train['Mod 2']))
lr_fpr_3, lr_tpr_3, _ = roc_curve(np.asarray(pred_train['True 3']), np.asarray(pred_train['Mod 3']))
lr_fpr_4, lr_tpr_4, _ = roc_curve(np.asarray(pred_train['True 4']), np.asarray(pred_train['Mod 4']))
plt.plot([0.0, 1.0], [0.0, 1.0], 'r--', linewidth = 0.5, label = 'Coin', color = 'black')
plt.plot(lr_fpr_0, lr_tpr_0, linewidth = 0.5, label = '1', color = 'blue')
plt.plot(lr_fpr_1, lr_tpr_1, linewidth = 0.5, label = '2', color = 'red')
plt.plot(lr_fpr_2, lr_tpr_2, linewidth = 0.5, label = '3', color = 'orange')
plt.plot(lr_fpr_3, lr_tpr_3, linewidth = 0.5, label = '4', color = 'green')
plt.plot(lr_fpr_4, lr_tpr_4, linewidth = 0.5, label = '6', color = 'darkmagenta')
plt.title('Curva ROC - Treino', title_font)
plt.xlabel('False Positive Rate', axis_font)
plt.ylabel('True Positive Rate', axis_font)
plt.legend()
Out[63]:
<matplotlib.legend.Legend at 0x7f2b9b365278>
findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
In [64]:
lr_fpr_0, lr_tpr_0, _ = roc_curve(np.asarray(pred_val['True 0']), np.asarray(pred_val['Mod 0']))
lr_fpr_1, lr_tpr_1, _ = roc_curve(np.asarray(pred_val['True 1']), np.asarray(pred_val['Mod 1']))
lr_fpr_2, lr_tpr_2, _ = roc_curve(np.asarray(pred_val['True 2']), np.asarray(pred_val['Mod 2']))
lr_fpr_3, lr_tpr_3, _ = roc_curve(np.asarray(pred_val['True 3']), np.asarray(pred_val['Mod 3']))
lr_fpr_4, lr_tpr_4, _ = roc_curve(np.asarray(pred_val['True 4']), np.asarray(pred_val['Mod 4']))
plt.plot([0.0, 1.0], [0.0, 1.0], 'r--', linewidth = 0.5, label = 'Coin', color = 'black')
plt.plot(lr_fpr_0, lr_tpr_0, linewidth = 0.5, label = '1', color = 'blue')
plt.plot(lr_fpr_1, lr_tpr_1, linewidth = 0.5, label = '2', color = 'red')
plt.plot(lr_fpr_2, lr_tpr_2, linewidth = 0.5, label = '3', color = 'orange')
plt.plot(lr_fpr_3, lr_tpr_3, linewidth = 0.5, label = '4', color = 'green')
plt.plot(lr_fpr_4, lr_tpr_4, linewidth = 0.5, label = '6', color = 'darkmagenta')
plt.title('Curva ROC - Validação', title_font)
plt.xlabel('False Positive Rate', axis_font)
plt.ylabel('True Positive Rate', axis_font)
plt.legend()
Out[64]:
<matplotlib.legend.Legend at 0x7f2b9b435390>
In [65]:
lr_fpr_0, lr_tpr_0, _ = roc_curve(np.asarray(pred_test['True 0']), np.asarray(pred_test['Mod 0']))
lr_fpr_1, lr_tpr_1, _ = roc_curve(np.asarray(pred_test['True 1']), np.asarray(pred_test['Mod 1']))
lr_fpr_2, lr_tpr_2, _ = roc_curve(np.asarray(pred_test['True 2']), np.asarray(pred_test['Mod 2']))
lr_fpr_3, lr_tpr_3, _ = roc_curve(np.asarray(pred_test['True 3']), np.asarray(pred_test['Mod 3']))
lr_fpr_4, lr_tpr_4, _ = roc_curve(np.asarray(pred_test['True 4']), np.asarray(pred_test['Mod 4']))
plt.plot([0.0, 1.0], [0.0, 1.0], 'r--', linewidth = 0.5, label = 'Coin', color = 'black')
plt.plot(lr_fpr_0, lr_tpr_0, linewidth = 0.5, label = '1', color = 'blue')
plt.plot(lr_fpr_1, lr_tpr_1, linewidth = 0.5, label = '2', color = 'red')
plt.plot(lr_fpr_2, lr_tpr_2, linewidth = 0.5, label = '3', color = 'orange')
plt.plot(lr_fpr_3, lr_tpr_3, linewidth = 0.5, label = '4', color = 'green')
plt.plot(lr_fpr_4, lr_tpr_4, linewidth = 0.5, label = '6', color = 'darkmagenta')
plt.title('Curva ROC - Teste', title_font)
plt.xlabel('False Positive Rate', axis_font)
plt.ylabel('True Positive Rate', axis_font)
plt.legend()
Out[65]:
<matplotlib.legend.Legend at 0x7f2b98e0d6a0>

CREATE ENDPOINT FOR PREDICTION

In [66]:
predictor = xgb.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')
---------------!
In [72]:
from sagemaker.predictor import csv_serializer

predictor.serializer = csv_serializer
In [73]:
predictor.__dict__.keys()
Out[73]:
dict_keys(['endpoint_name', 'sagemaker_session', 'serializer', 'deserializer', '_endpoint_config_name', '_model_names', '_context'])
In [74]:
predictor.endpoint
The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Out[74]:
'xgboost-2021-03-30-01-28-58-053'
In [75]:
#DELETAR O ENDPOINT POIS BILLA MUITO!!!!!!!!!!!!!!!!!
#DELETAR O ENDPOINT POIS BILLA MUITO!!!!!!!!!!!!!!!!!
#DELETAR O ENDPOINT POIS BILLA MUITO!!!!!!!!!!!!!!!!!
#DELETAR O ENDPOINT POIS BILLA MUITO!!!!!!!!!!!!!!!!!
#DELETAR O ENDPOINT POIS BILLA MUITO!!!!!!!!!!!!!!!!!
predictor.delete_endpoint()
In [ ]: