## Normally only the last output in a cell gets pretty-printed; for the rest you would have to add print() manually, which is not very convenient.
from IPython.core.interactiveshell import InteractiveShell
# pretty print all cell's output and not just the last one
InteractiveShell.ast_node_interactivity = "all"
%load_ext skip_kernel_extension
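Note that skip_kernel_extension is not bundled with IPython; it has to exist as skip_kernel_extension.py on the import path. A minimal sketch of such an extension (an assumption modeled on the commonly shared gist, not necessarily the exact file used here):
## Contents of skip_kernel_extension.py (sketch): '%%skip <expr>' skips the cell when <expr> is truthy
# def skip(line, cell=None):
#     if eval(line):
#         return
#     get_ipython().run_cell(cell)
#
# def load_ipython_extension(shell):
#     shell.register_magic_function(skip, 'line_cell')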
## Disable warnings
import warnings
warnings.filterwarnings('ignore')
import time
import numpy as np
import pandas as pd
######################################## PLOTTING ########################################
## Note that the implementation of the plot() method in pandas is based on matplotlib.
### MATPLOTLIB (https://matplotlib.org/index.html)
## We will display plots right inside Jupyter Notebook
%matplotlib inline
import matplotlib.pyplot as plt
### SEABORN (https://seaborn.pydata.org/introduction.html)
### SEABORN is essentially a higher-level API based on the matplotlib library.
import seaborn as sns
sns.set() # sns.set(color_codes=True)
## Graphics in SVG format are sharper and more legible
%config InlineBackend.figure_format = 'svg'
## Increase the default plot size and set the color scheme
plt.rcParams['figure.figsize'] = 8, 5
plt.rcParams['image.cmap'] = 'viridis'
### PLOTLY (https://plot.ly/python/)
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
init_notebook_mode(connected=True)
##########################################################################################
from scipy import stats
def print_df_shape(dfs):
    for name, df in dfs:
        print('{0} data has {1} rows and {2} columns.'.format(name, df.shape[0], df.shape[1]), '\n')
def roundti(val):
    return int(round(val))
# Returns the NaN counts per column and the list of column names with NaN values
def get_cols_w_missing_data(df, verbose = True):
    result = df.isnull().sum()[lambda x: x > 0]
    if verbose:
        print('Columns with missing data:\n\n', result, sep='')
    return result, list(result.index)
def gen_groupby(df, col_name, agg_fn, grpby_col_names = ''):
    if len(grpby_col_names) == 0:
        df = df.agg({col_name: [agg_fn]})
    else:
        df = df.groupby(grpby_col_names).agg({col_name: agg_fn})
    return df
## Filling missing values
def gen_fillna(df, col_name, agg_fn, grpby_col_names = '', round_to_int = False):
    df_agg = gen_groupby(df, col_name, agg_fn, grpby_col_names)
    if len(grpby_col_names) == 0:
        agg_val = df_agg.iloc[0, 0]
        df[col_name].fillna(agg_val, inplace=True)
    else:
        if isinstance(grpby_col_names, str):
            grpby_col_names = [grpby_col_names]
        if len(grpby_col_names) == 1:
            for ndx, row in df_agg.iterrows():
                agg_val = row[col_name]
                if round_to_int:
                    agg_val = round(agg_val)
                df.loc[df[grpby_col_names[0]] == ndx, col_name] = df.loc[df[grpby_col_names[0]] == ndx, col_name].fillna(agg_val)
        elif len(grpby_col_names) == 2:
            for ndx, row in df_agg.iterrows():
                agg_val = row[col_name]
                if round_to_int:
                    agg_val = round(agg_val)
                ## Boolean mask of the NaN rows within this (group1, group2) slice
                df_nan_rows = df.loc[(df[grpby_col_names[0]] == ndx[0]) & (df[grpby_col_names[1]] == ndx[1]), col_name].isnull()[lambda x: x > 0]
                if len(df_nan_rows) > 0:
                    df.loc[df_nan_rows.index, col_name] = agg_val
    return df
##### Detecting and handling outliers
## Replacing invalid percent values
def replace_perc_values_outliers(df, col_name, lower_val = 0, upper_val = 100):
    lower_outliers = df[col_name] < lower_val
    upper_outliers = df[col_name] > upper_val
    print('Count of outliers of "{0}": lower = {1}, upper = {2}'.format(col_name, lower_outliers.sum(), upper_outliers.sum()))
    if lower_outliers.sum() > 0:
        df.loc[lower_outliers, col_name] = lower_val
    if upper_outliers.sum() > 0:
        df.loc[upper_outliers, col_name] = upper_val
## Detecting outliers using IQR method
def detect_iqr_outliers(srs, replace = False, verbose = True):
    quantile_1, quantile_3 = srs.quantile([0.25, 0.75])
    iqr = quantile_3 - quantile_1
    ## Tukey's fences: 1.5 * IQR beyond the quartiles
    lower_bound = quantile_1 - (iqr * 1.5)
    upper_bound = quantile_3 + (iqr * 1.5)
    result = srs[(srs > upper_bound) | (srs < lower_bound)]
    if verbose:
        print(srs.name, 'outlier count:', len(result))
    if replace:
        ## Clip outliers to the nearest quartile instead of dropping them
        srs[srs < lower_bound] = quantile_1
        srs[srs > upper_bound] = quantile_3
        return srs
    else:
        return result
## Replacing/Removing outliers using the IQR method
def handle_iqr_outliers(df, col_list = [], replace = True):
    if len(col_list) == 0:
        col_list = df.select_dtypes(include=[np.number]).columns.tolist()
    out_ndxs = pd.Index([])
    for col_name in col_list:
        if replace:
            df[col_name] = detect_iqr_outliers(df[col_name].copy(), True)
        else:
            out_ndxs = out_ndxs.append(detect_iqr_outliers(df[col_name]).index)
    if not replace:
        out_ndxs = out_ndxs.drop_duplicates()
        df.drop(out_ndxs, inplace=True)
    return df
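For reference, a minimal usage sketch of the helper above (kept commented out, since the data is only loaded further below; df_no_ol is a hypothetical name):
## Replace IQR outliers in every numeric column of a copy
#df_no_ol = handle_iqr_outliers(df_train.copy())
## Or drop the offending rows instead of replacing the values
#df_no_ol = handle_iqr_outliers(df_train.copy(), replace = False)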
## Detecting outliers using MAD (Median Absolute Deviation)
## https://stackoverflow.com/questions/22354094/pythonic-way-of-detecting-outliers-in-one-dimensional-observation-data
def detect_mad_outliers(srs, thresh = 3.5, verbose = True):
    median = srs.median()
    diff = (srs - median).abs()  ## absolute deviation from the median
    med_abs_deviation = diff.median()
    ## 0.6745 is the 0.75 quantile of the standard normal distribution,
    ## which scales the modified z-score to be comparable with an ordinary z-score
    modified_z_score = 0.6745 * diff / med_abs_deviation
    result = srs[modified_z_score > thresh]
    if verbose:
        print(srs.name, 'outlier count:', len(result))
    return result
df_train_x = pd.read_csv('../input/train_values.csv')
df_train_y = pd.read_csv('../input/train_labels.csv')
df_train = pd.merge(df_train_x, df_train_y, on="row_id", how="inner")
del df_train_x, df_train_y
y_label = 'evictions'
df_score = pd.read_csv('../input/test_values.csv')
df_score[y_label] = -1
print_df_shape(zip(['Training', 'Score'], [df_train, df_score]))
df_train.head()
This will tell us the total number of non-null observations present, as well as the total number of entries. Whenever the number of entries isn't equal to the number of non-null observations, we can begin to suspect missing values.
df_train.info()
id_col = 'row_id'
id_cols = ['row_id', 'county_code', 'year', 'state']
# print statistics of numerical features (int64 and float64 types)
df_train.describe()
non_num_cols = list(set(df_train.select_dtypes(include=['object', 'bool']).columns) - set(id_cols))
# print statistics on non-numerical features
df_train[non_num_cols].describe()
#df_train[df_train.select_dtypes(include=['object', 'bool']).columns].describe()
## Concat Train and Score sets
df_trsc = pd.concat([df_train, df_score], ignore_index=True, sort=False)
print_df_shape(zip(['Training + Score'], [df_trsc]))
# DECLARE: Find out if certain columns are unnecessarily floats
df_tmp = df_trsc.copy()
df_tmp.fillna(0, inplace = True)
float_cols = list(df_train.select_dtypes(include=[float]).columns)
# = ['population', 'renter_occupied_households', 'median_gross_rent', 'median_household_income', 'median_property_value']
poss_int_cols_list = []
for col_name in float_cols:
    if np.array_equal(df_tmp[col_name], df_tmp[col_name].astype(int)):
        poss_int_cols_list.append(col_name)
print('The following columns are actually integers stored as floats:\n', poss_int_cols_list)
missing_data, missing_data_cols = get_cols_w_missing_data(df_trsc)
#df_trsc[df_trsc['median_gross_rent'].isnull()]
#df_train[df_train['median_household_income'].isnull()]
#df_train[df_train['median_property_value'].isnull()]
#df_trsc[df_trsc['air_pollution_particulate_matter_value'].isnull()]
missing_val_threshold = 0.3
print_df_shape(zip(['Training', 'Score'], [df_train, df_score]))
train_col_count = df_train.shape[1]
## Count the NA values in each row (x.count() counts the non-NA cells)
train_rows_w_na = train_col_count - df_train.apply(lambda x: x.count(), axis = 1)
## Rows whose share of NA columns exceeds the threshold (pandas Series)
rows_w_too_much_na = train_rows_w_na[train_rows_w_na / train_col_count > missing_val_threshold]
print('Number of rows with more than {0:g}% NA in train data: {1}'.format(missing_val_threshold * 100, rows_w_too_much_na.count()))
## Remove rows with more than 30% NA
if rows_w_too_much_na.count() > 0:
    df_train.drop(rows_w_too_much_na.index, inplace = True, axis = 0)
    print('Training data has {0} rows and {1} columns'.format(df_train.shape[0], df_train.shape[1]))
    df_train.head()
else:
    print('NO rows removed: there are NO rows with more than {0:g}% NA in train data'.format(missing_val_threshold * 100))
train_row_count = df_train.shape[0]
## Columns that have any NA and the number of rows with NA (pandas.core.series.Series)
train_cols_w_na = train_row_count - df_train.loc[:, df_train.isnull().any()].count()
cols_w_too_much_na = train_cols_w_na[train_cols_w_na / train_row_count > missing_val_threshold]
print('Number of columns with more than {0:g}% NA in train data: {1}'.format(missing_val_threshold * 100, cols_w_too_much_na.count()),
      cols_w_too_much_na,
      sep='\n\n', end='\n\n')
## Remove columns with more than 30% NA in train data, then drop same columns from score data
if cols_w_too_much_na.count() > 0:
    df_train.drop(cols_w_too_much_na.keys(), inplace = True, axis = 1)
    ## Drop the same columns from the score data
    df_score.drop(cols_w_too_much_na.keys(), inplace = True, axis = 1)
print_df_shape(zip(['Training', 'Score'], [df_train, df_score]))
## Concat Train and Score sets
df_trsc = pd.concat([df_train, df_score], ignore_index=True, sort=False)
print_df_shape(zip(['Training + Score'], [df_trsc]))
missing_data, missing_data_cols = get_cols_w_missing_data(df_trsc)
## TEST filling missing values
df_tmp = df_trsc.copy()
test_col_name = 'median_gross_rent'
test_state = '0df5b61'
test_county_code = 'c09169c'
df_missing_val_rows = df_tmp[df_tmp[test_col_name].isnull()][id_cols + [test_col_name]]
print('BeforeImputation', df_tmp.iloc[df_missing_val_rows.index][id_cols + [test_col_name]], '\n')
print(df_tmp[df_tmp['state'] == test_state].groupby(['state', 'year'])['median_gross_rent'].median())
gen_groupby(df_tmp[df_tmp['state'] == test_state], test_col_name, 'median', ['state', 'year'])
gen_fillna(df_tmp, test_col_name, 'median', ['state', 'year'], round_to_int = True)
print('AfterImputation', df_tmp.iloc[df_missing_val_rows.index][id_cols + [test_col_name]])
## CHECK
df_tmp = df_trsc.copy()
for col_name in missing_data_cols:
    missing_data_states = list(df_tmp[df_tmp[col_name].isnull()]['state'].unique())
    print(col_name, '\n', df_tmp[df_tmp['state'].isin(missing_data_states)].groupby(['state', 'year'])[col_name].agg(['mean','median']), '\n')
## APPLY: Replace missing values with the median values of their corresponding state and year
for col_name in missing_data_cols:
    round_to_int = col_name in poss_int_cols_list
    gbg_out = gen_fillna(df_trsc, col_name, 'median', ['state', 'year'], round_to_int)
print_df_shape(zip(['Training + Score'], [df_trsc]))
missing_data, missing_data_cols = get_cols_w_missing_data(df_trsc)
The missing values of 'air_pollution_particulate_matter_value' could not be imputed, because the states of those rows have no value at all for this column.
## CHECK missing values of 'air_pollution_particulate_matter_value'
col_name = 'air_pollution_particulate_matter_value'
missing_data_states = list(df_trsc[df_trsc[col_name].isnull()]['state'].unique())
df_trsc[df_trsc['state'].isin(missing_data_states)][id_cols + [col_name, y_label]]
col_values = df_trsc[col_name].fillna(0).values
gbg_out = sns.distplot(col_values)
## APPLY: Replace missing values of 'air_pollution_particulate_matter_value' with the mean value of total
col_name = 'air_pollution_particulate_matter_value'
gbg_out = gen_fillna(df_trsc, col_name, 'mean')
get_cols_w_missing_data(df_trsc)
# APPLY: Change unnecessary 'float' columns to 'int'
for col_name in poss_int_cols_list:
    if np.array_equal(df_trsc[col_name], df_trsc[col_name].astype(int)):
        df_trsc[col_name] = df_trsc[col_name].astype('int64')
df_trsc_2 = df_trsc.copy()
#df_trsc = df_trsc_2.copy()
num_cols = list(set(df_train.select_dtypes(include=['int', 'float']).columns) - set(id_cols))
## CHECK: Columns with invalid percent values
pct_col_names = [col_name for col_name in num_cols if col_name.startswith('pct_')]
srs = df_trsc[(df_trsc[pct_col_names] < 0) | (df_trsc[pct_col_names] > 1)][pct_col_names].count()
srs[srs > 0]
## APPLY: Fix 'pct_renter_occupied' column values by dividing by 100
df_trsc['pct_renter_occupied'] = df_trsc['pct_renter_occupied'] / 100.0
We should detect and handle the outliers by grouping by state.
sns.set_style("whitegrid")
sns.set_context("paper")
ax = sns.catplot(data = df_trsc, x = 'population', y = 'state', orient='h', height = 10, aspect=0.7)
num_cols = list(set(df_trsc.select_dtypes(include=['int', 'float']).columns) - set(id_cols))
verbose_output = False
df_by_state = df_trsc.groupby(['state'])
col_names_w_ol = pd.DataFrame(columns=['IQR', 'MAD'])
for col_name in num_cols:
    iqr_outl_states_for_col = detect_iqr_outliers(df_by_state[col_name].mean(), verbose = verbose_output)
    if verbose_output and (len(iqr_outl_states_for_col) > 0):
        print('IQR:\n', iqr_outl_states_for_col, '\n')
    mad_outl_states_for_col = detect_mad_outliers(df_by_state[col_name].mean(), verbose = verbose_output)
    if verbose_output and (len(mad_outl_states_for_col) > 0):
        print('MAD:\n', mad_outl_states_for_col, '\n')
    if (len(iqr_outl_states_for_col) > 0) or (len(mad_outl_states_for_col) > 0):
        col_names_w_ol.loc[col_name] = [len(iqr_outl_states_for_col), len(mad_outl_states_for_col)]
    if verbose_output:
        print('\n\n')
col_names_w_ol
Create engineered features ('pop_per_occupied_households', etc.)
## APPLY: Create engineered features (land shares, population shares, etc.)
df_trsc['pop_per_occupied_households'] = df_trsc['population'] / (df_trsc['renter_occupied_households'] / df_trsc['pct_renter_occupied'])
print_df_shape(zip(['Training + Score'], [df_trsc]))
df_trsc[['renter_occupied_households', 'pct_renter_occupied', 'population', 'pop_per_occupied_households']].head()
## ANALYZE
df_tmp = df_trsc.copy()
cols_pct_ethnicity = ['pct_white', 'pct_af_am', 'pct_hispanic', 'pct_am_ind', 'pct_asian', 'pct_nh_pi', 'pct_multiple', 'pct_other']
cols_pop_ethnicity = pd.Series(cols_pct_ethnicity).map(lambda x: 'pop_' + x[4:]).values
for col_pct, col_pop in zip(cols_pct_ethnicity, cols_pop_ethnicity):
    df_tmp[col_pop] = df_tmp['population'] * df_tmp[col_pct]
gbg_out = df_tmp[cols_pop_ethnicity].sum().plot.bar()
df_tmp[cols_pct_ethnicity].head()
We can add the values of the 'pct_am_ind', 'pct_nh_pi', and 'pct_multiple' columns to 'pct_other' and remove those columns.
# APPLY (tmp):
df_tmp = df_trsc.copy()
cols_to_merge = ['pct_am_ind', 'pct_nh_pi', 'pct_multiple'] # 'pct_asian'
df_tmp['pct_other'] = df_tmp[cols_to_merge + ['pct_other']].sum(axis = 1)
df_tmp.drop(columns = cols_to_merge, inplace = True)
# CHECK:
cols_pct_ethnicity = list(set(cols_pct_ethnicity) - set(cols_to_merge))
cols_pop_ethnicity = pd.Series(cols_pct_ethnicity).map(lambda x: 'pop_' + x[4:]).values
for col_pct, col_pop in zip(cols_pct_ethnicity, cols_pop_ethnicity):
    df_tmp[col_pop] = df_tmp['population'] * df_tmp[col_pct]
gbg_out = df_tmp[cols_pop_ethnicity].sum().plot.bar()
df_tmp[cols_pct_ethnicity].head()
# APPLY:
cols_to_merge = ['pct_am_ind', 'pct_nh_pi', 'pct_multiple'] # 'pct_asian'
df_trsc['pct_other'] = df_trsc[cols_to_merge + ['pct_other']].sum(axis = 1)
df_trsc.drop(columns = cols_to_merge, inplace = True)
print_df_shape(zip(['Training + Score'], [df_trsc]))
df_trsc_3 = df_trsc.copy()
#df_trsc = df_trsc_2.copy()
String type id Columns

## CHECK
df_tmp = df_trsc.copy()
# Apply 'Label Encoding' to 'county_code' and 'state' columns
df_tmp['county_code'] = df_tmp['county_code'].astype('category').cat.codes + 1001
df_tmp['state'] = df_tmp['state'].astype('category').cat.codes + 101
df_tmp['year'] = df_tmp['year'].astype('category').cat.codes + 1
df_tmp.dtypes[:4]
## APPLY
df_trsc['county_code'] = df_trsc['county_code'].astype('category').cat.codes + 1001
df_trsc['state'] = df_trsc['state'].astype('category').cat.codes + 101
df_trsc['year'] = df_trsc['year'].astype('category').cat.codes + 1
# ANALYZE
non_num_cols = list(set(df_trsc.select_dtypes(include=['object', 'bool']).columns) - set(id_cols))
for col_name in non_num_cols:
print(df_trsc[col_name].value_counts(), '\n')
## https://www.datacamp.com/community/tutorials/categorical-data
## Try one-hot encoding for 'economic_topology'
## Try replacing values for 'rucc' and 'urban_influence' with incremental codes
## Try Binary Encoding for 'rucc' and 'urban_influence' which will produce less features
#df_train.where(df_train ['Age']==30)
#%timeit
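As a quick sketch of the one-hot idea above (for illustration only; the label encoding applied below is what this notebook actually uses, and df_ohe is a hypothetical name):
## One-hot encode 'economic_topology' on a throwaway copy and peek at the new columns
df_ohe = pd.get_dummies(df_trsc.copy(), columns = ['economic_topology'], prefix = 'econ')
df_ohe.filter(like = 'econ_').head()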
## TEST
df_tmp = df_trsc.copy()
f, arr_ax = plt.subplots(1, 3, figsize=(13, 6))
for ndx, col_name in enumerate(non_num_cols):
    df_tmp[col_name] = df_tmp[col_name].astype('category').cat.codes
    sns.countplot(x = col_name, data = df_tmp, ax = arr_ax[ndx])
## APPLY
for col_name in non_num_cols:
    df_trsc[col_name] = df_trsc[col_name].astype('category').cat.codes
num_cols = list(set(df_trsc.select_dtypes(include=['int', 'float']).columns) - set(id_cols))
plt_col_count = 4
## Use ceiling division so every feature gets a subplot slot
f, arr_ax = plt.subplots(int(np.ceil(len(num_cols) / plt_col_count)), plt_col_count, figsize=(14, 40))
for ndx, col_name in enumerate(num_cols):
    rn = ndx // plt_col_count
    cn = ndx % plt_col_count
    sns.distplot(df_trsc[col_name], ax = arr_ax[rn, cn], axlabel=col_name)
from scipy.stats import skew
df_skew = df_trsc[num_cols].apply(skew).sort_values()
print('Positively skewed features:\n\n', df_skew[df_skew > 1], '\n', sep='')
print('Negatively skewed features:\n\n', df_skew[df_skew < -1], '\n', sep='')
df_trsc_4 = df_trsc.copy()
#df_trsc = df_trsc_4.copy()
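## Log-transform the heavily right-skewed count features (assumes strictly positive values; np.log1p would be the safer choice if zeros could occur)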
df_trsc['ln_renter_occupied_households'] = np.log(df_trsc['renter_occupied_households'])
df_trsc['ln_population'] = np.log(df_trsc['population'])
df_trsc.drop(columns = ['renter_occupied_households', 'population'], inplace = True)
First, split our merged dataset back into TRAIN and SCORE datasets.
## Split back df_trsc in to TRAIN+TEST and SCORE datasets
df_train = df_trsc[df_trsc[y_label] != -1]
df_score = df_trsc[df_trsc[y_label] == -1]
print_df_shape(zip(['Training + Score'], [df_trsc]))
print_df_shape(zip(['Training', 'Score'], [df_train, df_score]))
predictor_lbls = df_train.columns.tolist()
predictor_lbls.remove(y_label)
predictor_lbls.remove(id_col)
df_train_x = df_train[predictor_lbls]
df_train_y = df_train[[y_label]]
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
## Additional sklearn functions
from sklearn.model_selection import cross_validate
from sklearn import metrics
## Performing grid search
from sklearn.model_selection import GridSearchCV
tuning_disabled = False ## Usage: %%skip $tuning_disabled
def model_fit_xgb(alg, dtrain, predictor_lbls, target_lbl, useTrainCV = True, cv_folds = 5, early_stopping_rounds = 50):
    df_x = dtrain[predictor_lbls]
    df_y = dtrain[target_lbl]
    if useTrainCV:
        xgb_params = alg.get_xgb_params()
        dm_train = xgb.DMatrix(df_x, label = df_y)
        cv_results = xgb.cv(xgb_params,
                            dm_train,
                            num_boost_round = alg.get_params()['n_estimators'],
                            nfold = cv_folds,
                            # stratified = True, # makes the result worse
                            metrics = 'rmse',
                            early_stopping_rounds = early_stopping_rounds)
        opt_num_boost_round = cv_results.shape[0]
        print('Optimum num_boost_round: ', opt_num_boost_round, '\n', cv_results.tail(1))
        alg.set_params(n_estimators = opt_num_boost_round)
        print('\n', 'get_xgb_params:', alg.get_xgb_params())
    ## Fit the algorithm on the data, timing the single fit (%time runs it once, unlike %timeit)
    %time alg.fit(df_x, df_y, eval_metric = 'rmse', verbose = True)
    ## Predict on the training set
    df_y_pred = alg.predict(df_x)
    ## Print the model report; RMSE = sqrt(MSE)
    print('\nModel Report')
    print('RMSE : %.2f' % np.sqrt(metrics.mean_squared_error(df_y, df_y_pred)))
    print('R2 Score (Train): %f' % metrics.r2_score(df_y, df_y_pred))
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title = 'Feature Importances')
    plt.ylabel('Feature Importance Score')
We will use an approach similar to that of GBM here. The various steps to be performed are:

1. Choose a relatively high learning rate. Generally a learning rate of 0.1 works, but somewhere between 0.05 and 0.3 should work for different problems. Determine the optimum number of trees for this learning rate. XGBoost has a very useful function called "cv" which performs cross-validation at each boosting iteration and thus returns the optimum number of trees required.
2. Tune tree-specific parameters (max_depth, min_child_weight, gamma, subsample, colsample_bytree) for the decided learning rate and number of trees. Note that we can choose different parameters to define a tree, and we'll take up an example here.
3. Tune regularization parameters (lambda, alpha) for XGBoost, which can help reduce model complexity and enhance performance.
4. Lower the learning rate and decide the optimal parameters.

In order to decide on the boosting parameters, we need to set some initial values of the other parameters. Let's take the following values: max_depth = 5, min_child_weight = 1, gamma = 0, subsample = 0.8, colsample_bytree = 0.8, scale_pos_weight = 1.

Please note that all of the above are just initial estimates and will be tuned later. Let's take the default learning rate of 0.1 here and check the optimum number of trees using the cv function of XGBoost. The function defined above will do it for us.
%%skip $tuning_disabled
## Choose all predictors except target & IDcols to fit a model
xgb_01 = xgb.XGBRegressor(
objective = 'reg:linear',
n_estimators = 1000,
learning_rate = 0.1,
max_depth = 5,
min_child_weight = 1,
gamma = 0,
subsample = 0.8,
colsample_bytree = 0.8,
nthread = 4,
scale_pos_weight = 1,
seed = 123
)
model_fit_xgb(xgb_01, df_train, predictor_lbls, target_lbl = y_label)
max_depth and min_child_weight

We tune these first as they will have the highest impact on the model outcome. To start with, let's set wider ranges, and then we will perform another iteration for smaller ranges.

Important note: we'll be doing some heavy-duty grid searches in this section, which can take 15-30 minutes or even more to run depending on your system. You can vary the number of values you test based on what your system can handle.
%%skip $tuning_disabled
param_test_01 = {
'max_depth': range(3, 10, 2), # [3, 5, 7, 9]
'min_child_weight': range(1, 6, 2) # [1, 3, 5]
}
gsearch_01 = GridSearchCV(
estimator = xgb.XGBRegressor(
objective = 'reg:linear',
n_estimators = 49,
learning_rate = 0.1,
max_depth = 5,
min_child_weight = 1,
gamma = 0,
subsample = 0.8,
colsample_bytree = 0.8,
nthread = 4,
scale_pos_weight = 1,
seed = 123
),
param_grid = param_test_01,
scoring = 'r2', n_jobs = 4, iid = False, cv = 5)
gsearch_01.fit(df_train[predictor_lbls], df_train[y_label])
print('Best Score (r2): %f, Best Params: %s' % (gsearch_01.best_score_, gsearch_01.best_params_))
Here, we have run with wider intervals between values. The ideal values are max_depth: 7 and min_child_weight: 1.

Let's go one step deeper and look for optimum values. We'll search for values one above and one below the optima, because we took an interval of 2.
%%skip $tuning_disabled
param_test_02 = {
'max_depth': [6, 7, 8],
'min_child_weight': [0, 1, 2]
}
gsearch_02 = GridSearchCV(
estimator = xgb.XGBRegressor(
objective = 'reg:linear',
n_estimators = 49,
learning_rate = 0.1,
max_depth = 5,
min_child_weight = 1,
gamma = 0,
subsample = 0.8,
colsample_bytree = 0.8,
nthread = 4,
scale_pos_weight = 1,
seed = 123
),
param_grid = param_test_02,
scoring = 'r2', n_jobs = 4, iid = False, cv = 5)
gsearch_02.fit(df_train[predictor_lbls], df_train[y_label])
print('Best Score (r2): %f, Best Params: %s' % (gsearch_02.best_score_, gsearch_02.best_params_))
As we see, max_depth didn't improve further, but min_child_weight decreased to 0. Their optimum values: 'max_depth': 7, 'min_child_weight': 0.

gamma

Now we will tune the gamma value using the parameters already tuned above. Gamma can take various values, but we'll check 5 values here. You can go into more precise values later.
%%skip $tuning_disabled
param_test_03 = {
'gamma': [i/10.0 for i in range(0, 5)]
}
gsearch_03 = GridSearchCV(
estimator = xgb.XGBRegressor(
objective = 'reg:linear',
n_estimators = 49,
learning_rate = 0.1,
max_depth = 7,
min_child_weight = 0,
gamma = 0,
subsample = 0.8,
colsample_bytree = 0.8,
nthread = 4,
scale_pos_weight = 1,
seed = 123
),
param_grid = param_test_03,
scoring = 'r2', n_jobs = 4, iid = False, cv = 5)
gsearch_03.fit(df_train[predictor_lbls], df_train[y_label])
print('Best Score (r2): %f, Best Params: %s' % (gsearch_03.best_score_, gsearch_03.best_params_))
This shows that our original value of gamma: 0 is the optimum one. Before proceeding, a good idea would be to re-calibrate the number of boosting rounds for the updated parameters.
%%skip $tuning_disabled
xgb_02 = xgb.XGBRegressor(
objective = 'reg:linear',
n_estimators = 1000,
learning_rate = 0.1,
max_depth = 7,
min_child_weight = 0,
gamma = 0,
subsample = 0.8,
colsample_bytree = 0.8,
nthread = 4,
scale_pos_weight = 1,
seed = 123
)
model_fit_xgb(xgb_02, df_train, predictor_lbls, target_lbl = y_label)
## Previous score
## RMSE : 17350.75
## R2 Score (Train): 0.991210
Our R2 score increased a little and RMSE decreased quite nicely. On the other hand, num_boost_round didn't change.

So far the final parameters are: num_boost_round: 49, max_depth: 7, min_child_weight: 0, gamma: 0.

subsample and colsample_bytree

The next step would be to try different subsample and colsample_bytree values. We will do this in two stages as well, taking the values 0.6, 0.7, 0.8, 0.9 for both to start with.
%%skip $tuning_disabled
param_test_04 = {
'subsample': [i / 10.0 for i in range(6, 10)], ## [0.6, 0.7, 0.8, 0.9]
'colsample_bytree': [i / 10.0 for i in range(6, 10)] ## [0.6, 0.7, 0.8, 0.9]
}
gsearch_04 = GridSearchCV(
estimator = xgb.XGBRegressor(
objective = 'reg:linear',
n_estimators = 49,
learning_rate = 0.1,
max_depth = 7,
min_child_weight = 0,
gamma = 0,
subsample = 0.8,
colsample_bytree = 0.8,
nthread = 4,
scale_pos_weight = 1,
seed = 123
),
param_grid = param_test_04,
scoring = 'r2', n_jobs = 4, iid = False, cv = 5)
gsearch_04.fit(df_train[predictor_lbls], df_train[y_label])
print('Best Score (r2): %f, Best Params: %s' % (gsearch_04.best_score_, gsearch_04.best_params_))
Here, we found 'colsample_bytree': 0.9 and 'subsample': 0.8 as the optimum values. Now we should try values in 0.05 intervals around these.
%%skip $tuning_disabled
param_test_05 = {
'subsample': [i / 100.0 for i in range(75, 90, 5)], ## [0.75, 0.8, 0.85]
'colsample_bytree': [i / 100.0 for i in range(85, 105, 5)] ## [0.85, 0.9, 0.95, 1.0]
}
gsearch_05 = GridSearchCV(
estimator = xgb.XGBRegressor(
objective = 'reg:linear',
n_estimators = 49,
learning_rate = 0.1,
max_depth = 7,
min_child_weight = 0,
gamma = 0,
subsample = 0.8,
colsample_bytree = 0.8,
nthread = 4,
scale_pos_weight = 1,
seed = 123
),
param_grid = param_test_05,
scoring = 'r2', n_jobs = 4, iid = False, cv = 5)
gsearch_05.fit(df_train[predictor_lbls], df_train[y_label])
print('Best Score (r2): %f, Best Params: %s' % (gsearch_05.best_score_, gsearch_05.best_params_))
'subsample' didn't change, but 'colsample_bytree' increased. The final optimum values are: 'colsample_bytree': 0.95, 'subsample': 0.8.

The next step is to apply regularization to reduce overfitting. Many people don't use these parameters much, as gamma provides a substantial way of controlling complexity, but we should always try them. We'll tune 'reg_alpha' and 'reg_lambda'.
%%skip $tuning_disabled
param_test_06 = {
'reg_alpha': [0.01, 0.1, 1, 10, 100],
'reg_lambda': [0.01, 0.1, 1, 10, 100]
}
gsearch_06 = GridSearchCV(
estimator = xgb.XGBRegressor(
objective = 'reg:linear',
n_estimators = 49,
learning_rate = 0.1,
max_depth = 7,
min_child_weight = 0,
gamma = 0,
subsample = 0.8,
colsample_bytree = 0.95,
nthread = 4,
scale_pos_weight = 1,
seed = 123
),
param_grid = param_test_06,
scoring = 'r2', n_jobs = 4, iid = False, cv = 5)
gsearch_06.fit(df_train[predictor_lbls], df_train[y_label])
print('Best Score (r2): %f, Best Params: %s' % (gsearch_06.best_score_, gsearch_06.best_params_))
We can see that the CV score is higher than in the previous case. 'reg_lambda' didn't change from its default of 1, but 'reg_alpha' decreased from the default of 1 to 0.1.

The values we tried were widely spaced, so now we should try values closer to the optimum of 'reg_alpha', which is 0.1, to see if we get something better.
%%skip $tuning_disabled
param_test_07 = {
'reg_alpha': [0.005, 0.1, 0.15, 0.2, 0.3],
'reg_lambda': [0.5, 0.6, 0.8, 1, 1.05, 1.1]
}
gsearch_07 = GridSearchCV(
estimator = xgb.XGBRegressor(
objective = 'reg:linear',
n_estimators = 49,
learning_rate = 0.1,
max_depth = 7,
min_child_weight = 0,
gamma = 0,
subsample = 0.8,
colsample_bytree = 0.95,
nthread = 4,
scale_pos_weight = 1,
seed = 123
),
param_grid = param_test_07,
scoring = 'r2', n_jobs = 4, iid = False, cv = 5)
gsearch_07.fit(df_train[predictor_lbls], df_train[y_label])
print('Best Score (r2): %f, Best Params: %s' % (gsearch_07.best_score_, gsearch_07.best_params_))
Again, 'reg_lambda' didn't change from its default of 1, but 'reg_alpha' increased from 0.1 to 0.3. It might tend toward an even higher value, so now we can just tune 'reg_alpha' between 0.3 and 1.
%%skip $tuning_disabled
param_test_08 = {
'reg_alpha': np.round(np.arange(0.3, 1, 0.1), 2) ## [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}
gsearch_08 = GridSearchCV(
estimator = xgb.XGBRegressor(
objective = 'reg:linear',
n_estimators = 49,
learning_rate = 0.1,
max_depth = 7,
min_child_weight = 0,
gamma = 0,
subsample = 0.8,
colsample_bytree = 0.95,
nthread = 4,
scale_pos_weight = 1,
seed = 123
),
param_grid = param_test_08,
scoring = 'r2', n_jobs = 4, iid = False, cv = 5)
gsearch_08.fit(df_train[predictor_lbls], df_train[y_label])
print('Best Score (r2): %f, Best Params: %s' % (gsearch_08.best_score_, gsearch_08.best_params_))
Yes, our 'reg_alpha' increased from 0.3 to 0.4, and our CV score also increased a little. Now we can apply this regularization in the model and look at the impact:
%%skip $tuning_disabled
xgb_03 = xgb.XGBRegressor(
objective = 'reg:linear',
n_estimators = 1000,
learning_rate = 0.1,
max_depth = 7,
min_child_weight = 0,
gamma = 0,
subsample = 0.8,
colsample_bytree = 0.95,
reg_alpha = 0.4,
nthread = 4,
scale_pos_weight = 1,
seed = 123
)
model_fit_xgb(xgb_03, df_train, predictor_lbls, target_lbl = y_label)
## Previous score
## RMSE : 7239.78
## R2 Score (Train): 0.996332
Our R2 score again increased a little, but this time RMSE increased. On the other hand, num_boost_round didn't change.

Lastly, we should lower the learning rate and add more trees. We will use the cv function of XGBoost to do the job again.
%%skip $tuning_disabled
xgb_04 = xgb.XGBRegressor(
objective = 'reg:linear',
n_estimators = 5000,
learning_rate = 0.01,
max_depth = 7,
min_child_weight = 0,
gamma = 0,
subsample = 0.8,
colsample_bytree = 0.95,
reg_alpha = 0.4,
nthread = 4,
scale_pos_weight = 1,
seed = 123
)
model_fit_xgb(xgb_04, df_train, predictor_lbls, target_lbl = y_label)
## Previous score
## RMSE : 10897.07
## R2 Score (Train): 0.994480
Now we can see a significant boost in performance, and the effect of parameter tuning is clearer. As we come to the end, I would like to share two key thoughts: it is difficult to get a very big leap in performance just by parameter tuning or slightly better models, and a significant jump can be obtained by other methods like feature engineering, creating ensembles of models, stacking, etc.
xgb_Final = xgb.XGBRegressor(
objective = 'reg:linear',
n_estimators = 441,
learning_rate = 0.01,
max_depth = 7,
min_child_weight = 0,
gamma = 0,
subsample = 0.8,
colsample_bytree = 0.95,
reg_alpha = 0.4,
nthread = 4,
scale_pos_weight = 1,
seed = 123
)
model_fit_xgb(xgb_Final, df_train, predictor_lbls, target_lbl = y_label, useTrainCV = False)
## Previous score
## RMSE : 9721.88
## R2 Score (Train): 0.995075
df_score_x = df_score[predictor_lbls]
preds = xgb_Final.predict(df_score_x)
df_preds = pd.DataFrame({ 'row_id': df_score[id_col], 'evictions': preds })
df_preds['evictions'] = round(df_preds['evictions']).astype(int)
df_preds.head()
df_preds.to_csv('submission_01.csv', index = False)
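Finally, compare rounding with truncating the raw predictions, counting how many rows the two approaches would differ on: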
df_tmp = pd.DataFrame({ 'row_id': df_score[id_col], 'evictions': preds })
df_tmp['evictions_rnd'] = round(df_tmp['evictions']).astype(int)
df_tmp['evictions_int'] = df_tmp['evictions'].astype(int)
df_tmp.head()
df_tmp.apply(lambda x: abs(x['evictions_rnd'] - x['evictions_int']), axis = 1).sum()