This is the third part of a full pipeline [Exploratory data analysis, data cleaning and preprocessing, modeling, error analysis]. I recommend reading the first and second parts first.
The full project with data is available on my GitHub.
# Importing libraries
import os
import warnings
import pandas as pd
import numpy as np
import mpld3
mpld3.enable_notebook()
from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, pairwise_distances
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
import xgboost as xgb
import lightgbm as lgb
Read the saved clean dataset:
DF = pd.read_csv('df_pre_processed.csv', header='infer')
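A quick sanity check on what was loaded never hurts (the exact shape and columns depend on what was saved at the end of part two):
# Quick look at the loaded data
print(DF.shape)
DF.head()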
I delete three columns from the dataset. I could have deleted them earlier, in the preprocessing part, but I thought I might need them for the error analysis part.
DF = DF.drop(['Neighborhood', 'District', 'rent_price_area'], axis=1)
I'm going to try three ML algorithms: two gradient-boosted tree models and one distance-based model. Here is an article about machine learning algorithms.
The first algorithm is XGBoost.
def XGB(df):
    # Split the data into train and test sets + label the target value
    y = df.rent_price
    X = df.drop(['rent_price'], axis=1)
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=0)
    # Saving the test_y row index
    test_index = test_y.index
    my_imputer = SimpleImputer()
    train_X = my_imputer.fit_transform(train_X)
    test_X = my_imputer.transform(test_X)
    # Scaling features with StandardScaler
    scaler = StandardScaler()
    train_sc = scaler.fit_transform(train_X)
    test_sc = scaler.transform(test_X)
    dtrain = xgb.DMatrix(train_sc, label=train_y)
    dtest = xgb.DMatrix(test_sc, label=test_y)
    # The default parameters were changed after hyperparameter tuning.
    params = {
        'colsample_bytree': 1,
        'eta': 0.01,
        'eval_metric': 'mae',
        'max_depth': 11,
        'min_child_weight': 5,
        'objective': 'reg:linear',  # deprecated alias; newer XGBoost versions use 'reg:squarederror'
        'subsample': 0.9}
    # Fitting the model
    model = xgb.train(params, dtrain, num_boost_round=3000, evals=[(dtest, "Test")], early_stopping_rounds=50)
    # Predicting results
    tuned_xgb = model.predict(dtest)
    tuned_xgb_tr = model.predict(dtrain)
    # Now we create a new dataframe for evaluation using test_index
    evaluation = DF.loc[test_index]
    evaluation['predictions_tuned_Xgb'] = tuned_xgb.astype(int)
    evaluation['residuals_tuned_Xgb_abs'] = abs(evaluation['rent_price'] - evaluation['predictions_tuned_Xgb'])
    evaluation['residuals_tuned_Xgb'] = evaluation['rent_price'] - evaluation['predictions_tuned_Xgb']
    evaluation['resid_percent_Xgb'] = evaluation['residuals_tuned_Xgb'] * 100 / evaluation['rent_price']
    evaluation['resid_percent_Xgb_abs'] = evaluation['residuals_tuned_Xgb_abs'] * 100 / evaluation['rent_price']
    return evaluation
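The tuning step itself is not shown here. As an illustrative sketch only (the grid below is an assumption, not the grid I actually searched), max_depth and min_child_weight could be tuned with xgb.cv, assuming dtrain and params from the function above are in scope:
# Illustrative tuning sketch with xgb.cv; assumes dtrain and params are in scope.
# The grid values are examples, not the grid used to obtain the parameters above.
best_mae, best_pair = float('inf'), None
for max_depth in [7, 9, 11]:
    for min_child_weight in [1, 3, 5]:
        cv_params = dict(params, max_depth=max_depth, min_child_weight=min_child_weight)
        cv_results = xgb.cv(cv_params, dtrain, num_boost_round=3000, nfold=5,
                            metrics='mae', early_stopping_rounds=50, seed=0)
        mae = cv_results['test-mae-mean'].min()
        if mae < best_mae:
            best_mae, best_pair = mae, (max_depth, min_child_weight)
print(f'Best (max_depth, min_child_weight): {best_pair}, CV MAE: {best_mae:.3f}')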
The second is LightGBM.
def LGB(df):
    # Separating the data into a feature matrix x and the target y
    x = df.drop(['rent_price'], axis=1)
    y = df.rent_price
    # Now splitting the dataset into train and test
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)
    # Saving the test_y row index
    test_index = y_test.index
    # Scaling features with StandardScaler
    scaler = StandardScaler()
    train_sc = scaler.fit_transform(x_train)
    test_sc = scaler.transform(x_test)
    # Create datasets for LightGBM from the scaled features
    lgb_train = lgb.Dataset(train_sc, y_train)
    lgb_eval = lgb.Dataset(test_sc, y_test, reference=lgb_train)
    # Specify the configuration as a dict
    params = {}
    params['max_bin'] = 10
    params['learning_rate'] = 0.0021
    params['num_iterations'] = 4000  # this alias overrides num_boost_round passed to lgb.train
    params['boosting_type'] = 'gbdt'
    params['objective'] = 'regression'
    params['metric'] = 'l1'
    params['sub_feature'] = 0.5
    params['bagging_fraction'] = 0.85
    params['num_leaves'] = 1000
    params['min_hessian'] = 0.05
    params['verbose'] = 0
    # Fitting the model
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=3000,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=50)
    # Predicting results
    y_pred = gbm.predict(test_sc, num_iteration=gbm.best_iteration)
    # Building the evaluation dataframe using test_index
    evaluation = DF.loc[test_index]
    evaluation['predictions_lgb'] = y_pred.tolist()
    evaluation['residuals_lgb_abs'] = abs(evaluation['rent_price'] - evaluation['predictions_lgb'])
    evaluation['residuals_lgb'] = evaluation['rent_price'] - evaluation['predictions_lgb']
    evaluation['resid_percent_lgb'] = evaluation['residuals_lgb'] * 100 / evaluation['rent_price']
    evaluation['resid_percent_lgb_abs'] = evaluation['residuals_lgb_abs'] * 100 / evaluation['rent_price']
    results = evaluation['resid_percent_lgb'].describe()
    return evaluation
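A note on versions: LightGBM 4.x removed the early_stopping_rounds argument of lgb.train. If the call above raises a TypeError on a newer installation, the equivalent form passes early stopping as a callback:
# Equivalent training call for LightGBM 4.x, where early stopping is a callback
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=3000,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=50),
                           lgb.log_evaluation(period=100)])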
And the third is KNN.
def KNN(df):
    # Separating the data into a feature matrix X and the target y
    X = df.drop(['rent_price'], axis=1)
    y = df.rent_price
    # Now splitting the dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)
    # Saving the test_y row index
    test_index = y_test.index
    steps = [('scaler', MinMaxScaler()),
             ('model', KNeighborsRegressor())]
    pipe = Pipeline(steps)
    params = {'model__n_neighbors': range(2, 3),
              'model__metric': ['minkowski', 'manhattan'],
              'scaler': [MinMaxScaler(), MaxAbsScaler()]}
    gs = GridSearchCV(pipe, param_grid=params, cv=10,
                      return_train_score=False, n_jobs=4,
                      scoring='neg_mean_squared_error')
    gs.fit(X_train, y_train)
    gs_results = pd.DataFrame(gs.cv_results_)
    y_pred_train = gs.predict(X_train)
    y_pred_test = gs.predict(X_test)
    train_MAE = mean_absolute_error(y_train, y_pred_train)
    test_MAE = mean_absolute_error(y_test, y_pred_test)
    print(f'Train MAE: {train_MAE:.3f}')
    print(f'Test MAE: {test_MAE:.3f}')
    # Building the evaluation dataframe using test_index
    evaluation = DF.loc[test_index]
    evaluation['predictions_knn'] = y_pred_test.tolist()
    evaluation['residuals_knn_abs'] = abs(evaluation['rent_price'] - evaluation['predictions_knn'])
    evaluation['residuals_knn'] = evaluation['rent_price'] - evaluation['predictions_knn']
    evaluation['resid_percent_knn_abs'] = evaluation['residuals_knn_abs'] * 100 / evaluation['rent_price']
    evaluation['resid_percent_knn'] = evaluation['residuals_knn'] * 100 / evaluation['rent_price']
    results = evaluation['resid_percent_knn'].describe()
    return evaluation
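It is also worth checking which combination the grid search actually picked; a small addition right after gs.fit(X_train, y_train) would print it:
# Inspect the winning scaler/metric/k combination and its cross-validated score
print(gs.best_params_)
print(f'Best CV RMSE: {(-gs.best_score_) ** 0.5:.3f}')  # scoring is negative MSE, so negate and take the root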
I will save each model's results in a separate dataframe for error analysis.
results_XGB = XGB(DF)
results_LGB = LGB(DF)
results_KNN = KNN(DF)
Saving CSV files with all predictions:
results_XGB.to_csv('results_XGB.csv', index=False)
results_LGB.to_csv('results_LGB.csv', index=False)
results_KNN.to_csv('results_KNN.csv', index=False)
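Before moving on to error analysis, a quick side-by-side comparison of the three models can be built from the residual columns created above:
# Mean absolute residual, in percent of the rent price, on each model's test set
comparison = pd.DataFrame({
    'XGBoost': [results_XGB['resid_percent_Xgb_abs'].mean()],
    'LightGBM': [results_LGB['resid_percent_lgb_abs'].mean()],
    'KNN': [results_KNN['resid_percent_knn_abs'].mean()]
}, index=['mean abs residual, %'])
print(comparison)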
This is the end of the modeling part. The main goal of this part was to try different ML algorithms and compare their results.
Read the fourth part.