This is the third part of a full pipeline [Exploratory data analysis, data cleaning and preprocessing, modeling, error analysis]. I recommend reading the first and second parts first.
The full project with data is available on my GitHub.
# Importing libraries
import os
import warnings
import pandas as pd
import numpy as np
import mpld3
mpld3.enable_notebook()
from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, pairwise_distances
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
import xgboost as xgb
import lightgbm as lgb
Read the saved clean dataset:
DF = pd.read_csv('df_pre_processed.csv', header='infer')
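A quick sanity check on what was loaded never hurts (the exact shape and columns depend on what was saved at the end of part two):
# Quick look at the loaded data
print(DF.shape)
DF.head()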
I delete three columns from the dataset. I could have deleted them earlier, in the preprocessing part, but I thought I might need them for the error analysis part.
DF = DF.drop(['Neighborhood', 'District', 'rent_price_area'], axis=1)
I'm going to try three ML algorithms: two gradient-boosted tree models and one distance-based model. Here is an article about machine learning algorithms.
The first algorithm is XGBoost.
def XGB(df):
    # Split the data into train and test sets + label the target value
    y = df.rent_price
    X = df.drop(['rent_price'], axis=1)
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=0)
    # Saving the test_y row index
    test_index = test_y.index
    my_imputer = SimpleImputer()
    train_X = my_imputer.fit_transform(train_X)
    test_X = my_imputer.transform(test_X)
    # Scaling features with StandardScaler
    scaler = StandardScaler()
    train_sc = scaler.fit_transform(train_X)
    test_sc = scaler.transform(test_X)
    dtrain = xgb.DMatrix(train_sc, label=train_y)
    dtest = xgb.DMatrix(test_sc, label=test_y)
    # The default parameters were changed after hyperparameter tuning.
    params = {
        'colsample_bytree': 1,
        'eta': 0.01,
        'eval_metric': 'mae',
        'max_depth': 11,
        'min_child_weight': 5,
        'objective': 'reg:linear',  # deprecated alias; newer XGBoost versions use 'reg:squarederror'
        'subsample': 0.9}
    # Fitting the model
    model = xgb.train(params, dtrain, num_boost_round=3000, evals=[(dtest, "Test")], early_stopping_rounds=50)
    # Predicting results
    tuned_xgb = model.predict(dtest)
    tuned_xgb_tr = model.predict(dtrain)
    # Now we create a new dataframe for evaluation using test_index
    evaluation = DF.loc[test_index]
    evaluation['predictions_tuned_Xgb'] = tuned_xgb.astype(int)
    evaluation['residuals_tuned_Xgb_abs'] = abs(evaluation['rent_price'] - evaluation['predictions_tuned_Xgb'])
    evaluation['residuals_tuned_Xgb'] = evaluation['rent_price'] - evaluation['predictions_tuned_Xgb']
    evaluation['resid_percent_Xgb'] = evaluation['residuals_tuned_Xgb'] * 100 / evaluation['rent_price']
    evaluation['resid_percent_Xgb_abs'] = evaluation['residuals_tuned_Xgb_abs'] * 100 / evaluation['rent_price']
    return evaluation
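The tuning step itself is not shown here. As an illustrative sketch only (the grid below is an assumption, not the grid I actually searched), max_depth and min_child_weight could be tuned with xgb.cv, assuming dtrain and params from the function above are in scope:
# Illustrative tuning sketch with xgb.cv; assumes dtrain and params are in scope.
# The grid values are examples, not the grid used to obtain the parameters above.
best_mae, best_pair = float('inf'), None
for max_depth in [7, 9, 11]:
    for min_child_weight in [1, 3, 5]:
        cv_params = dict(params, max_depth=max_depth, min_child_weight=min_child_weight)
        cv_results = xgb.cv(cv_params, dtrain, num_boost_round=3000, nfold=5,
                            metrics='mae', early_stopping_rounds=50, seed=0)
        mae = cv_results['test-mae-mean'].min()
        if mae < best_mae:
            best_mae, best_pair = mae, (max_depth, min_child_weight)
print(f'Best (max_depth, min_child_weight): {best_pair}, CV MAE: {best_mae:.3f}')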
The second is LightGBM.
def LGB(df):
    # Separating the data into a feature matrix x and the target y
    x = df.drop(['rent_price'], axis=1)
    y = df.rent_price
    # Now splitting the dataset into train and test
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)
    # Saving the test_y row index
    test_index = y_test.index
    # Scaling features with StandardScaler
    scaler = StandardScaler()
    train_sc = scaler.fit_transform(x_train)
    test_sc = scaler.transform(x_test)
    # Create datasets for LightGBM from the scaled features
    lgb_train = lgb.Dataset(train_sc, y_train)
    lgb_eval = lgb.Dataset(test_sc, y_test, reference=lgb_train)
    # Specify the configuration as a dict
    params = {}
    params['max_bin'] = 10
    params['learning_rate'] = 0.0021
    params['num_iterations'] = 4000  # this alias overrides num_boost_round passed to lgb.train
    params['boosting_type'] = 'gbdt'
    params['objective'] = 'regression'
    params['metric'] = 'l1'
    params['sub_feature'] = 0.5
    params['bagging_fraction'] = 0.85
    params['num_leaves'] = 1000
    params['min_hessian'] = 0.05
    params['verbose'] = 0
    # Fitting the model
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=3000,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=50)
    # Predicting results
    y_pred = gbm.predict(test_sc, num_iteration=gbm.best_iteration)
    # Building the evaluation dataframe using test_index
    evaluation = DF.loc[test_index]
    evaluation['predictions_lgb'] = y_pred.tolist()
    evaluation['residuals_lgb_abs'] = abs(evaluation['rent_price'] - evaluation['predictions_lgb'])
    evaluation['residuals_lgb'] = evaluation['rent_price'] - evaluation['predictions_lgb']
    evaluation['resid_percent_lgb'] = evaluation['residuals_lgb'] * 100 / evaluation['rent_price']
    evaluation['resid_percent_lgb_abs'] = evaluation['residuals_lgb_abs'] * 100 / evaluation['rent_price']
    results = evaluation['resid_percent_lgb'].describe()
    return evaluation
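A note on versions: LightGBM 4.x removed the early_stopping_rounds argument of lgb.train. If the call above raises a TypeError on a newer installation, the equivalent form passes early stopping as a callback:
# Equivalent training call for LightGBM 4.x, where early stopping is a callback
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=3000,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=50),
                           lgb.log_evaluation(period=100)])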
And the third is KNN.
def KNN(df):
    # Separating the data into a feature matrix X and the target y
    X = df.drop(['rent_price'], axis=1)
    y = df.rent_price
    # Now splitting the dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)
    # Saving the test_y row index
    test_index = y_test.index
    steps = [('scaler', MinMaxScaler()),
             ('model', KNeighborsRegressor())]
    pipe = Pipeline(steps)
    params = {'model__n_neighbors': range(2, 3),
              'model__metric': ['minkowski', 'manhattan'],
              'scaler': [MinMaxScaler(), MaxAbsScaler()]}
    gs = GridSearchCV(pipe, param_grid=params, cv=10,
                      return_train_score=False, n_jobs=4,
                      scoring='neg_mean_squared_error')
    gs.fit(X_train, y_train)
    gs_results = pd.DataFrame(gs.cv_results_)
    y_pred_train = gs.predict(X_train)
    y_pred_test = gs.predict(X_test)
    train_MAE = mean_absolute_error(y_train, y_pred_train)
    test_MAE = mean_absolute_error(y_test, y_pred_test)
    print(f'Train MAE: {train_MAE:.3f}')
    print(f'Test MAE: {test_MAE:.3f}')
    # Building the evaluation dataframe using test_index
    evaluation = DF.loc[test_index]
    evaluation['predictions_knn'] = y_pred_test.tolist()
    evaluation['residuals_knn_abs'] = abs(evaluation['rent_price'] - evaluation['predictions_knn'])
    evaluation['residuals_knn'] = evaluation['rent_price'] - evaluation['predictions_knn']
    evaluation['resid_percent_knn_abs'] = evaluation['residuals_knn_abs'] * 100 / evaluation['rent_price']
    evaluation['resid_percent_knn'] = evaluation['residuals_knn'] * 100 / evaluation['rent_price']
    results = evaluation['resid_percent_knn'].describe()
    return evaluation
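It is also worth checking which combination the grid search actually picked; a small addition right after gs.fit(X_train, y_train) would print it:
# Inspect the winning scaler/metric/k combination and its cross-validated score
print(gs.best_params_)
print(f'Best CV RMSE: {(-gs.best_score_) ** 0.5:.3f}')  # scoring is negative MSE, so negate and take the root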
I will save each model's results in a separate dataframe for error analysis.
results_XGB = XGB(DF)
results_LGB = LGB(DF)
results_KNN = KNN(DF)
Saving CSV files with all predictions:
results_XGB.to_csv('results_XGB.csv', index=False)
results_LGB.to_csv('results_LGB.csv', index=False)
results_KNN.to_csv('results_KNN.csv', index=False)
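Before moving on to error analysis, a quick side-by-side comparison of the three models can be built from the residual columns created above:
# Mean absolute residual, in percent of the rent price, on each model's test set
comparison = pd.DataFrame({
    'XGBoost': [results_XGB['resid_percent_Xgb_abs'].mean()],
    'LightGBM': [results_LGB['resid_percent_lgb_abs'].mean()],
    'KNN': [results_KNN['resid_percent_knn_abs'].mean()]
}, index=['mean abs residual, %'])
print(comparison)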
This is the end of the modeling part. The main goal of this part was to try different ML algorithms and compare their results.
Read the fourth part.