Madrid rental price prediction. Part III - [Modeling]

This is the third part of a full pipeline [Exploratory data analysis, data cleaning and preprocessing, modeling, error analysis]. I recommend reading the first and second parts before this one.


The full project with data is available on my GitHub.

 
# Importing libraries
import pandas as pd
import numpy as np
import mpld3
mpld3.enable_notebook()

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, pairwise_distances
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from scipy import stats

import xgboost as xgb
import lightgbm as lgb
import os
import warnings
Read our saved clean dataset

DF = pd.read_csv('df_pre_processed.csv', header='infer')

I drop three columns from the dataset. I could have removed them earlier, in the preprocessing part, but I thought I might need them for the error analysis part.

DF = DF.drop(['Neighborhood', 'District', 'rent_price_area'], axis = 1)
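
A quick sanity check (my addition, not part of the original notebook) to confirm the drop worked and to see how many missing values are left for the imputers to handle:

# Sanity check: dataset shape, confirm the three columns are gone, count remaining NaNs
print(DF.shape)
print({'Neighborhood', 'District', 'rent_price_area'} & set(DF.columns))   # should be an empty set
print(DF.isna().sum().sort_values(ascending=False).head(10))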

I'm going to try three ML algorithms: two gradient-boosted tree models and one distance-based model. Here is an article about machine learning algorithms.


The first algorithm is XGBoost.


def XGB(df):
    # Split data into train and test sets; rent_price is the target
    y = df.rent_price
    X = df.drop(['rent_price'], axis=1)
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=0)
    # Saving the test rows index for the evaluation dataframe
    test_index = test_y.index

    # Impute remaining missing values (mean imputation by default)
    my_imputer = SimpleImputer()
    train_X = my_imputer.fit_transform(train_X)
    test_X = my_imputer.transform(test_X)

    # Scaling features with StandardScaler (not required for tree models, but harmless)
    scaler = StandardScaler()
    train_sc = scaler.fit_transform(train_X)
    test_sc = scaler.transform(test_X)

    dtrain = xgb.DMatrix(train_sc, label=train_y)
    dtest = xgb.DMatrix(test_sc, label=test_y)

    # Default parameters were replaced after hyperparameter tuning
    params = {
        'colsample_bytree': 1,
        'eta': 0.01,
        'eval_metric': 'mae',
        'max_depth': 11,
        'min_child_weight': 5,
        'objective': 'reg:squarederror',  # 'reg:linear' is a deprecated alias for this
        'subsample': 0.9}

    # Fitting the model with early stopping on the test set
    model = xgb.train(params, dtrain, num_boost_round=3000,
                      evals=[(dtest, "Test")], early_stopping_rounds=50)

    # Predicting results
    tuned_xgb = model.predict(dtest)

    # Build an evaluation dataframe for error analysis using test_index
    evaluation = df.loc[test_index].copy()
    evaluation['predictions_tuned_Xgb'] = tuned_xgb.astype(int)
    evaluation['residuals_tuned_Xgb_abs'] = abs(evaluation['rent_price'] - evaluation['predictions_tuned_Xgb'])
    evaluation['residuals_tuned_Xgb'] = evaluation['rent_price'] - evaluation['predictions_tuned_Xgb']
    evaluation['resid_percent_Xgb'] = evaluation['residuals_tuned_Xgb'] * 100 / evaluation['rent_price']
    evaluation['resid_percent_Xgb_abs'] = evaluation['residuals_tuned_Xgb_abs'] * 100 / evaluation['rent_price']

    return evaluation
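
The tuned values in params came out of a search that I don't reproduce in full here. As a rough illustration, a cross-validated search over two of the parameters could look like the sketch below; the grid values are illustrative, and dtrain is assumed to be a DMatrix built exactly as inside the XGB function:

# Illustrative tuning sketch (not the exact search used): grid over max_depth and
# min_child_weight with xgb.cv, assuming dtrain is built as inside the XGB function
best_mae, best_pair = float('inf'), None
for max_depth in [7, 9, 11]:
    for min_child_weight in [1, 3, 5]:
        cv_params = {'objective': 'reg:squarederror', 'eval_metric': 'mae',
                     'eta': 0.01, 'subsample': 0.9, 'colsample_bytree': 1,
                     'max_depth': max_depth, 'min_child_weight': min_child_weight}
        cv = xgb.cv(cv_params, dtrain, num_boost_round=1000, nfold=5,
                    early_stopping_rounds=50, seed=0)
        mae = cv['test-mae-mean'].min()
        if mae < best_mae:
            best_mae, best_pair = mae, (max_depth, min_child_weight)
print('Best (max_depth, min_child_weight):', best_pair, 'CV MAE:', best_mae)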

The second is LightGBM.

def LGB(df):
    # Separating the data into a feature matrix x and the target y
    x = df.drop(['rent_price'], axis=1)
    y = df.rent_price
    # Now splitting the dataset into train and test
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

    # Saving the test rows index for the evaluation dataframe
    test_index = y_test.index

    # Note: no feature scaling here; tree-based LightGBM does not need it

    # Create the LightGBM datasets
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

    # Specify the configuration as a dict (values chosen after hyperparameter tuning)
    params = {}
    params['max_bin'] = 10
    params['learning_rate'] = 0.0021
    params['boosting_type'] = 'gbdt'
    params['objective'] = 'regression'
    params['metric'] = 'l1'
    params['sub_feature'] = 0.5        # alias for feature_fraction
    params['bagging_fraction'] = 0.85  # only takes effect if bagging_freq is also set
    params['num_leaves'] = 1000
    params['min_hessian'] = 0.05       # alias for min_sum_hessian_in_leaf
    params['verbose'] = 0

    # Fitting the model with early stopping on the evaluation set
    # (the iteration count is set via num_boost_round instead of params['num_iterations']
    # to avoid passing two conflicting values)
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=4000,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=50)

    # Predicting results with the best iteration found by early stopping
    y_pred = gbm.predict(x_test, num_iteration=gbm.best_iteration)

    # Build an evaluation dataframe for error analysis using test_index
    evaluation = df.loc[test_index].copy()
    evaluation['predictions_lgb'] = y_pred.tolist()
    evaluation['residuals_lgb_abs'] = abs(evaluation['rent_price'] - evaluation['predictions_lgb'])
    evaluation['residuals_lgb'] = evaluation['rent_price'] - evaluation['predictions_lgb']
    evaluation['resid_percent_lgb'] = evaluation['residuals_lgb'] * 100 / evaluation['rent_price']
    evaluation['resid_percent_lgb_abs'] = evaluation['residuals_lgb_abs'] * 100 / evaluation['rent_price']

    return evaluation
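
One compatibility note: in newer LightGBM releases the early_stopping_rounds argument of lgb.train was moved into a callback. If the training call above fails on your version, the equivalent call inside LGB (same parameters, only the early-stopping mechanism changes) would look roughly like this:

# Same training call with early stopping expressed as a callback
# (for LightGBM versions where early_stopping_rounds is no longer a lgb.train argument)
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=4000,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=50)])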

And the third is KNN (k-nearest neighbors).


def KNN(df):
    # Separating the data into a feature matrix X and the target y
    X = df.drop(['rent_price'], axis=1)
    y = df.rent_price
    # Now splitting the dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)

    # Saving the test rows index for the evaluation dataframe
    test_index = y_test.index

    # Pipeline: scale the features, then fit a KNN regressor
    steps = [('scaler', MinMaxScaler()),
             ('model', KNeighborsRegressor())]
    pipe = Pipeline(steps)

    # Hyperparameter grid (note: range(2, 3) only tries n_neighbors=2)
    params = {'model__n_neighbors': range(2, 3),
              'model__metric': ['minkowski', 'manhattan'],
              'scaler': [MinMaxScaler(), MaxAbsScaler()]}

    gs = GridSearchCV(pipe, param_grid=params, cv=10,
                      return_train_score=False, n_jobs=4,
                      scoring='neg_mean_squared_error')

    gs.fit(X_train, y_train)
    gs_results = pd.DataFrame(gs.cv_results_)  # full CV results, kept for inspection

    y_pred_train = gs.predict(X_train)
    y_pred_test = gs.predict(X_test)

    train_MAE = mean_absolute_error(y_train, y_pred_train)
    test_MAE = mean_absolute_error(y_test, y_pred_test)
    print(f'Train MAE: {train_MAE:.3f}')
    print(f'Test MAE: {test_MAE:.3f}')

    # Build an evaluation dataframe for error analysis using test_index
    evaluation = df.loc[test_index].copy()
    evaluation['predictions_knn'] = y_pred_test.tolist()
    evaluation['residuals_knn_abs'] = abs(evaluation['rent_price'] - evaluation['predictions_knn'])
    evaluation['residuals_knn'] = evaluation['rent_price'] - evaluation['predictions_knn']
    evaluation['resid_percent_knn_abs'] = evaluation['residuals_knn_abs'] * 100 / evaluation['rent_price']
    evaluation['resid_percent_knn'] = evaluation['residuals_knn'] * 100 / evaluation['rent_price']

    return evaluation
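
The fitted grid search stays inside the function, so here is a small standalone sketch (my addition) that reruns the same search on DF to see which scaler and distance metric win. It assumes DF has no missing values, which is the same assumption the KNN function already makes:

# Standalone rerun of the same grid search to inspect the winning combination
X = DF.drop(['rent_price'], axis=1)
y = DF.rent_price
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0, test_size=0.25)
pipe = Pipeline([('scaler', MinMaxScaler()), ('model', KNeighborsRegressor())])
grid = {'model__n_neighbors': range(2, 3),
        'model__metric': ['minkowski', 'manhattan'],
        'scaler': [MinMaxScaler(), MaxAbsScaler()]}
gs = GridSearchCV(pipe, param_grid=grid, cv=10, n_jobs=4,
                  scoring='neg_mean_squared_error').fit(X_tr, y_tr)
print(gs.best_params_)    # chosen scaler, distance metric and n_neighbors
print(-gs.best_score_)    # best cross-validated MSE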

I will run all three models and keep each one's predictions in a separate dataframe for error analysis.

results_XGB = XGB(DF)
results_LGB = LGB(DF)
results_KNN = KNN(DF)
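
Before writing the files, a quick side-by-side summary of the three models, built only from the residual columns created above (all three functions use the same train/test split, since they split the same dataframe with the same random_state):

# Compare the three models on the shared test split
comparison = pd.DataFrame({
    'XGBoost':  [results_XGB['residuals_tuned_Xgb_abs'].mean(), results_XGB['resid_percent_Xgb_abs'].median()],
    'LightGBM': [results_LGB['residuals_lgb_abs'].mean(),       results_LGB['resid_percent_lgb_abs'].median()],
    'KNN':      [results_KNN['residuals_knn_abs'].mean(),       results_KNN['resid_percent_knn_abs'].median()],
}, index=['MAE', 'Median absolute error, %'])
print(comparison)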

Saving a CSV file with each model's predictions

results_XGB.to_csv('results_XGB.csv', index=False)
results_LGB.to_csv('results_LGB.csv', index=False)
results_KNN.to_csv('results_KNN.csv', index=False)

This is the end of Modeling. The main goal of this part was to try different ML algorithms and compare the results.


Read the fourth part

