I am trying to build a model to predict housing prices in R R version 4.0.2 (2020-06-22), with the latest updates.
The code runs fine without errors until I tried to call the predict function on the unknown data to generate a prediction.
My data came from the Kaggle Housing Competition Web site [https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data](Kaggle Data Source)
library(knitr)
library(ggplot2)
library(plyr)
library(dplyr)
library(corrplot)
library(caret)
library(gridExtra)
library(scales)
library(Rmisc)
library(ggrepel)
library(randomForest)
library(psych)
library(xgboost)
library(mltools)
library(glmnet)
trainAll <- read.csv("./data/train.csv", na.strings = "NA", fill = T,
strip.white = FALSE, blank.lines.skip = TRUE, colClasses = NA,
header = TRUE, nrows = -1, skipNul = TRUE, stringsAsFactors = FALSE)
testAll <- read.csv("./data/test.csv", na.strings = "NA", fill = T,
strip.white = FALSE, blank.lines.skip = TRUE, colClasses = NA,
header = TRUE, nrows = -1, skipNul = TRUE, stringsAsFactors = FALSE)
dim(trainAll);dim(testAll)
y_train <- log1p(trainAll$SalePrice)
test_labels <- testAll$Id
testAll$Id <- NULL
trainAll$Id <- NULL
testAll$SalePrice <- NA
df_All <- rbind(trainAll, testAll)
# str(df_train);dim(df_train)
x_train <- NULL
x_test <- NULL
process <- function(df_all)
df <- df_All %>%
select(c("MSSubClass", "LotArea", "LotConfig","Neighborhood","HouseStyle","OverallQual","YearBuilt","YearRemodAdd","YrSold",
"Exterior1st","Exterior2nd","Foundation","BsmtQual","BsmtFinType1","TotalBsmtSF","CentralAir","GrLivArea",
"BsmtFullBath","FullBath","BedroomAbvGr","KitchenQual","TotRmsAbvGrd","Fireplaces","GarageYrBlt"))
df$Remod <- ifelse(df$YearBuilt==df$YearRemodAdd, 0, 1)
df$Age <- as.numeric(df$YrSold)-as.numeric(df$YearRemodAdd)
df <- df %>%
select(-matches(c("YearBuilt","YrSold", "Remod","Remod")))
# Replace NAs with most frequent (Non-Numeric)
df[, sapply(df, function(x) !is.numeric(x))] <- apply(df[, sapply(df, function(x)
!is.numeric(x))], 2, function(x) {
x[is.na(x)] <- names(sort(table(x), decreasing = TRUE)[1]); x
# Replace NAs with most frequent (Numeric)
list_na <- colnames(df)[ apply(df, 2, anyNA) ]
# Create mean
for (na in list_na)
df[,na] <- ifelse(is.na(df[,na]), round(mean(df[,na], na.rm=TRUE), 0), df[,na])
df_t_int <- select_if(df, is.numeric)
df_t_chr <- select_if(df, is.character)
# dummify the Character data
dmy <- dummyVars(" ~ .", data = df_t_chr)
df_t_chr <- as.data.frame(predict(dmy, newdata = df_t_chr))
DFnumeric <- as.data.frame(cbind(df_t_int,df_t_chr))
# Remove the Skew and Normalize
for(i in 1:ncol(DFnumeric)){
if (abs(skew(DFnumeric[,i]))>0.8){
DFnumeric[,i] <- log(DFnumeric[,i] +1)
processed <- as.data.frame(scale(DFnumeric))
x_train <<- processed[1:1460,]
x_train <<- cbind(x_train, SalePrice = y_train)
#print(names(x_train))
x_test <<- processed[1461:2919,]
x_test <<- cbind(x_test, SalePrice = 0)
#print(names(x_test))
process(df_All)
My XGBoost section:
############################## XGBoost Tuned ####################################
dtrain <- xgb.DMatrix(data = as.matrix(x_train), label= x_train$SalePrice)
dtest <- xgb.DMatrix(data = as.matrix(x_test))
default_param<-list(
objective = "reg:squarederror",
booster = "gbtree",
eta=0.01, #default = 0.3
gamma=0,
max_depth=4, #default=6
min_child_weight=2, #default=1
subsample=1,
colsample_bytree=1
xgbcv <- xgb.cv( params = default_param, data = dtrain,
nrounds = 500, nfold = 5, showsd = T,
stratified = T, print_every_n = 40,
early_stopping_rounds = 10, maximize = F)
xgb_mod <- xgb.train(data = dtrain, params=default_param, nrounds = 454)
The xgb.train ran fine. If I use the same x_train data to predict, it works fine. However, it fails when I try to use the dtest data set:
XGBpred <- predict(xgb_mod, dtest)
Would you please advise on what I would need to do to proceed without errors?
Hi, Thank you for the response. I followed it, but something strange happened. All the predictions are the same:
[1] 10.41415 10.41415 10.41415 10.41415 10.41415 10.41415 10.41415 10.41415
## [9] 10.41415 10.41415 10.41415 10.41415 10.41415 10.41415 10.41415 10.41415
## [17] 10.41415 10.41415 10.41415 10.41415 10.41415 10.41415 10.41415 10.41415
## [25] 10.41415 10.41415 10.41415 10.41415 10.41415 10.41415 10.41415 10.41415
Hi, I have an update:
I fixed the colnames issue by updating this block of code. I also corrected it above...
x_train <<- processed[1:1460,]
x_train <<- cbind(x_train, SalePrice = y_train)
#print(names(x_train))
#print("Updated x_test column names to match x_train")
x_test <<- processed[1461:2919,]
x_test <<- cbind(x_test, SalePrice = 0)
#print(names(x_test))
Nevertheless, the prediction of the unknown data still generate the same value for all label values.
### Run Predictions
```{r}
XGBpred <- expm1(predict(xgb_mod, dtrain))
print(head(XGBpred, 5))
Results:
[1] 184258.9 160477.3 196703.9 124443.1 219499.4
XGBactual <- expm1(predict(xgb_mod, dtest))
print(head(XGBactual, 5))
Results:
[1] 33327.03 33327.03 33327.03 33327.03 33327.03