#############
## Code of Slides: Predictive Analytics in R
#############

####### Section: Regression Metrics

##
trueVals <- c(10.2,-3,5.4,3,-43,21,
              32.4,10.4,-65,23)
preds <- c(13.1,-6,0.4,-1.3,-30,1.6,
           3.9,16.2,-6,20.4)
mse <- mean((trueVals-preds)^2)
mse
rmse <- sqrt(mse)
rmse
mae <- mean(abs(trueVals-preds))
mae
nmse <- sum((trueVals-preds)^2) / sum((trueVals-mean(trueVals))^2)
nmse

##
nmae <- sum(abs(trueVals-preds)) / sum(abs(trueVals-mean(trueVals)))
nmae
mape <- mean(abs(trueVals-preds)/abs(trueVals)) # abs() in the denominator: some true values are negative
mape
smape <- 1/length(preds) * sum(abs(preds - trueVals) / (abs(preds)+abs(trueVals)))
smape
corr <- cor(trueVals,preds)
corr
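## The metrics above can be bundled into a small convenience function.
## This is a sketch added for convenience, not part of the original slides;
## the name regrMetrics and its output format are my own choices.
regrMetrics <- function(trues,preds) {
  e <- trues - preds
  c(mse  = mean(e^2),
    rmse = sqrt(mean(e^2)),
    mae  = mean(abs(e)),
    nmse = sum(e^2) / sum((trues-mean(trues))^2),
    corr = cor(trues,preds))
}
regrMetrics(trueVals,preds)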
####### Section: Multiple Linear Regression

##
library(DMwR2)
data(algae)
algae <- algae[-c(62,199),]          # remove the 2 incomplete samples
clean.algae <- knnImputation(algae)  # lm() does not handle NAs!
la1 <- lm(a1 ~ .,clean.algae[,1:12])
la1

##
summary(la1)

##
final.la1 <- step(la1)

##
summary(final.la1)

##
clean.test.algae <- knnImputation(test.algae, k = 10,
                                  distData = clean.algae[, 1:11])
preds <- predict(final.la1,clean.test.algae)
mean((preds-algae.sols$a1)^2)

##
plot(algae.sols$a1,preds,main='Errors Scatterplot',
     ylab='Predicted Values',xlab='True Values')
abline(0,1,col='red',lty=2)

####### Section: Support Vector Machines (SVMs)

##
library(e1071)
data(Glass,package='mlbench')
tr <- Glass[1:200,]
ts <- Glass[201:214,]
s <- svm(Type ~ .,tr)
predict(s,ts)

##
ps <- predict(s,ts)
table(ps,ts$Type)
mc <- table(ps,ts$Type)
error <- 100*(1-sum(diag(mc))/sum(mc))
error

##
library(e1071)
data(Boston,package='MASS')
set.seed(1234)
sp <- sample(1:nrow(Boston),354)
tr <- Boston[sp,]
ts <- Boston[-sp,]
s <- svm(medv ~ .,tr,cost=10,epsilon=0.02)
preds <- predict(s,ts)
mean((ts$medv-preds)^2)

##
plot(ts$medv,preds,main='Errors Scatterplot',
     ylab='Predictions',xlab='True')
abline(0,1,col='red',lty=2)

####### Section: k-Nearest Neighbors

##
library(class)
set.seed(1234)
sp <- sample(1:150,100)
tr <- iris[sp,]
ts <- iris[-sp,]
nn3 <- knn(tr[,-5],ts[,-5],tr[,5],k=3)
(mtrx <- table(nn3,ts$Species))
(err <- 1-sum(diag(mtrx))/sum(mtrx))

##
library(class)
library(DMwR2)
set.seed(1234)
sp <- sample(1:150,100)
tr <- iris[sp,]
ts <- iris[-sp,]
nn3 <- kNN(Species ~ .,tr,ts,k=3,stand=TRUE)
(mtrx <- table(nn3,ts$Species))
(err <- 1-sum(diag(mtrx))/sum(mtrx))

##
trials <- c(1,3,5,7,11,13,15)
nreps <- 10
res <- matrix(NA,nrow=length(trials),ncol=2)
for(k in seq_along(trials)) {
  errs <- rep(0,nreps)
  for(r in 1:nreps) {
    sp <- sample(1:150,100)
    tr <- iris[sp,]
    ts <- iris[-sp,]
    nn3 <- kNN(Species ~ .,tr,ts,k=trials[k],stand=TRUE) # stand=TRUE, as above
    mtrx <- table(nn3,ts$Species)
    errs[r] <- 1-sum(diag(mtrx))/sum(mtrx)
  }
  res[k,] <- c(mean(errs),sd(errs))
}
dimnames(res) <- list(paste('k',trials,sep='='),c('avg','std'))
res

##
library(caret)
data(Boston,package="MASS")
set.seed(1234)
sp <- sample(1:506,354)
tr <- Boston[sp,]
ts <- Boston[-sp,]
tgt <- which(colnames(Boston) == "medv")
nn3 <- knnreg(tr[,-tgt],tr[,tgt],k=3)
pnn3 <- predict(nn3,ts[,-tgt])
(mse <- mean((pnn3-ts[,tgt])^2))
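## Note that class::knn() computes Euclidean distances on the raw features,
## so variables on larger scales dominate the distance (DMwR2::kNN() handles
## this via stand=TRUE). A minimal sketch of doing the standardization by
## hand with scale() -- this step is my addition, not part of the original
## slides:
library(class)
set.seed(1234)
sp <- sample(1:150,100)
tr <- iris[sp,]
ts <- iris[-sp,]
## scale the test set with the means/sds of the training set
trX <- scale(tr[,-5])
tsX <- scale(ts[,-5],
             center=attr(trX,"scaled:center"),
             scale=attr(trX,"scaled:scale"))
nn3s <- knn(trX,tsX,tr[,5],k=3)
table(nn3s,ts$Species)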
####### Section: Tree-based Models

##
library(DMwR2)
library(rpart.plot)
data(Glass,package='mlbench')
ac <- rpartXse(Type ~ .,Glass)
prp(ac,type=4,extra=101)

##
tr <- Glass[1:200,]
ts <- Glass[201:214,]
ac <- rpartXse(Type ~ .,tr)
predict(ac,ts)

##
predict(ac,ts,type='class')
ps <- predict(ac,ts,type='class')
table(ps,ts$Type)
mc <- table(ps,ts$Type)
err <- 100*(1-sum(diag(mc))/sum(mc))
err

##
library(DMwR2)
library(rpart.plot)
load('carInsurance.Rdata')
d <- ins[,-1]
ar <- rpartXse(normLoss ~ .,d)
prp(ar,type=4,extra=101)

##
tr <- d[1:150,]    # d is the car insurance data here
ts <- d[151:205,]
arv <- rpartXse(normLoss ~ .,tr)
preds <- predict(arv,ts)
mae <- mean(abs(preds-ts$normLoss),na.rm=TRUE)
mae
mape <- mean(abs(preds-ts$normLoss)/ts$normLoss,na.rm=TRUE)
mape

##
library(DMwR2)
library(rpart.plot)
data(algae)
d <- algae[,1:12]
ar <- rpartXse(a1 ~ .,d)

##
prp(ar,type=4,extra=101)

##
tr <- d[1:150,]
ts <- d[151:200,]
ar <- rpartXse(a1 ~ .,tr)
preds <- predict(ar,ts)
mae <- mean(abs(preds-ts$a1))
mae
cr <- cor(preds,ts$a1)
cr

####### Section: Model Ensembles

####### Section: Model Ensembles and Random Forests

##
simpleBagging <- function(form,data,model='rpartXse',nModels=100,...) {
  ms <- list()
  n <- nrow(data)
  for(i in 1:nModels) {
    tr <- sample(n,n,replace=TRUE)
    ms[[i]] <- do.call(model,c(list(form,data[tr,]),...))
  }
  ms
}
predict.simpleBagging <- function(models,test) {
  ps <- sapply(models,function(m) predict(m,test))
  apply(ps,1,mean)
}

##
data(Boston,package='MASS')
library(DMwR2)
set.seed(123)
trPerc <- 0.7
sp <- sample(1:nrow(Boston),as.integer(trPerc*nrow(Boston)))
tr <- Boston[sp,]
ts <- Boston[-sp,]
m <- simpleBagging(medv ~ .,tr,nModels=300,se=0.5)
ps <- predict.simpleBagging(m,ts)
mean(abs(ps-ts$medv))

##
library(adabag)
data(iris)
set.seed(1234)
trPerc <- 0.7
sp <- sample(1:nrow(iris),as.integer(trPerc*nrow(iris)))
tr <- iris[sp,]
ts <- iris[-sp,]
m <- bagging(Species ~ ., tr,mfinal=50)
ps <- predict(m,ts)
ps$confusion
##                Observed Class
## Predicted Class setosa versicolor virginica
##      setosa         11          0         0
##      versicolor      0         20         1
##      virginica       0          1        12
ps$error
## [1] 0.04444444

##
randPreds <- function(tgtName,data,model='rpartXse',
                      nVars=(ncol(data)-1)%/%2,nModels=20,...) {
  np <- ncol(data)-1
  if (np <= nVars)
    stop(paste("Not enough columns in the data to select",
               nVars,"variables"))
  tgtCol <- which(colnames(data) == tgtName)
  preds <- (1:ncol(data))[-tgtCol]
  ms <- list()
  for(i in 1:nModels) {
    cols <- sample(preds,nVars)
    form <- as.formula(paste(paste(names(data)[tgtCol],'~'),
                             paste(names(data)[cols],collapse='+')))
    ms[[i]] <- do.call(model,c(list(form,data),...))
  }
  ms
}
predict.randPreds <- function(models,test) {
  ps <- sapply(models,function(m) predict(m,test))
  apply(ps,1,mean)
}

##
data(Boston,package='MASS')
library(DMwR2)
set.seed(123)
trPerc <- 0.7
sp <- sample(1:nrow(Boston),as.integer(trPerc*nrow(Boston)))
tr <- Boston[sp,]
ts <- Boston[-sp,]
m <- randPreds("medv",tr,nModels=300,se=0.5)
ps <- predict.randPreds(m,ts)
mean(abs(ps-ts$medv))

##
library(randomForest)
data(Boston,package="MASS")
samp <- sample(1:nrow(Boston),354)
tr <- Boston[samp,]
ts <- Boston[-samp,]
m <- randomForest(medv ~ ., tr)
ps <- predict(m,ts)
mean(abs(ts$medv-ps))

##
data(Glass,package='mlbench')
set.seed(1234)
sp <- sample(1:nrow(Glass),150)
tr <- Glass[sp,]
ts <- Glass[-sp,]
m <- randomForest(Type ~ ., tr,ntree=3000)
ps <- predict(m,ts)
table(ps,ts$Type)
mc <- table(ps,ts$Type)
err <- 100*(1-sum(diag(mc))/sum(mc))
err

##
data(Boston,package='MASS')
library(randomForest)
m <- randomForest(medv ~ ., Boston, importance=TRUE)
importance(m)

##
varImpPlot(m,main="Feature Relevance Scores")

##
library(adabag)
data(iris)
set.seed(1234)
trPerc <- 0.7
sp <- sample(1:nrow(iris),as.integer(trPerc*nrow(iris)))
tr <- iris[sp,]
ts <- iris[-sp,]
m <- boosting(Species ~ ., tr)
ps <- predict(m,ts)
ps$confusion
ps$error

##
library(adabag)
data(BreastCancer,package="mlbench")
set.seed(1234)
trPerc <- 0.7
sp <- sample(1:nrow(BreastCancer),as.integer(trPerc*nrow(BreastCancer)))
tr <- BreastCancer[sp,-1]
ts <- BreastCancer[-sp,-1]
m <- bagging(Class ~ ., tr,mfinal=100)
ps <- predict(m,ts)
ptr <- errorevol(m,tr)
pts <- errorevol(m,ts)

##
plot(ptr$error,type="l",xlab="nr.models",ylab="error",ylim=c(0,0.1))
lines(pts$error,col="red")

##
library(gbm)
data(Boston,package='MASS')
set.seed(1234)
trPerc <- 0.7
sp <- sample(1:nrow(Boston),as.integer(trPerc*nrow(Boston)))
tr <- Boston[sp,]
ts <- Boston[-sp,]
m <- gbm(medv ~ .,distribution='gaussian',data=tr,
         n.trees=20000,verbose=FALSE)
ps <- predict(m,ts,type='response',n.trees=20000)
mean(abs(ps-ts$medv))

####### Section: Performance Estimation

####### Section: The Holdout Method

##
library(DMwR2)
set.seed(1234)
data(Boston,package='MASS')
## random selection of the holdout
trPerc <- 0.7
sp <- sample(1:nrow(Boston),as.integer(trPerc*nrow(Boston)))
## division in two samples
tr <- Boston[sp,]
ts <- Boston[-sp,]
## obtaining the model and respective predictions on the test set
m <- rpartXse(medv ~ .,tr)
p <- predict(m,ts)
## evaluation
mean((ts$medv-p)^2)

####### Section: Cross Validation
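##
## The original code file has no example in this section, so below is a
## minimal sketch of k-fold cross validation coded by hand (10 folds,
## rpartXse on Boston), added for completeness -- the performanceEstimation
## package used later automates this.
library(DMwR2)
data(Boston,package='MASS')
set.seed(1234)
k <- 10
## randomly assign each case to one of the k folds
folds <- sample(rep(1:k,length.out=nrow(Boston)))
scores <- vector("numeric",length=k)
for(f in 1:k) {
  tr <- Boston[folds != f,]
  ts <- Boston[folds == f,]
  m <- rpartXse(medv ~ .,tr)
  p <- predict(m,ts)
  scores[f] <- mean((ts$medv-p)^2)
}
mean(scores)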
####### Section: Bootstrap

##
data(Boston,package='MASS')
nreps <- 200
scores <- vector("numeric",length=nreps)
n <- nrow(Boston)
set.seed(1234)
for(i in 1:nreps) {
  # random sample with replacement
  sp <- sample(n,n,replace=TRUE)
  # data splitting (the out-of-bag cases form the test set)
  tr <- Boston[sp,]
  ts <- Boston[-sp,]
  # model learning and prediction
  m <- lm(medv ~ .,tr)
  p <- predict(m,ts)
  # evaluation
  scores[i] <- mean((ts$medv-p)^2)
}
# distribution of the estimated scores
summary(scores)

####### Section: The Infra-Structure of package performanceEstimation

##
install.packages("performanceEstimation")

##
library(devtools)  # You need to install this package before!
install_github("ltorgo/performanceEstimation",ref="develop")

##
library(performanceEstimation)
library(DMwR2)
data(Boston,package='MASS')
res <- performanceEstimation(
  PredTask(medv ~ .,Boston),
  Workflow("standardWF",learner="rpartXse"),
  EstimationTask(metrics="mse",method=CV(nReps=1,nFolds=10)))

##
summary(res)

##
plot(res)

##
data(iris)
PredTask(Species ~ ., iris)
PredTask(Species ~ ., iris,"IrisDS",copy=TRUE)

##
library(e1071)
Workflow("standardWF",learner="svm",learner.pars=list(cost=10,gamma=0.1))

##
Workflow(learner="svm",learner.pars=list(cost=5))

##
data(algae,package="DMwR2")
res <- performanceEstimation(
  PredTask(a1 ~ .,algae[,1:12],"A1"),
  Workflow(learner="lm",pre="centralImp",post="onlyPos"),
  EstimationTask("mse",method=CV())  # defaults to 1x10-fold CV
)

##
library(e1071)
data(Boston,package="MASS")
res2 <- performanceEstimation(
  PredTask(medv ~ .,Boston),
  workflowVariants(learner="svm",
                   learner.pars=list(cost=1:5,gamma=c(0.1,0.01))),
  EstimationTask(metrics="mse",method=CV()))

##
summary(res2)

##
getWorkflow("svm.v1",res2)
topPerformers(res2)

##
plot(res2)

##
EstimationTask(metrics=c("F","rec","prec"),method=Bootstrap(nReps=100))

##
library(randomForest)
library(e1071)
res3 <- performanceEstimation(
  PredTask(medv ~ ., Boston),
  workflowVariants("standardWF",
                   learner=c("rpartXse","svm","randomForest")),
  EstimationTask(metrics="mse",method=CV(nReps=2,nFolds=5)))

##
rankWorkflows(res3,3)

##
plot(res3)

##
data(Glass,package='mlbench')
res4 <- performanceEstimation(
  PredTask(Type ~ ., Glass),
  workflowVariants(learner="svm",  # You may omit "standardWF" !
                   learner.pars=list(cost=c(1,10),
                                     gamma=c(0.1,0.01))),
  EstimationTask(metrics="err",method=Holdout(nReps=5,hldSz=0.3)))

##
plot(res4)

##
data(Glass,package='mlbench')
data(iris)
res5 <- performanceEstimation(
  c(PredTask(Type ~ ., Glass),PredTask(Species ~ .,iris)),
  c(workflowVariants(learner="svm",
                     learner.pars=list(cost=c(1,10),
                                       gamma=c(0.1,0.01))),
    workflowVariants(learner="rpartXse",
                     learner.pars=list(se=c(0,0.5,1)),
                     predictor.pars=list(type="class"))),
  EstimationTask(metrics="err",method=CV(nReps=3)))

##
plot(res5)

##
topPerformers(res5)
topPerformer(res5,"err","Glass.Type")

##
library(performanceEstimation)
library(DMwR2)  # because of rpartXse
data(Boston,package="MASS")
res <- performanceEstimation(
  PredTask(medv ~ .,Boston),
  workflowVariants(learner="rpartXse",learner.pars=list(se=c(0,0.5,1))),
  EstimationTask(metrics="mse",method=CV(nReps=3,nFolds=10)))

##
pres <- pairedComparisons(res)
pres$mse$WilcoxonSignedRank.test

##
signifDiffs(pres,p.limit=0.05)

##
library(performanceEstimation)
library(e1071)
library(randomForest)
tgts <- 12:18
tasks <- c()
for(t in tgts)
  tasks <- c(tasks,
             PredTask(as.formula(paste(colnames(algae)[t],'~ .')),
                      algae[,c(1:11,t)],
                      paste0("algaA",t-11),
                      copy=TRUE))
res.algae <- performanceEstimation(
  tasks,
  workflowVariants(learner=c("svm","lm","randomForest"),
                   pre="knnImp"),
  EstimationTask("mae",method=CV())
)

##
pres <- pairedComparisons(res.algae)
pres$mae$F.test

##
pres$mae$Nemenyi.test

##
CDdiagram.Nemenyi(pres)

##
pres <- pairedComparisons(res.algae,baseline="lm")
pres$mae$BonferroniDunn.test

##
CDdiagram.BD(pres)
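##
## For intuition on what pairedComparisons() does under the hood, the sketch
## below (my addition, not part of the original slides) collects per-fold MSE
## scores of two models under the same 10-fold partition and compares them
## with a paired Wilcoxon signed-rank test using base R only.
library(DMwR2)
data(Boston,package='MASS')
set.seed(1234)
k <- 10
folds <- sample(rep(1:k,length.out=nrow(Boston)))
mse.tree <- mse.lm <- vector("numeric",length=k)
for(f in 1:k) {
  tr <- Boston[folds != f,]
  ts <- Boston[folds == f,]
  mse.tree[f] <- mean((ts$medv-predict(rpartXse(medv ~ .,tr),ts))^2)
  mse.lm[f]   <- mean((ts$medv-predict(lm(medv ~ .,tr),ts))^2)
}
## paired test: both models were scored on exactly the same folds
wilcox.test(mse.tree,mse.lm,paired=TRUE)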