library(MASS) library(rpart) head(fgl) rp.obj <- rpart(type~Al+Ba,method="class",data=fgl) plot(rp.obj) text(rp.obj,digits=3) printcp(rp.obj) table(fgl$type) 1-table(fgl$type)/dim(fgl)[1] # root node error = apparent error at the root node (no splitting) # rel error = apparent error as a fraction of the root node error # xerror = CV error as a fraction of the root node error # First split I <- fgl$Ba >= 0.335 mean(fgl$type[I]=="Head") table(fgl$type[I==1]) # apparent accuracy of first split (sum(fgl$type[I==F]=="WinNF")+sum(fgl$type[I==T]=="Head"))/dim(fgl)[1] rp.obj <- rpart(type~Al+Ba,method="class",data=fgl,parms=list(split="gini")) printcp(rp.obj) rp.obj <- rpart(type~Al+Ba,method="class",data=fgl,parms=list(split="information")) #use deviance printcp(rp.obj) plot(rp.obj) text(rp.obj,digits=3) rp.obj <- rpart(type~.,method="class",data=fgl) plotcp(rp.obj) printcp(rp.obj) plot(rp.obj) text(rp.obj,digits=3) rp.obj <- prune(rp.obj,cp=0.04) printcp(rp.obj) plot(rp.obj) text(rp.obj,digits=3) rp.obj <- rpart(type~.,method="class",data=fgl,cp=0.04) printcp(rp.obj) plot(rp.obj) text(rp.obj,digits=3) ############################## rp.obj <- rpart(type~.,method="class",data=fgl,parms=list(split='gini'), control=list(misplit=10,minbucket=5,cp=0.005) plotcp(rp.obj) printcp(rp.obj) plot(rp.obj) text(rp.obj,digits=3) rp.obj <- rpart(type~.,method="class",data=fgl,parms=list(split='information'), control=list(misplit=10,minbucket=5,cp=0.005)) plotcp(rp.obj) printcp(rp.obj) plot(rp.obj) text(rp.obj,digits=3) library(e1071) tune.obj <- tune.rpart(type~.,data=fgl, minsplit = c(5,10,15), minbucket = c(5,10,15), cp = c(0.005,0.01)) summary(tune.obj) rp.obj <- rpart(type~.,method="class",data=fgl, control=list(misplit=10,minbucket=10,cp=0.01)) plotcp(rp.obj) printcp(rp.obj) plot(rp.obj) text(rp.obj,digits=3) n <- dim(fgl)[1] probs <- predict(rp.obj) class.pred <- rep(0,n) class.names <- colnames(probs) for (i in 1:n){ class.pred[i] <- class.names[which(probs[i,]==max(probs[i,]))]} tabl <- table(class.pred,fgl$type)[c(5,6,4,1,3,2),] 100*diag(tabl/rowSums(tabl)) rowSums(tabl) 100*sum(diag(tabl))/sum(tabl) tabl2 <- rbind(cbind(tabl,rowSums(tabl)),c(colSums(tabl),sum(sum(tabl)))) rownames(tabl2) <- c(rownames(tabl),"Total") colnames(tabl2) <- c(colnames(tabl),"Total") "Rows = actual columns = predicted" tabl2 ##################### CV r <- 100 nt <- floor(0.9*n) for (i in 1:r){ s <- sample(1:n,size=nt) u <- rep(0,nt) ct <- 1 for (j in 1:n){ if (sum(s==j)==0) { u[ct] <- j ct <- ct + 1} } rpart.obj <- rpart(type~ .,data=fgl,subset=s,method="class", control=list(misplit=10,minbucket=5,cp=0.01)) predictions <- predict(rpart.obj,newdat=fgl,type="class") if (i==1) save.m <- cbind(fgl$type[-s],predictions[-s]) if (i!=1) save.m <- rbind(save.m,cbind(fgl$type[-s],predictions[-s])) print(c("Loop ",i)) } tabl <- table(save.m[,1],save.m[,2]) options(digits=3) 100*diag(tabl/rowSums(tabl)) rowSums(tabl) 100*sum(diag(tabl))/sum(tabl) tabl2 <- rbind(cbind(tabl,rowSums(tabl)),c(colSums(tabl),sum(sum(tabl)))) rownames(tabl2) <- c(rownames(tabl),"Total") colnames(tabl2) <- c(colnames(tabl),"Total") "Rows = actual columns = predicted" tabl2 ##################### try k-NN ################ tune.obj <- tune.knn(x=fgl[,1:9], y=fgl$type, k = 1:5, tunecontrol = tune.control(sampling = "cross",cross=10)) summary(tune.obj) r <- 100 nt <- floor(0.9*n) for (i in 1:r){ s <- sample(1:n,size=nt) u <- rep(0,nt) ct <- 1 for (j in 1:n){ if (sum(s==j)==0) { u[ct] <- j ct <- ct + 1} } predictions <- knn(train=fgl[s,1:9],test=fgl[,1:9],cl=fgl[s,10],k=1) if (i==1) save.m <- cbind(fgl$type[-s],predictions[-s]) if (i!=1) save.m <- rbind(save.m,cbind(fgl$type[-s],predictions[-s])) print(c("Loop ",i)) } tabl <- table(save.m[,1],save.m[,2]) options(digits=3) 100*diag(tabl/rowSums(tabl)) rowSums(tabl) 100*sum(diag(tabl))/sum(tabl) tabl2 <- rbind(cbind(tabl,rowSums(tabl)),c(colSums(tabl),sum(sum(tabl)))) rownames(tabl2) <- c(rownames(tabl),"Total") colnames(tabl2) <- c(colnames(tabl),"Total") "Rows = actual columns = predicted" tabl2