Data Mining

Map > Problem Definition > Data Preparation > Data Exploration > Modeling > Evaluation > Classification

Evaluation - Classification

I. Data Preparation

1- Load libraries

library(data.table)
library(formattable)
library(plotrix)
library(dplyr)
library(Rtsne)
library(MASS)
library(xgboost)
library(factoextra)
library(caTools)
library(pROC)
library(caret)
library(gains)
library(lift)

2- Read Expressions file

# Load the raw log-expression table from disk.
df <- read.csv("GSE74763_rawlog_expr.csv")

# The first column is used as the row identifier; every remaining column is
# kept as data.
df2 <- df[, -1]
rownames(df2) <- df[, 1]

# Flip the table so the former columns become rows, then re-attach the
# swapped names in a single step (rows <- old column names,
# columns <- old row names).
# NOTE(review): presumably this leaves one row per sample and one column per
# probe, matching the probe IDs used in the model formula later - confirm.
expr <- transpose(df2)
dimnames(expr) <- list(colnames(df2), rownames(df2))
dim(expr)

3- Read Samples file

# Load the sample annotation table and show its column names and size.
targets <- read.csv("GSE74763_rawlog_targets.csv")
names(targets)
dim(targets)

4- Merge Expressions with Samples

# Put the expression columns and the sample annotation columns side by side.
# cbind() pairs rows purely by position.
# NOTE(review): assumes the rows of `expr` and `targets` are already in the
# same sample order - confirm against the sample IDs.
data <- cbind(expr, targets)
names(data)
dim(data)

II. Splitting Data into Training and Test Sets

# Fix the RNG seed so the train/test partition is reproducible.
set.seed(101)

d1 <- data

# Numeric class codes for the target, numbered in order of first appearance.
# Written with base R because as.fumeric() is not provided by any package
# loaded in this script, so calling it stops the run.
# NOTE(review): as.fumeric() appears to come from the rafalib package, where
# it also defaults to levels = unique(x) - confirm if exact parity matters.
groups <- as.numeric(factor(data$target, levels = unique(data$target)))

# Logical mask from caTools::sample.split(): TRUE marks a training row,
# with 80% of the rows requested for training.
# The mask is deliberately not called `sample`: that name hides
# base::sample(), and inside subset() a column of `d1` with the same name
# would be picked up instead of the mask.
is_train <- sample.split(d1$target, SplitRatio = 0.8)
train <- d1[is_train, , drop = FALSE]
test <- d1[!is_train, , drop = FALSE]

dim(train)
dim(test)

III. Logistic Regression

# Train: logistic regression of the target on three expression columns,
# fitted on the training rows only. `maxit` raises the iteration cap of the
# fitting routine to 100.
model <- glm(
  target ~ P000833 + P007414 + P002449,
  family = binomial(link = "logit"),
  data = train,
  maxit = 100
)
print(model)

# Test: predicted probabilities (response scale) for the held-out rows.
# The conversion is kept as a separate step on the variable `pb` so the
# resulting column is also named `pb`, which later sections read as pb$pb.
pb <- predict(model, newdata = test, type = "response")
pb <- as.data.frame(pb)

IV. Confusion Matrix

# Turn the predicted probabilities into hard class labels with a 0.5 cut-off.
# Predictions and reference are both factors carrying the full level set, so
# the cross-table stays 2 x 2 even when every test row is predicted as the
# same class; confusionMatrix() rejects a table that is not square.
# Assumes the target is coded 0/1, which the "0"/"1" labels and
# positive = "1" below already require.
class_levels <- c("0", "1")
pc <- factor(ifelse(pb$pb > 0.5, "1", "0"), levels = class_levels)

# On a factor, summary() reports the number of predictions per class.
summary(pc)

# Rows are predicted classes, columns are the observed classes.
xtab <- table(pc, factor(test$target, levels = class_levels))
caret::confusionMatrix(xtab, positive = "1")

V. ROC Chart

# Predicted probabilities for the held-out rows, recomputed here so this
# section can be run without the earlier ones.
pb <- predict(model, newdata = test, type = "response")
pb <- as.data.frame(pb)
labels <- test$target
scores <- pb$pb

# Build the ROC curve once and reuse it for both the plot and the AUC,
# instead of recomputing it (and repeating its console messages) per call.
# direction = "<" states that controls are expected to score lower than cases.
roc_curve <- roc(labels, scores, direction = "<")

plot(roc_curve, col = "blue", lwd = 3, main = "ROC Chart")

auc(roc_curve)

VI. Gain and Lift Charts

# Predicted probabilities for the held-out rows, recomputed here so this
# section can be run without the earlier ones.
pb <- predict(model, newdata = test, type = "response")
pb <- as.data.frame(pb)
scores <- pb$pb
labels <- test$target

# Gains table over 10 score groups: show it, then plot the same object.
gains_table <- gains(labels, scores, groups = 10)
gains_table

plot(gains_table)

# Cumulative lift chart over 10 buckets.
plotLift(scores, labels, cumulative = TRUE, n.buckets = 10)

VII. Bioada SmartArray

Watch this video to learn how to perform classification model evaluation significantly faster and more easily using Bioada SmartArray.