# Loading R libraries
library(e1071)
library(kknn)
library(MASS)
library(class)
library(rpart)
library(randomForest)
library(ada)
# Loading K-folds library
library(caret)
# Loading FactoMineR library
library(FactoMineR)
# Loading plotting libraries
library(ggplot2)
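The cross-validation below assigns folds at random, so results vary slightly between runs; a minimal sketch to make them reproducible (the seed value 42 is an arbitrary choice, not part of the original analysis):
# Fix the RNG seed so that fold creation, random forests and boosting are reproducible
set.seed(42)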
Loading the dataset:
file_path <- "ML_CHD_Prediction/data/SAheart.csv"
# stringsAsFactors = TRUE ensures famhist and chd are read as factors,
# as required by the classifiers below (R >= 4.0 defaults to FALSE)
saha_data <- read.csv(file_path, header = TRUE, sep = ',', dec = '.', stringsAsFactors = TRUE)
# Show the data frame dimensions (rows, columns)
dim(saha_data)
## [1] 462 10
# Note: famhist -> Present = 1, Absent = 0
head(saha_data, n = 10)
## sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
## 1 160 12.00 5.73 23.11 Present 49 25.30 97.20 52 Yes
## 2 144 0.01 4.41 28.61 Absent 55 28.87 2.06 63 Yes
## 3 118 0.08 3.48 32.28 Present 52 29.14 3.81 46 No
## 4 170 7.50 6.41 38.03 Present 51 31.99 24.26 58 Yes
## 5 134 13.60 3.50 27.78 Present 60 25.99 57.34 49 Yes
## 6 132 6.20 6.47 36.21 Present 62 30.77 14.14 45 No
## 7 142 4.05 3.38 16.20 Absent 59 20.81 2.62 38 No
## 8 114 4.08 4.59 14.60 Present 62 23.11 6.72 58 Yes
## 9 114 0.00 3.83 19.40 Present 49 24.86 2.49 29 No
## 10 132 0.00 5.80 30.96 Present 69 30.11 0.00 53 Yes
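As an illustration of the coding noted above, famhist can be viewed numerically; a sketch only, where famhist_num is a hypothetical helper vector kept outside the data frame so the column layout used below is unchanged:
# Numeric view of famhist: Present = 1, Absent = 0 (illustrative, not used below)
famhist_num <- ifelse(saha_data$famhist == "Present", 1, 0)
table(saha_data$famhist, famhist_num)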
# famhist and chd are excluded from the boxplot because they are dichotomous variables
boxplot(x = saha_data[, c(-5, -10)], range = 2, border = c("blue", "green", "black", "orange"), las = 2, cex.axis = 0.9)
# A scatterplot matrix is created to inspect the pairwise relationships between variables
pairs(saha_data)
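To complement the scatterplot matrix, the pairwise correlations can be inspected directly; a minimal sketch over the same numeric columns:
# Pearson correlation matrix of the numeric predictors (famhist and chd excluded)
round(cor(saha_data[, c(-5, -10)]), 2)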
# Apply PCA
pca.res <- PCA(saha_data[, c(-5, -10)], scale.unit = TRUE, graph = FALSE)
pca.res$eig
## eigenvalue percentage of variance cumulative percentage of variance
## comp 1 2.8104059 35.130074 35.13007
## comp 2 1.1971314 14.964143 50.09422
## comp 3 1.0584753 13.230941 63.32516
## comp 4 0.8483607 10.604509 73.92967
## comp 5 0.7620401 9.525501 83.45517
## comp 6 0.6709284 8.386605 91.84177
## comp 7 0.4772242 5.965302 97.80708
## comp 8 0.1754340 2.192925 100.00000
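A scree plot of these eigenvalues helps decide how many components to keep; a minimal sketch using base barplot (FactoMineR stores the percentages in pca.res$eig, as shown above):
# Scree plot: percentage of variance explained by each component
barplot(pca.res$eig[, "percentage of variance"],
names.arg = paste0("PC", 1:nrow(pca.res$eig)),
ylab = "% of variance", las = 2, cex.axis = 0.9)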
# The correlation circle is plotted - only variables with cos2 > 0.25 (25%) on the plotted axes are shown
plot(pca.res, axes = c(1, 2), choix = "var", col.var = "blue", new.plot = TRUE, select = "cos2 0.25")
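The select = "cos2 0.25" filter keeps variables that are well represented on the plotted axes; a sketch of the idea, assuming (as in FactoMineR's selection rule) that cos2 is summed over the two axes shown; cos2.dim12 is a hypothetical helper name:
# Variables whose quality of representation on axes 1-2 exceeds 0.25
cos2.dim12 <- rowSums(pca.res$var$cos2[, 1:2])
names(cos2.dim12)[cos2.dim12 > 0.25]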
The baseline reference error is calculated.
# Number of rows
nRows <- nrow(saha_data)
# Count the Yes and No cases of the chd variable
nYes <- sum(saha_data$chd == "Yes")
nNo <- nRows - nYes
# Percentage of Yes cases
pctYes <- nYes * 100 / nRows
cat("# Yes: ", nYes, ", # No: ", nNo, ", % Yes Class: ", pctYes)
## # Yes: 160 , # No: 302 , % Yes Class: 34.63203
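A majority-class baseline always predicts No, so its error rate equals the Yes percentage above; a minimal sketch making that reference error explicit (error.baseline is a hypothetical name):
# Baseline reference error: always predicting the majority class ("No")
error.baseline <- min(nYes, nNo) * 100 / nRows
error.baseline # ~34.63 %; a useful model must beat this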
The models are trained and evaluated. Cross-validation (k = 10 folds) is used to estimate the accuracy of each model reliably; the whole procedure is repeated 10 times.
# Column index of the class variable (chd)
ixTypeVar <- ncol(saha_data)
# Number of cross-validation repetitions
nCV <- 10
# Number of folds per repetition
nFolds <- 10
# Variables to store the detection of CHD = Yes
detection.yes.svm <- rep(0, nCV)
detection.yes.knn <- rep(0, nCV)
detection.yes.bayes <- rep(0, nCV)
detection.yes.dtree <- rep(0, nCV)
detection.yes.forest <- rep(0, nCV)
detection.yes.adaboost <- rep(0, nCV)
# Variables to store the detection of CHD = No
detection.no.svm <- rep(0, nCV)
detection.no.knn <- rep(0, nCV)
detection.no.bayes <- rep(0, nCV)
detection.no.dtree <- rep(0, nCV)
detection.no.forest <- rep(0, nCV)
detection.no.adaboost <- rep(0, nCV)
# Variables to store the average global error
error.global.svm <- rep(0, nCV)
error.global.knn <- rep(0, nCV)
error.global.bayes <- rep(0, nCV)
error.global.dtree <- rep(0, nCV)
error.global.forest <- rep(0, nCV)
error.global.adaboost <- rep(0, nCV)
# Start cross-validation
for(i in 1:nCV) {
# The folds are created (10 in this case)
groups <- createFolds(1:nRows, k = nFolds)
# Per-repetition counters of correctly predicted Yes cases
yes.svm <- 0
yes.knn <- 0
yes.bayes <- 0
yes.dtree <- 0
yes.forest <- 0
yes.adaboost <- 0
# Per-repetition counters of correctly predicted No cases
no.svm <- 0
no.knn <- 0
no.bayes <- 0
no.dtree <- 0
no.forest <- 0
no.adaboost <- 0
# Internal error counters of the models
error.svm <- 0
error.knn <- 0
error.bayes <- 0
error.dtree <- 0
error.forest <- 0
error.adaboost <- 0
# Loop to perform the cross-validation
for(k in 1:nFolds) {
# The learning and testing datasets are generated
sample <- groups[[k]]
tlearning <- saha_data[-sample, ]
ttesting <- saha_data[sample, ]
# 1. A model is generated with the Support Vector Machines method
model <- svm(chd~., data = tlearning, kernel = "linear")
prediction <- predict(model, ttesting)
actual <- ttesting[,ixTypeVar]
MC <- table(actual, prediction)
# The quality of the model is saved
yes.svm <- yes.svm + MC[2, 2]
no.svm <- no.svm + MC[1, 1]
error.svm <- error.svm + (1 - (sum(diag(MC)) / sum(MC))) * 100
# 2. A model is generated with the k-Nearest Neighbors method (train.kknn selects the best k up to kmax = 5)
model <- train.kknn(chd~.,data = tlearning, kmax = 5)
prediction <- predict(model, ttesting[,-ixTypeVar])
actual <- ttesting[,ixTypeVar]
MC <- table(actual, prediction)
# The quality of the model is saved
yes.knn <- yes.knn + MC[2, 2]
no.knn <- no.knn + MC[1, 1]
error.knn <- error.knn + (1 - (sum(diag(MC)) / sum(MC))) * 100
# 3. A model is generated with the Naive Bayes method
model <- naiveBayes(chd~., data = tlearning)
prediction <- predict(model, ttesting[,-ixTypeVar])
actual <- ttesting[,ixTypeVar]
MC <- table(actual, prediction)
# The quality of the model is saved
yes.bayes <- yes.bayes + MC[2, 2]
no.bayes <- no.bayes + MC[1, 1]
error.bayes <- error.bayes + (1 - (sum(diag(MC)) / sum(MC))) * 100
# 4. A model is generated with the Decision Trees method
model <- rpart(chd~., data = tlearning)
prediction <- predict(model, ttesting, type='class')
actual <- ttesting[,ixTypeVar]
MC <- table(actual, prediction)
# The quality of the model is saved
yes.dtree <- yes.dtree + MC[2, 2]
no.dtree <- no.dtree + MC[1, 1]
error.dtree <- error.dtree + (1 - (sum(diag(MC)) / sum(MC))) * 100
# 5. A model is generated with the Random Forest method with 300 trees
model <- randomForest(chd~., data = tlearning, importance = TRUE, ntree = 300)
prediction <- predict(model, ttesting[,-ixTypeVar])
actual <- ttesting[,ixTypeVar]
MC <- table(actual, prediction)
# The quality of the model is saved
yes.forest <- yes.forest + MC[2, 2]
no.forest <- no.forest + MC[1, 1]
error.forest <- error.forest + (1 - (sum(diag(MC)) / sum(MC))) * 100
# 6. A model is generated with the AdaBoost method
model <- ada(chd~., data = tlearning, iter = 50, nu = 1, type="real")
prediction <- predict(model, ttesting[,-ixTypeVar])
actual <- ttesting[,ixTypeVar]
MC <- table(actual, prediction)
# The quality of the model is saved
yes.adaboost <- yes.adaboost + MC[2, 2]
no.adaboost <- no.adaboost + MC[1, 1]
error.adaboost <- error.adaboost + (1 - (sum(diag(MC)) / sum(MC))) * 100
}
# The number of Yes cases detected by each method is saved
detection.yes.svm[i] <- yes.svm
detection.yes.knn[i] <- yes.knn
detection.yes.bayes[i] <- yes.bayes
detection.yes.dtree[i] <- yes.dtree
detection.yes.forest[i] <- yes.forest
detection.yes.adaboost[i] <- yes.adaboost
# The number of No cases detected by each method is saved
detection.no.svm[i] <- no.svm
detection.no.knn[i] <- no.knn
detection.no.bayes[i] <- no.bayes
detection.no.dtree[i] <- no.dtree
detection.no.forest[i] <- no.forest
detection.no.adaboost[i] <- no.adaboost
# The average global error is saved for each model
error.global.svm[i] <- error.svm / nFolds
error.global.knn[i] <- error.knn / nFolds
error.global.bayes[i] <- error.bayes / nFolds
error.global.dtree[i] <- error.dtree / nFolds
error.global.forest[i] <- error.forest / nFolds
error.global.adaboost[i] <- error.adaboost / nFolds
}
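Note that createFolds(1:nRows, k = nFolds) ignores the class labels; passing the outcome factor instead makes caret stratify the folds so each preserves the Yes/No ratio. A sketch of that alternative (not used above; groups.strat is a hypothetical name):
# Stratified folds: caret samples within each class when given the class factor
groups.strat <- createFolds(saha_data$chd, k = nFolds)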
# The limits for Plot 1 are calculated
yLim <- c(min(error.global.svm, error.global.knn, error.global.bayes, error.global.dtree, error.global.forest, error.global.adaboost) * 0.9,
max(error.global.svm, error.global.knn, error.global.bayes, error.global.dtree, error.global.forest, error.global.adaboost) * 1.3)
# The curves are plotted with the average global error of each model
plot(error.global.svm, col = "magenta", type = "b", ylim = yLim, main = "Average global error of each model",
xlab = "Iteration number", ylab = "Global error (%)", cex.axis = 0.9)
points(error.global.knn, col = "blue", type = "b")
points(error.global.bayes, col = "red", type = "b")
points(error.global.dtree, col = "lightblue3", type = "b")
points(error.global.forest, col = "olivedrab", type = "b")
points(error.global.adaboost, col = "orange3", type = "b")
# The legend is added
legend("topright", legend = c("SVM", "KNN", "Bayes", "DTree", "RandomF", "AdaBoost"),
col = c("magenta", "blue", "red", "lightblue3", "olivedrab", "orange3"),
lty = 1, lwd = 2, ncol = 6, cex = 0.45)
# The limits for Plot 2 are set
yLim <- c(0, nYes)
# The curves of the Yes detection are plotted
plot(detection.yes.svm, col = "magenta", type = "b", ylim = yLim, main = "Detection of people who are prone to CHD",
xlab = "Número de iteración", ylab = "Yes detected", cex.axis = 0.9)
points(detection.yes.knn, col = "blue", type = "b")
points(detection.yes.bayes, col = "red", type = "b")
points(detection.yes.dtree, col = "lightblue3", type = "b")
points(detection.yes.forest, col = "olivedrab", type = "b")
points(detection.yes.adaboost, col = "orange3", type = "b")
# The legend is added
legend("topright", legend = c("SVM", "KNN", "Bayes", "DTree", "RandomF", "AdaBoost"),
col = c("magenta", "blue", "red", "lightblue3", "olivedrab", "orange3"),
lty = 1, lwd = 2, ncol = 6, cex = 0.45)
# The limits for Plot 3 are set
yLim <- c(0, nNo)
# The curves of the No detection are plotted
plot(detection.no.svm, col = "magenta", type = "b", ylim = yLim, main = "Detection of people who are NOT prone to CHD",
xlab = "Número de iteración", ylab = "No detected", cex.axis = 0.9)
points(detection.no.knn, col = "blue", type = "b")
points(detection.no.bayes, col = "red", type = "b")
points(detection.no.dtree, col = "lightblue3", type = "b")
points(detection.no.forest, col = "olivedrab", type = "b")
points(detection.no.adaboost, col = "orange3", type = "b")
# The legend is added
legend("topright", legend = c("SVM", "KNN", "Bayes", "DTree", "RandomF", "AdaBoost"),
col = c("magenta", "blue", "red", "lightblue3", "olivedrab", "orange3"),
lty = 1, lwd = 2, ncol = 6, cex = 0.45)
mean(error.global.svm)
## [1] 28.56389
mean(error.global.knn)
## [1] 32.82515
mean(error.global.bayes)
## [1] 29.03593
mean(error.global.dtree)
## [1] 31.5012
mean(error.global.forest)
## [1] 31.11583
mean(error.global.adaboost)
## [1] 35.24517
mean(detection.yes.svm)
## [1] 79.6
mean(detection.yes.knn)
## [1] 64.9
mean(detection.yes.bayes)
## [1] 98.6
mean(detection.yes.dtree)
## [1] 77.2
mean(detection.yes.forest)
## [1] 68.5
mean(detection.yes.adaboost)
## [1] 70
mean(detection.no.svm)
## [1] 250.3
mean(detection.no.knn)
## [1] 245.4
mean(detection.no.bayes)
## [1] 229.2
mean(detection.no.dtree)
## [1] 239.2
mean(detection.no.forest)
## [1] 249.6
mean(detection.no.adaboost)
## [1] 229
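As a closing sketch, the averages above can be collected into one comparison table, using only the vectors already computed by the cross-validation loop (results is a hypothetical name):
# Consolidated comparison of the six models, sorted by mean global error
results <- data.frame(
model = c("SVM", "KNN", "Bayes", "DTree", "RandomF", "AdaBoost"),
mean.error = c(mean(error.global.svm), mean(error.global.knn), mean(error.global.bayes),
mean(error.global.dtree), mean(error.global.forest), mean(error.global.adaboost)),
mean.yes = c(mean(detection.yes.svm), mean(detection.yes.knn), mean(detection.yes.bayes),
mean(detection.yes.dtree), mean(detection.yes.forest), mean(detection.yes.adaboost)),
mean.no = c(mean(detection.no.svm), mean(detection.no.knn), mean(detection.no.bayes),
mean(detection.no.dtree), mean(detection.no.forest), mean(detection.no.adaboost))
)
results[order(results$mean.error), ]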