data("iris")
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
Just for reference, here are pictures of the three flower species:
Let’s take a look at the data itself, starting with the first five rows of each class:
subset(iris,Species == "setosa")[1:5,]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
subset(iris,Species == "versicolor")[1:5,]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 51 7.0 3.2 4.7 1.4 versicolor
## 52 6.4 3.2 4.5 1.5 versicolor
## 53 6.9 3.1 4.9 1.5 versicolor
## 54 5.5 2.3 4.0 1.3 versicolor
## 55 6.5 2.8 4.6 1.5 versicolor
subset(iris,Species == "virginica")[1:5,]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 101 6.3 3.3 6.0 2.5 virginica
## 102 5.8 2.7 5.1 1.9 virginica
## 103 7.1 3.0 5.9 2.1 virginica
## 104 6.3 2.9 5.6 1.8 virginica
## 105 6.5 3.0 5.8 2.2 virginica
A quick look at the data suggests that the Petal.Length of class setosa is shorter than the Petal.Length of the other classes. Is that true?
# Get column "Species" for all rows where Petal.Length < 2
subset(iris, Petal.Length < 2)[,"Species"]
## [1] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
## [11] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
## [21] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
## [31] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
## [41] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
## Levels: setosa versicolor virginica
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
boxplot(iris,col = c("#E69F00", "#56B4E9", "#009E73"))  # one box per column; the three colors are recycled
Here’s how to interpret a boxplot: the box spans the first to the third quartile, the bold line inside it marks the median, the whiskers extend to the most extreme values within 1.5 times the interquartile range, and any points beyond the whiskers are drawn individually as outliers.
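To connect the picture to actual numbers, here is a small sketch using base R's boxplot.stats() on a single column (note that the hinges it reports are Tukey's hinges, which can differ slightly from quantile()):
boxplot.stats(iris$Sepal.Width)$stats   # lower whisker, lower hinge, median, upper hinge, upper whisker
quantile(iris$Sepal.Width, c(0.25, 0.5, 0.75))
boxplot.stats(iris$Sepal.Width)$out     # values drawn as individual outlier points, if any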
Scatterplot matrices are very good visualization tools and may help identify correlations, or the lack thereof:
# Scatterplot matrix of the four measurements, colored by species;
# the wide right outer margin (oma) leaves room for a legend
pairs(iris[,1:4],col=iris[,5],oma=c(4,4,6,12))
par(xpd=TRUE)  # allow drawing the legend outside the plot region
legend(0.85,0.6, as.vector(unique(iris$Species)),fill=c(1,2,3))
Our goal is to use all variables in the dataset except Species as inputs and build a model that classifies the species.
library(tree)
iris_tree1 = tree(Species ~ .,data = iris)  # default split criterion is deviance
plot(iris_tree1)
text(iris_tree1, pretty=1)
summary(iris_tree1)
##
## Classification tree:
## tree(formula = Species ~ ., data = iris)
## Variables actually used in tree construction:
## [1] "Petal.Length" "Petal.Width" "Sepal.Length"
## Number of terminal nodes: 6
## Residual mean deviance: 0.1253 = 18.05 / 144
## Misclassification error rate: 0.02667 = 4 / 150
iris_tree1
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 150 329.600 setosa ( 0.33333 0.33333 0.33333 )
## 2) Petal.Length < 2.45 50 0.000 setosa ( 1.00000 0.00000 0.00000 ) *
## 3) Petal.Length > 2.45 100 138.600 versicolor ( 0.00000 0.50000 0.50000 )
## 6) Petal.Width < 1.75 54 33.320 versicolor ( 0.00000 0.90741 0.09259 )
## 12) Petal.Length < 4.95 48 9.721 versicolor ( 0.00000 0.97917 0.02083 )
## 24) Sepal.Length < 5.15 5 5.004 versicolor ( 0.00000 0.80000 0.20000 ) *
## 25) Sepal.Length > 5.15 43 0.000 versicolor ( 0.00000 1.00000 0.00000 ) *
## 13) Petal.Length > 4.95 6 7.638 virginica ( 0.00000 0.33333 0.66667 ) *
## 7) Petal.Width > 1.75 46 9.635 virginica ( 0.00000 0.02174 0.97826 )
## 14) Petal.Length < 4.95 6 5.407 virginica ( 0.00000 0.16667 0.83333 ) *
## 15) Petal.Length > 4.95 40 0.000 virginica ( 0.00000 0.00000 1.00000 ) *
colnames(iris_tree1$frame)
## [1] "var" "n" "dev" "yval" "splits" "yprob"
iris_tree1$frame$dev
## [1] 329.583687 0.000000 138.629436 33.317509 9.721422 5.004024
## [7] 0.000000 7.638170 9.635384 5.406735 0.000000
iris_tree2 = tree(Species ~ .,data = iris, split = "gini")  # use the Gini index instead of deviance
plot(iris_tree2)
text(iris_tree2, pretty=1)
iris_tree2
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 150 329.600 setosa ( 0.33333 0.33333 0.33333 )
## 2) Petal.Length < 1.35 11 0.000 setosa ( 1.00000 0.00000 0.00000 ) *
## 3) Petal.Length > 1.35 139 303.600 versicolor ( 0.28058 0.35971 0.35971 )
## 6) Sepal.Width < 2.35 7 5.742 versicolor ( 0.00000 0.85714 0.14286 ) *
## 7) Sepal.Width > 2.35 132 288.900 virginica ( 0.29545 0.33333 0.37121 )
## 14) Sepal.Width < 2.55 11 14.420 versicolor ( 0.00000 0.63636 0.36364 )
## 28) Petal.Length < 4.25 6 0.000 versicolor ( 0.00000 1.00000 0.00000 ) *
## 29) Petal.Length > 4.25 5 5.004 virginica ( 0.00000 0.20000 0.80000 ) *
## 15) Sepal.Width > 2.55 121 265.000 virginica ( 0.32231 0.30579 0.37190 )
## 30) Petal.Width < 0.25 26 0.000 setosa ( 1.00000 0.00000 0.00000 ) *
## 31) Petal.Width > 0.25 95 188.700 virginica ( 0.13684 0.38947 0.47368 )
## 62) Petal.Width < 1.75 52 79.640 versicolor ( 0.25000 0.69231 0.05769 )
## 124) Petal.Length < 2.7 13 0.000 setosa ( 1.00000 0.00000 0.00000 ) *
## 125) Petal.Length > 2.7 39 21.150 versicolor ( 0.00000 0.92308 0.07692 )
## 250) Petal.Length < 4.95 34 0.000 versicolor ( 0.00000 1.00000 0.00000 ) *
## 251) Petal.Length > 4.95 5 6.730 virginica ( 0.00000 0.40000 0.60000 ) *
## 63) Petal.Width > 1.75 43 9.499 virginica ( 0.00000 0.02326 0.97674 )
## 126) Sepal.Length < 6.05 7 5.742 virginica ( 0.00000 0.14286 0.85714 ) *
## 127) Sepal.Length > 6.05 36 0.000 virginica ( 0.00000 0.00000 1.00000 ) *
We can also calculate the root-node deviance by hand. The deviance of a node is -2 * sum(n_k * log(p_k)) over the classes k; for the root node, with 150 observations split evenly across three classes, this is:
deviance = -2*(3*50*log(1/3))
deviance
## [1] 329.5837
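The same formula applies to any node, using that node's class counts. As a quick check against the printout of iris_tree1 above, node 6 contains 54 observations, of which 49 are versicolor and 5 are virginica:
-2 * (49 * log(49/54) + 5 * log(5/54))   # roughly 33.32, matching the deviance printed for node 6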
library(rpart)
library(rpart.plot)
rpart_iris1 = rpart(Species ~ .,method = 'class', data=iris)
rpart_iris1
## n= 150
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 50 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 100 50 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Width< 1.75 54 5 versicolor (0.00000000 0.90740741 0.09259259) *
## 7) Petal.Width>=1.75 46 1 virginica (0.00000000 0.02173913 0.97826087) *
rpart.plot(rpart_iris1)
rpart_iris1$splits
## count ncat improve index adj
## Petal.Length 150 -1 50.0000000 2.45 0.0000000
## Petal.Width 150 -1 50.0000000 0.80 0.0000000
## Sepal.Length 150 -1 34.1640502 5.45 0.0000000
## Sepal.Width 150 1 19.0385075 3.35 0.0000000
## Petal.Width 0 -1 1.0000000 0.80 1.0000000
## Sepal.Length 0 -1 0.9200000 5.45 0.7600000
## Sepal.Width 0 1 0.8333333 3.35 0.5000000
## Petal.Width 100 -1 38.9694042 1.75 0.0000000
## Petal.Length 100 -1 37.3535354 4.75 0.0000000
## Sepal.Length 100 -1 10.6868687 6.15 0.0000000
## Sepal.Width 100 -1 3.5555556 2.45 0.0000000
## Petal.Length 0 -1 0.9100000 4.75 0.8043478
## Sepal.Length 0 -1 0.7300000 6.15 0.4130435
## Sepal.Width 0 -1 0.6700000 2.95 0.2826087
rpart_iris1
## n= 150
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 50 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 100 50 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Width< 1.75 54 5 versicolor (0.00000000 0.90740741 0.09259259) *
## 7) Petal.Width>=1.75 46 1 virginica (0.00000000 0.02173913 0.97826087) *
In the root node we have 150 observations; the majority class is setosa, with 50 observations. Every observation in the node is predicted as setosa, so the number misclassified (the loss) is 150 - 50 = 100.
In node 2 there are 50 observations, all setosa, so the loss is 50 - 50 = 0.
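We can reproduce the loss column by hand; a minimal sketch (loss = node size minus the count of the majority class):
# loss in the root node
tab_root = table(iris$Species)
sum(tab_root) - max(tab_root)    # 150 - 50 = 100
# loss in node 3, i.e. the subset with Petal.Length >= 2.45
tab_node3 = table(subset(iris, Petal.Length >= 2.45)$Species)
sum(tab_node3) - max(tab_node3)  # 100 - 50 = 50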
summary(rpart_iris1)
## Call:
## rpart(formula = Species ~ ., data = iris, method = "class")
## n= 150
##
## CP nsplit rel error xerror xstd
## 1 0.50 0 1.00 1.15 0.05180090
## 2 0.44 1 0.50 0.68 0.06096994
## 3 0.01 2 0.06 0.11 0.03192700
##
## Variable importance
## Petal.Width Petal.Length Sepal.Length Sepal.Width
## 34 31 21 14
##
## Node number 1: 150 observations, complexity param=0.5
## predicted class=setosa expected loss=0.6666667 P(node) =1
## class counts: 50 50 50
## probabilities: 0.333 0.333 0.333
## left son=2 (50 obs) right son=3 (100 obs)
## Primary splits:
## Petal.Length < 2.45 to the left, improve=50.00000, (0 missing)
## Petal.Width < 0.8 to the left, improve=50.00000, (0 missing)
## Sepal.Length < 5.45 to the left, improve=34.16405, (0 missing)
## Sepal.Width < 3.35 to the right, improve=19.03851, (0 missing)
## Surrogate splits:
## Petal.Width < 0.8 to the left, agree=1.000, adj=1.00, (0 split)
## Sepal.Length < 5.45 to the left, agree=0.920, adj=0.76, (0 split)
## Sepal.Width < 3.35 to the right, agree=0.833, adj=0.50, (0 split)
##
## Node number 2: 50 observations
## predicted class=setosa expected loss=0 P(node) =0.3333333
## class counts: 50 0 0
## probabilities: 1.000 0.000 0.000
##
## Node number 3: 100 observations, complexity param=0.44
## predicted class=versicolor expected loss=0.5 P(node) =0.6666667
## class counts: 0 50 50
## probabilities: 0.000 0.500 0.500
## left son=6 (54 obs) right son=7 (46 obs)
## Primary splits:
## Petal.Width < 1.75 to the left, improve=38.969400, (0 missing)
## Petal.Length < 4.75 to the left, improve=37.353540, (0 missing)
## Sepal.Length < 6.15 to the left, improve=10.686870, (0 missing)
## Sepal.Width < 2.45 to the left, improve= 3.555556, (0 missing)
## Surrogate splits:
## Petal.Length < 4.75 to the left, agree=0.91, adj=0.804, (0 split)
## Sepal.Length < 6.15 to the left, agree=0.73, adj=0.413, (0 split)
## Sepal.Width < 2.95 to the left, agree=0.67, adj=0.283, (0 split)
##
## Node number 6: 54 observations
## predicted class=versicolor expected loss=0.09259259 P(node) =0.36
## class counts: 0 49 5
## probabilities: 0.000 0.907 0.093
##
## Node number 7: 46 observations
## predicted class=virginica expected loss=0.02173913 P(node) =0.3066667
## class counts: 0 1 45
## probabilities: 0.000 0.022 0.978
Now we can make predictions using the predict function. If we do not specify type = "class", it returns the predicted class probabilities instead of the predicted classes.
pred_ir = predict(iris_tree1)
pred_ir
## setosa versicolor virginica
## 1 1 0.0000000 0.0000000
## 2 1 0.0000000 0.0000000
## 3 1 0.0000000 0.0000000
## 4 1 0.0000000 0.0000000
## 5 1 0.0000000 0.0000000
## 6 1 0.0000000 0.0000000
## 7 1 0.0000000 0.0000000
## 8 1 0.0000000 0.0000000
## 9 1 0.0000000 0.0000000
## 10 1 0.0000000 0.0000000
## 11 1 0.0000000 0.0000000
## 12 1 0.0000000 0.0000000
## 13 1 0.0000000 0.0000000
## 14 1 0.0000000 0.0000000
## 15 1 0.0000000 0.0000000
## 16 1 0.0000000 0.0000000
## 17 1 0.0000000 0.0000000
## 18 1 0.0000000 0.0000000
## 19 1 0.0000000 0.0000000
## 20 1 0.0000000 0.0000000
## 21 1 0.0000000 0.0000000
## 22 1 0.0000000 0.0000000
## 23 1 0.0000000 0.0000000
## 24 1 0.0000000 0.0000000
## 25 1 0.0000000 0.0000000
## 26 1 0.0000000 0.0000000
## 27 1 0.0000000 0.0000000
## 28 1 0.0000000 0.0000000
## 29 1 0.0000000 0.0000000
## 30 1 0.0000000 0.0000000
## 31 1 0.0000000 0.0000000
## 32 1 0.0000000 0.0000000
## 33 1 0.0000000 0.0000000
## 34 1 0.0000000 0.0000000
## 35 1 0.0000000 0.0000000
## 36 1 0.0000000 0.0000000
## 37 1 0.0000000 0.0000000
## 38 1 0.0000000 0.0000000
## 39 1 0.0000000 0.0000000
## 40 1 0.0000000 0.0000000
## 41 1 0.0000000 0.0000000
## 42 1 0.0000000 0.0000000
## 43 1 0.0000000 0.0000000
## 44 1 0.0000000 0.0000000
## 45 1 0.0000000 0.0000000
## 46 1 0.0000000 0.0000000
## 47 1 0.0000000 0.0000000
## 48 1 0.0000000 0.0000000
## 49 1 0.0000000 0.0000000
## 50 1 0.0000000 0.0000000
## 51 0 1.0000000 0.0000000
## 52 0 1.0000000 0.0000000
## 53 0 1.0000000 0.0000000
## 54 0 1.0000000 0.0000000
## 55 0 1.0000000 0.0000000
## 56 0 1.0000000 0.0000000
## 57 0 1.0000000 0.0000000
## 58 0 0.8000000 0.2000000
## 59 0 1.0000000 0.0000000
## 60 0 1.0000000 0.0000000
## 61 0 0.8000000 0.2000000
## 62 0 1.0000000 0.0000000
## 63 0 1.0000000 0.0000000
## 64 0 1.0000000 0.0000000
## 65 0 1.0000000 0.0000000
## 66 0 1.0000000 0.0000000
## 67 0 1.0000000 0.0000000
## 68 0 1.0000000 0.0000000
## 69 0 1.0000000 0.0000000
## 70 0 1.0000000 0.0000000
## 71 0 0.1666667 0.8333333
## 72 0 1.0000000 0.0000000
## 73 0 1.0000000 0.0000000
## 74 0 1.0000000 0.0000000
## 75 0 1.0000000 0.0000000
## 76 0 1.0000000 0.0000000
## 77 0 1.0000000 0.0000000
## 78 0 0.3333333 0.6666667
## 79 0 1.0000000 0.0000000
## 80 0 1.0000000 0.0000000
## 81 0 1.0000000 0.0000000
## 82 0 1.0000000 0.0000000
## 83 0 1.0000000 0.0000000
## 84 0 0.3333333 0.6666667
## 85 0 1.0000000 0.0000000
## 86 0 1.0000000 0.0000000
## 87 0 1.0000000 0.0000000
## 88 0 1.0000000 0.0000000
## 89 0 1.0000000 0.0000000
## 90 0 1.0000000 0.0000000
## 91 0 1.0000000 0.0000000
## 92 0 1.0000000 0.0000000
## 93 0 1.0000000 0.0000000
## 94 0 0.8000000 0.2000000
## 95 0 1.0000000 0.0000000
## 96 0 1.0000000 0.0000000
## 97 0 1.0000000 0.0000000
## 98 0 1.0000000 0.0000000
## 99 0 0.8000000 0.2000000
## 100 0 1.0000000 0.0000000
## 101 0 0.0000000 1.0000000
## 102 0 0.0000000 1.0000000
## 103 0 0.0000000 1.0000000
## 104 0 0.0000000 1.0000000
## 105 0 0.0000000 1.0000000
## 106 0 0.0000000 1.0000000
## 107 0 0.8000000 0.2000000
## 108 0 0.0000000 1.0000000
## 109 0 0.0000000 1.0000000
## 110 0 0.0000000 1.0000000
## 111 0 0.0000000 1.0000000
## 112 0 0.0000000 1.0000000
## 113 0 0.0000000 1.0000000
## 114 0 0.0000000 1.0000000
## 115 0 0.0000000 1.0000000
## 116 0 0.0000000 1.0000000
## 117 0 0.0000000 1.0000000
## 118 0 0.0000000 1.0000000
## 119 0 0.0000000 1.0000000
## 120 0 0.3333333 0.6666667
## 121 0 0.0000000 1.0000000
## 122 0 0.1666667 0.8333333
## 123 0 0.0000000 1.0000000
## 124 0 0.1666667 0.8333333
## 125 0 0.0000000 1.0000000
## 126 0 0.0000000 1.0000000
## 127 0 0.1666667 0.8333333
## 128 0 0.1666667 0.8333333
## 129 0 0.0000000 1.0000000
## 130 0 0.3333333 0.6666667
## 131 0 0.0000000 1.0000000
## 132 0 0.0000000 1.0000000
## 133 0 0.0000000 1.0000000
## 134 0 0.3333333 0.6666667
## 135 0 0.3333333 0.6666667
## 136 0 0.0000000 1.0000000
## 137 0 0.0000000 1.0000000
## 138 0 0.0000000 1.0000000
## 139 0 0.1666667 0.8333333
## 140 0 0.0000000 1.0000000
## 141 0 0.0000000 1.0000000
## 142 0 0.0000000 1.0000000
## 143 0 0.0000000 1.0000000
## 144 0 0.0000000 1.0000000
## 145 0 0.0000000 1.0000000
## 146 0 0.0000000 1.0000000
## 147 0 0.0000000 1.0000000
## 148 0 0.0000000 1.0000000
## 149 0 0.0000000 1.0000000
## 150 0 0.0000000 1.0000000
pred_ir = predict(iris_tree1,type = "class")
pred_ir
## [1] setosa setosa setosa setosa setosa setosa
## [7] setosa setosa setosa setosa setosa setosa
## [13] setosa setosa setosa setosa setosa setosa
## [19] setosa setosa setosa setosa setosa setosa
## [25] setosa setosa setosa setosa setosa setosa
## [31] setosa setosa setosa setosa setosa setosa
## [37] setosa setosa setosa setosa setosa setosa
## [43] setosa setosa setosa setosa setosa setosa
## [49] setosa setosa versicolor versicolor versicolor versicolor
## [55] versicolor versicolor versicolor versicolor versicolor versicolor
## [61] versicolor versicolor versicolor versicolor versicolor versicolor
## [67] versicolor versicolor versicolor versicolor virginica versicolor
## [73] versicolor versicolor versicolor versicolor versicolor virginica
## [79] versicolor versicolor versicolor versicolor versicolor virginica
## [85] versicolor versicolor versicolor versicolor versicolor versicolor
## [91] versicolor versicolor versicolor versicolor versicolor versicolor
## [97] versicolor versicolor versicolor versicolor virginica virginica
## [103] virginica virginica virginica virginica versicolor virginica
## [109] virginica virginica virginica virginica virginica virginica
## [115] virginica virginica virginica virginica virginica virginica
## [121] virginica virginica virginica virginica virginica virginica
## [127] virginica virginica virginica virginica virginica virginica
## [133] virginica virginica virginica virginica virginica virginica
## [139] virginica virginica virginica virginica virginica virginica
## [145] virginica virginica virginica virginica virginica virginica
## Levels: setosa versicolor virginica
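To compare these class predictions with the true labels, we can cross-tabulate them; the off-diagonal counts should add up to the 4 misclassified observations reported by summary(iris_tree1):
table(iris$Species, pred_ir)   # rows: true species, columns: predicted class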
An important thing to notice when using tree is that it has different control parameters from the rpart package. tree has three main parameters (see the sketch after this list):
- mincut: the minimum number of observations to include in either child node, default = 5
- minsize: the smallest allowed node size for a split to occur, default = 10
- mindev: the within-node deviance must be at least this fraction of the root-node deviance for the node to be split, default = 0.01
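These parameters can also be passed through an explicit tree.control() object. A minimal sketch (the values below are only illustrative and force a coarser tree than the defaults):
ctrl = tree.control(nobs = nrow(iris), mincut = 10, minsize = 20, mindev = 0.05)
iris_tree_coarse = tree(Species ~ ., data = iris, control = ctrl)
summary(iris_tree_coarse)   # expect fewer terminal nodes than the default fit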
To produce a tree that fits the training data perfectly, we set mindev = 0 and minsize = 2 (provided the limit on tree depth, 31 levels, allows such a tree):
iris_tree_perfect = tree(Species~.,data=iris,mindev=0,minsize=2)
plot(iris_tree_perfect)
text(iris_tree_perfect,pretty=1)
We can try some other parameters:
iris_tr2=tree(Species~.,data=iris,minsize=2)
iris_tr3=tree(Species~.,data=iris,mindev=0.0)
iris_tr4=tree(Species~.,data=iris,minsize=2,mindev=0.0)
plot(iris_tr4)
text(iris_tr4,cex=0.8)
summary(iris_tr4)
##
## Classification tree:
## tree(formula = Species ~ ., data = iris, minsize = 2, mindev = 0)
## Variables actually used in tree construction:
## [1] "Petal.Length" "Petal.Width" "Sepal.Length"
## Number of terminal nodes: 9
## Residual mean deviance: 0 = 0 / 141
## Misclassification error rate: 0 = 0 / 150
pred_iris4=predict(iris_tr4,type="class")
table(iris$Species,pred_iris4)
## pred_iris4
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 50 0
## virginica 0 0 50
For reference, the tree control parameters correspond roughly to the following rpart parameters:
tree      rpart
mincut  ~ minbucket
minsize ~ minsplit
mindev  ~ cp
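On the rpart side these are set through rpart.control(). As a sketch, the rough analogue of the mindev = 0, minsize = 2 call above is shown below; it should also classify the training data (almost) perfectly:
rpart_full = rpart(Species ~ ., method = "class", data = iris,
                   control = rpart.control(minsplit = 2, minbucket = 1, cp = 0))
table(iris$Species, predict(rpart_full, type = "class"))   # expect few or no off-diagonal counts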
Note: when we reached this point in the classroom, our instructor said that the exam might ask us to draw the tree from the output of tree, or vice versa.