The Iris flower data set is a multivariate data set introduced by the British statistician and biologist Ronald Fisher in his 1936 paper “The use of multiple measurements in taxonomic problems” as an example of linear discriminant analysis. The dataset contains four features (the length and width of the sepals and petals) for 50 samples from each of three species of Iris (Iris setosa, Iris virginica, and Iris versicolor). These measurements were used to create a linear discriminant model to classify the species.
In this project, I practiced some data visualization techniques on the Iris dataset with R, producing frequency histograms, density plots, box plots, scatter plots, and a correlation plot.
Iris Flower Species
library(ggplot2)
library(readr)
library(gridExtra)
library(grid)
library(plyr)
# Load the Iris measurements from the CSV file in the working directory.
iris <- read.csv("./Iris.csv")
# Print 20 randomly chosen rows as a quick look at the data.
iris[sample(nrow(iris), size = 20), ]
## Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
## 112 112 6.4 2.7 5.3 1.9 Iris-virginica
## 34 34 5.5 4.2 1.4 0.2 Iris-setosa
## 96 96 5.7 3.0 4.2 1.2 Iris-versicolor
## 66 66 6.7 3.1 4.4 1.4 Iris-versicolor
## 23 23 4.6 3.6 1.0 0.2 Iris-setosa
## 102 102 5.8 2.7 5.1 1.9 Iris-virginica
## 137 137 6.3 3.4 5.6 2.4 Iris-virginica
## 81 81 5.5 2.4 3.8 1.1 Iris-versicolor
## 126 126 7.2 3.2 6.0 1.8 Iris-virginica
## 123 123 7.7 2.8 6.7 2.0 Iris-virginica
## 131 131 7.4 2.8 6.1 1.9 Iris-virginica
## 150 150 5.9 3.0 5.1 1.8 Iris-virginica
## 47 47 5.1 3.8 1.6 0.2 Iris-setosa
## 36 36 5.0 3.2 1.2 0.2 Iris-setosa
## 44 44 5.0 3.5 1.6 0.6 Iris-setosa
## 117 117 6.5 3.0 5.5 1.8 Iris-virginica
## 120 120 6.0 2.2 5.0 1.5 Iris-virginica
## 41 41 5.0 3.5 1.3 0.3 Iris-setosa
## 114 114 5.7 2.5 5.0 2.0 Iris-virginica
## 33 33 5.2 4.1 1.5 0.1 Iris-setosa
# Histogram of sepal length (cm) in 0.2-wide bins, bars filled by species.
# The legend is hidden on this panel.
HisSl <- ggplot(data = iris, aes(x = SepalLengthCm)) +
  geom_histogram(binwidth = 0.2, color = "black", aes(fill = Species)) +
  xlab("Sepal Length (cm)") +
  ylab("Frequency") +
  theme(legend.position = "none") +
  ggtitle("Histogram of Sepal Length") +
  # Dashed grey reference line at the mean over all rows. The value is passed
  # as a plain argument so exactly one line is drawn; mapping a constant
  # inside aes() would repeat the same line once per data row.
  geom_vline(
    xintercept = mean(iris$SepalLengthCm),
    linetype = "dashed", color = "grey"
  )
# Histogram of sepal width (cm) in 0.2-wide bins, bars filled by species.
# The legend is hidden on this panel.
HistSw <- ggplot(data = iris, aes(x = SepalWidthCm)) +
  geom_histogram(binwidth = 0.2, color = "black", aes(fill = Species)) +
  xlab("Sepal Width (cm)") +
  ylab("Frequency") +
  theme(legend.position = "none") +
  ggtitle("Histogram of Sepal Width") +
  # Dashed grey reference line at the mean over all rows. The value is passed
  # as a plain argument so exactly one line is drawn; mapping a constant
  # inside aes() would repeat the same line once per data row.
  geom_vline(
    xintercept = mean(iris$SepalWidthCm),
    linetype = "dashed", color = "grey"
  )
# Histogram of petal length (cm) in 0.2-wide bins, bars filled by species.
# The legend is hidden on this panel.
HistPl <- ggplot(data = iris, aes(x = PetalLengthCm)) +
  geom_histogram(binwidth = 0.2, color = "black", aes(fill = Species)) +
  xlab("Petal Length (cm)") +
  ylab("Frequency") +
  theme(legend.position = "none") +
  ggtitle("Histogram of Petal Length") +
  # Dashed grey reference line at the mean over all rows. The value is passed
  # as a plain argument so exactly one line is drawn; mapping a constant
  # inside aes() would repeat the same line once per data row.
  geom_vline(
    xintercept = mean(iris$PetalLengthCm),
    linetype = "dashed", color = "grey"
  )
# Histogram of petal width (cm) in 0.2-wide bins, bars filled by species.
# This panel keeps its legend, placed on the right.
HistPw <- ggplot(data = iris, aes(x = PetalWidthCm)) +
  geom_histogram(binwidth = 0.2, color = "black", aes(fill = Species)) +
  xlab("Petal Width (cm)") +
  ylab("Frequency") +
  theme(legend.position = "right") +
  ggtitle("Histogram of Petal Width") +
  # Dashed grey reference line at the mean over all rows. The value is passed
  # as a plain argument so exactly one line is drawn; mapping a constant
  # inside aes() would repeat the same line once per data row.
  geom_vline(
    xintercept = mean(iris$PetalWidthCm),
    linetype = "dashed", color = "grey"
  )
# Lay the four histograms out on a 2-row grid under one shared heading.
# Each panel's own title is blanked so only the shared heading is shown.
grid.arrange(
  grobs = lapply(
    list(HisSl, HistSw, HistPl, HistPw),
    function(panel) panel + ggtitle("")
  ),
  nrow = 2,
  top = textGrob("Iris Frequency Histogram", gp = gpar(fontsize = 14))
)
# Density curves of petal length (cm), one per species; outline and fill are
# coloured by species with alpha 0.3. The legend is hidden on this panel.
DhistPl <- ggplot(
  iris,
  aes(x = PetalLengthCm, colour = Species, fill = Species)
) +
  geom_density(alpha = .3) +
  # Dashed grey reference line at the mean over all rows. The value is passed
  # as a plain argument so exactly one line is drawn, and no colour mapping
  # is given because the line colour is a constant grey.
  geom_vline(
    xintercept = mean(iris$PetalLengthCm),
    linetype = "dashed", color = "grey", size = 1
  ) +
  xlab("Petal Length (cm)") +
  ylab("Density") +
  theme(legend.position = "none")
# Density curves of petal width (cm), one per species; outline and fill are
# coloured by species with alpha 0.3. This panel keeps its legend.
DhistPw <- ggplot(
  iris,
  aes(x = PetalWidthCm, colour = Species, fill = Species)
) +
  geom_density(alpha = .3) +
  # Dashed grey reference line at the mean over all rows. The value is passed
  # as a plain argument so exactly one line is drawn, and no colour mapping
  # is given because the line colour is a constant grey.
  geom_vline(
    xintercept = mean(iris$PetalWidthCm),
    linetype = "dashed", color = "grey", size = 1
  ) +
  xlab("Petal Width (cm)") +
  ylab("Density")
# Density curves of sepal width (cm), one per species; outline and fill are
# coloured by species with alpha 0.3. The legend is hidden on this panel.
DhistSw <- ggplot(
  iris,
  aes(x = SepalWidthCm, colour = Species, fill = Species)
) +
  geom_density(alpha = .3) +
  # Dashed grey reference line at the mean over all rows. The value is passed
  # as a plain argument so exactly one line is drawn, and no colour mapping
  # is given because the line colour is a constant grey.
  geom_vline(
    xintercept = mean(iris$SepalWidthCm),
    linetype = "dashed", color = "grey", size = 1
  ) +
  xlab("Sepal Width (cm)") +
  ylab("Density") +
  theme(legend.position = "none")
# Density curves of sepal length (cm), one per species; outline and fill are
# coloured by species with alpha 0.3. The legend is hidden on this panel.
DhistSl <- ggplot(
  iris,
  aes(x = SepalLengthCm, colour = Species, fill = Species)
) +
  geom_density(alpha = .3) +
  # Dashed grey reference line at the mean over all rows. The value is passed
  # as a plain argument so exactly one line is drawn, and no colour mapping
  # is given because the line colour is a constant grey.
  geom_vline(
    xintercept = mean(iris$SepalLengthCm),
    linetype = "dashed", color = "grey", size = 1
  ) +
  xlab("Sepal Length (cm)") +
  ylab("Density") +
  theme(legend.position = "none")
# Lay the four density plots out on a 2-row grid under one shared heading.
# Each panel's own title is blanked so only the shared heading is shown.
grid.arrange(
  grobs = lapply(
    list(DhistSl, DhistSw, DhistPl, DhistPw),
    function(panel) panel + ggtitle("")
  ),
  nrow = 2,
  top = textGrob("Iris Density Histogram", gp = gpar(fontsize = 14))
)
# Box plot of sepal length (cm) per species, boxes filled by species.
# Y-axis ticks every 0.5; the legend is hidden on this panel.
BpSl <- ggplot(iris, aes(x = Species, y = SepalLengthCm, fill = Species)) +
  geom_boxplot() +
  theme(legend.position = "none") +
  scale_y_continuous(
    name = "Sepal Length (cm)",
    breaks = seq(from = 0, to = 30, by = 0.5)
  )
# Box plot of sepal width (cm) per species, boxes filled by species.
# Y-axis ticks every 0.5; the legend is hidden on this panel.
BpSw <- ggplot(iris, aes(x = Species, y = SepalWidthCm, fill = Species)) +
  geom_boxplot() +
  theme(legend.position = "none") +
  scale_y_continuous(
    name = "Sepal Width (cm)",
    breaks = seq(from = 0, to = 30, by = 0.5)
  )
# Box plot of petal length (cm) per species, boxes filled by species.
# Y-axis ticks every 0.5; the legend is hidden on this panel.
BpPl <- ggplot(iris, aes(x = Species, y = PetalLengthCm, fill = Species)) +
  geom_boxplot() +
  theme(legend.position = "none") +
  scale_y_continuous(
    name = "Petal Length (cm)",
    breaks = seq(from = 0, to = 30, by = 0.5)
  )
# Box plot of petal width (cm) per species, boxes filled by species.
# Y-axis ticks every 0.5. This panel keeps its legend and carries its own
# title and x-axis label.
BpPw <- ggplot(iris, aes(x = Species, y = PetalWidthCm, fill = Species)) +
  geom_boxplot() +
  scale_y_continuous(
    name = "Petal Width (cm)",
    breaks = seq(from = 0, to = 30, by = 0.5)
  ) +
  ggtitle("Iris Box Plot") +
  xlab("Species")
# Lay the four box plots out on a 2-row grid under one shared heading.
# Each panel's own title is blanked so only the shared heading is shown.
grid.arrange(
  grobs = lapply(
    list(BpSl, BpSw, BpPl, BpPw),
    function(panel) panel + ggtitle("")
  ),
  nrow = 2,
  top = textGrob("Iris Sepal and Petal Box Plot", gp = gpar(fontsize = 14))
)
# Scatter plot of petal width against petal length. Points are coloured and
# shaped by species; a single linear-model fit over all points is overlaid.
ggplot(iris, mapping = aes(x = PetalLengthCm, y = PetalWidthCm)) +
  geom_point(mapping = aes(colour = Species, shape = Species)) +
  geom_smooth(method = "lm") +
  labs(
    x = "Petal Length",
    y = "Petal Width",
    title = "Iris Petal Length vs Width"
  )
## `geom_smooth()` using formula 'y ~ x'
# Scatter plot of sepal width against sepal length. Points are coloured and
# shaped by species; a single linear-model fit over all points is overlaid.
ggplot(iris, mapping = aes(x = SepalLengthCm, y = SepalWidthCm)) +
  geom_point(mapping = aes(colour = Species, shape = Species)) +
  geom_smooth(method = "lm") +
  labs(
    x = "Sepal Length",
    y = "Sepal Width",
    title = "Iris Sepal Length vs Width"
  )
## `geom_smooth()` using formula 'y ~ x'
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pairwise plot matrix of the four flower measurements: correlation
# coefficients in the upper triangle, smoothed scatter plots in the lower.
# Columns are selected by name because the first column of this data frame is
# the row identifier `Id`; positional `1:4` would include `Id` and leave out
# `PetalWidthCm`.
ggpairs(
  data = iris[c(
    "SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"
  )],
  title = "Iris Correlation Plot",
  upper = list(continuous = wrap("cor", size = 5)),
  lower = list(continuous = "smooth")
)