The Iris flower data set is a multivariate data set introduced by the British statistician and biologist Ronald Fisher in his 1936 paper “The use of multiple measurements in taxonomic problems” as an example of linear discriminant analysis. The dataset contains four features (the length and width of the sepals and petals) for 50 samples from each of three species of Iris (Iris setosa, Iris virginica, and Iris versicolor). These measurements were used to create a linear discriminant model to classify the species.

In this project, I practiced some data visualization techniques on the Iris dataset with R: frequency histograms, density plots, box plots, scatterplots, and a pairwise correlation plot.

Iris Flower Species


Load the required packages and import the Iris dataset

library(ggplot2)
library(readr)
library(gridExtra)
library(grid)
library(plyr)

# Read the Iris measurements from the CSV file in the working directory.
# Note: this assignment masks R's built-in `iris` dataset for the session.
iris <- read.csv(file = "./Iris.csv")

# Print 20 randomly chosen rows; no seed is set, so the selection differs
# from run to run.
iris[sample(nrow(iris), size = 20), ]
##      Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm         Species
## 112 112           6.4          2.7           5.3          1.9  Iris-virginica
## 34   34           5.5          4.2           1.4          0.2     Iris-setosa
## 96   96           5.7          3.0           4.2          1.2 Iris-versicolor
## 66   66           6.7          3.1           4.4          1.4 Iris-versicolor
## 23   23           4.6          3.6           1.0          0.2     Iris-setosa
## 102 102           5.8          2.7           5.1          1.9  Iris-virginica
## 137 137           6.3          3.4           5.6          2.4  Iris-virginica
## 81   81           5.5          2.4           3.8          1.1 Iris-versicolor
## 126 126           7.2          3.2           6.0          1.8  Iris-virginica
## 123 123           7.7          2.8           6.7          2.0  Iris-virginica
## 131 131           7.4          2.8           6.1          1.9  Iris-virginica
## 150 150           5.9          3.0           5.1          1.8  Iris-virginica
## 47   47           5.1          3.8           1.6          0.2     Iris-setosa
## 36   36           5.0          3.2           1.2          0.2     Iris-setosa
## 44   44           5.0          3.5           1.6          0.6     Iris-setosa
## 117 117           6.5          3.0           5.5          1.8  Iris-virginica
## 120 120           6.0          2.2           5.0          1.5  Iris-virginica
## 41   41           5.0          3.5           1.3          0.3     Iris-setosa
## 114 114           5.7          2.5           5.0          2.0  Iris-virginica
## 33   33           5.2          4.1           1.5          0.1     Iris-setosa

Frequency analysis with Histogram

# Frequency histograms of the four measurements, with bars filled by species.
# A dashed grey line marks the overall mean of each measurement. The mean is
# passed to geom_vline() as a constant `xintercept` instead of being mapped
# inside aes(): a mapped value is recycled over every row of `iris`, which
# draws the same line once per observation.

# For Sepal length (cm)

HisSl <- ggplot(data = iris, aes(x = SepalLengthCm)) +
  geom_histogram(binwidth = 0.2, color = "black", aes(fill = Species)) +
  xlab("Sepal Length (cm)") +
  ylab("Frequency") +
  theme(legend.position = "none") +
  ggtitle("Histogram of Sepal Length") +
  geom_vline(xintercept = mean(iris$SepalLengthCm),
             linetype = "dashed", color = "grey")

# For Sepal width (cm)

HistSw <- ggplot(data = iris, aes(x = SepalWidthCm)) +
  geom_histogram(binwidth = 0.2, color = "black", aes(fill = Species)) +
  xlab("Sepal Width (cm)") +
  ylab("Frequency") +
  theme(legend.position = "none") +
  ggtitle("Histogram of Sepal Width") +
  geom_vline(xintercept = mean(iris$SepalWidthCm),
             linetype = "dashed", color = "grey")

# For Petal length (cm)

HistPl <- ggplot(data = iris, aes(x = PetalLengthCm)) +
  geom_histogram(binwidth = 0.2, color = "black", aes(fill = Species)) +
  xlab("Petal Length (cm)") +
  ylab("Frequency") +
  theme(legend.position = "none") +
  ggtitle("Histogram of Petal Length") +
  geom_vline(xintercept = mean(iris$PetalLengthCm),
             linetype = "dashed", color = "grey")

# For Petal width (cm)
# Only this panel keeps its legend, so the species key appears once in the grid.

HistPw <- ggplot(data = iris, aes(x = PetalWidthCm)) +
  geom_histogram(binwidth = 0.2, color = "black", aes(fill = Species)) +
  xlab("Petal Width (cm)") +
  ylab("Frequency") +
  theme(legend.position = "right") +
  ggtitle("Histogram of Petal Width") +
  geom_vline(xintercept = mean(iris$PetalWidthCm),
             linetype = "dashed", color = "grey")


# Plot all visualizations in a grid
# The per-plot titles are blanked here so that only the shared heading shows.

grid.arrange(HisSl + ggtitle(""),
             HistSw + ggtitle(""),
             HistPl + ggtitle(""),
             HistPw + ggtitle(""),
             nrow = 2,
             top = textGrob("Iris Frequency Histogram",
                            gp = gpar(fontsize = 14))
)

Density analysis with Histogram

# Per-species density curves for each measurement, with a dashed grey line at
# the overall mean (all species pooled). The line is drawn with a constant
# colour and a constant `xintercept`: a `colour = Species` mapping on this
# layer would be overridden by the fixed grey colour, and a mean mapped inside
# aes() would be recycled over every row, drawing one line per observation.

DhistPl <- ggplot(iris, aes(x = PetalLengthCm, colour = Species, fill = Species)) +
  geom_density(alpha = .3) +
  geom_vline(xintercept = mean(iris$PetalLengthCm),
             linetype = "dashed", color = "grey", size = 1) +
  xlab("Petal Length (cm)") +
  ylab("Density") +
  theme(legend.position = "none")

# Only this panel keeps its legend, so the species key appears once in the grid.
DhistPw <- ggplot(iris, aes(x = PetalWidthCm, colour = Species, fill = Species)) +
  geom_density(alpha = .3) +
  geom_vline(xintercept = mean(iris$PetalWidthCm),
             linetype = "dashed", color = "grey", size = 1) +
  xlab("Petal Width (cm)") +
  ylab("Density")

DhistSw <- ggplot(iris, aes(x = SepalWidthCm, colour = Species, fill = Species)) +
  geom_density(alpha = .3) +
  geom_vline(xintercept = mean(iris$SepalWidthCm),
             linetype = "dashed", color = "grey", size = 1) +
  xlab("Sepal Width (cm)") +
  ylab("Density") +
  theme(legend.position = "none")

DhistSl <- ggplot(iris, aes(x = SepalLengthCm, colour = Species, fill = Species)) +
  geom_density(alpha = .3) +
  geom_vline(xintercept = mean(iris$SepalLengthCm),
             linetype = "dashed", color = "grey", size = 1) +
  xlab("Sepal Length (cm)") +
  ylab("Density") +
  theme(legend.position = "none")

# Arrange the four panels in a 2 x 2 grid under one shared heading.
grid.arrange(DhistSl + ggtitle(""),
             DhistSw + ggtitle(""),
             DhistPl + ggtitle(""),
             DhistPw + ggtitle(""),
             nrow = 2,
             top = textGrob("Iris Density Histogram", gp = gpar(fontsize = 14))
)

Sepal and Petal Box Plot

# Box plots of each measurement grouped by species. Every panel uses y-axis
# breaks at 0.5 intervals; the legend is hidden on the first three panels and
# kept on the last one.

BpSl <- ggplot(iris, aes(x = Species, y = SepalLengthCm, fill = Species)) +
  geom_boxplot() +
  scale_y_continuous(name = "Sepal Length (cm)",
                     breaks = seq(from = 0, to = 30, by = 0.5)) +
  theme(legend.position = "none")

BpSw <- ggplot(iris, aes(x = Species, y = SepalWidthCm, fill = Species)) +
  geom_boxplot() +
  scale_y_continuous(name = "Sepal Width (cm)",
                     breaks = seq(from = 0, to = 30, by = 0.5)) +
  theme(legend.position = "none")

BpPl <- ggplot(iris, aes(x = Species, y = PetalLengthCm, fill = Species)) +
  geom_boxplot() +
  scale_y_continuous(name = "Petal Length (cm)",
                     breaks = seq(from = 0, to = 30, by = 0.5)) +
  theme(legend.position = "none")

BpPw <- ggplot(iris, aes(x = Species, y = PetalWidthCm, fill = Species)) +
  geom_boxplot() +
  scale_y_continuous(name = "Petal Width (cm)",
                     breaks = seq(from = 0, to = 30, by = 0.5)) +
  ggtitle("Iris Box Plot") +
  xlab("Species")

# Blank each panel's own title and place the four panels in a 2 x 2 grid
# under a single shared heading.
grid.arrange(
  BpSl + ggtitle(""),
  BpSw + ggtitle(""),
  BpPl + ggtitle(""),
  BpPw + ggtitle(""),
  nrow = 2,
  top = textGrob("Iris Sepal and Petal Box Plot", gp = gpar(fontsize = 14))
)

Petal Length vs Width Scatterplot

# Petal length against petal width. Points are coloured and shaped by species;
# one linear fit is drawn over all observations together.
ggplot(data = iris, mapping = aes(x = PetalLengthCm, y = PetalWidthCm)) +
  geom_point(mapping = aes(color = Species, shape = Species)) +
  geom_smooth(method = "lm") +
  labs(x = "Petal Length",
       y = "Petal Width",
       title = "Iris Petal Length vs Width")
## `geom_smooth()` using formula 'y ~ x'

Sepal Length vs Width

# Sepal length against sepal width. Points are coloured and shaped by species;
# one linear fit is drawn over all observations together.
ggplot(data = iris, mapping = aes(x = SepalLengthCm, y = SepalWidthCm)) +
  geom_point(mapping = aes(color = Species, shape = Species)) +
  geom_smooth(method = "lm") +
  labs(x = "Sepal Length",
       y = "Sepal Width",
       title = "Iris Sepal Length vs Width")
## `geom_smooth()` using formula 'y ~ x'

Correlation Plot

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Pairwise plot matrix of the four flower measurements: correlation
# coefficients in the upper panels, smoothed scatterplots in the lower panels.
# The columns are selected by name because the first column of the imported
# CSV is the row `Id`; positional selection with iris[1:4] would plot `Id`
# and leave out `PetalWidthCm`.
ggpairs(data = iris[c("SepalLengthCm", "SepalWidthCm",
                      "PetalLengthCm", "PetalWidthCm")],
        title = "Iris Correlation Plot",
        upper = list(continuous = wrap("cor", size = 5)),
        lower = list(continuous = "smooth")
)