# Data Mining - ggplot2
# Reference: http://docs.ggplot2.org/current
library(ggplot2)
library(dplyr)
# Preview the first rows of the built-in iris data set.
head(x = iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Scatter plots

# 1. Sepal length against sepal width.
# The base plot (data + position mapping) is built once and reused by every
# variant below; each variant only adds a point layer.
x <- ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Sepal.Width))

# Plain points.
x + geom_point()

# Points coloured by species.
x + geom_point(mapping = aes(colour = Species))

# Colour by species and scale point size by petal width.
# Original note: distinguishing groups by colour needs a factor
# (Species is a factor in iris).
x + geom_point(mapping = aes(colour = Species, size = Petal.Width))

# 2. Build a small data frame for the sequence y = x / (x + 1), x = 1..50.
x <- 1:50
# Arithmetic is vectorised in R, so the whole vector is transformed in one
# expression instead of calling a function per element with sapply().
y <- x / (x + 1)
df <- data.frame(x, y)
head(df)
##   x         y
## 1 1 0.5000000
## 2 2 0.6666667
## 3 3 0.7500000
## 4 4 0.8000000
## 5 5 0.8333333
## 6 6 0.8571429
# Draw the (x, y) pairs held in df as points.
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point()

# 3. Diamond price against carat, one colour per colour grade.
head(x = diamonds)
## # A tibble: 6 × 10
##   carat       cut color clarity depth table price     x     y     z
##   <dbl>     <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23     Ideal     E     SI2  61.5    55   326  3.95  3.98  2.43
## 2  0.21   Premium     E     SI1  59.8    61   326  3.89  3.84  2.31
## 3  0.23      Good     E     VS1  56.9    65   327  4.05  4.07  2.31
## 4  0.29   Premium     I     VS2  62.4    58   334  4.20  4.23  2.63
## 5  0.31      Good     J     SI2  63.3    58   335  4.34  4.35  2.75
## 6  0.24 Very Good     J    VVS2  62.8    57   336  3.94  3.96  2.48
# The position mapping lives on the plot; the colour mapping is local to the
# point layer.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point(mapping = aes(color = color))

# Histograms

# 1. Distribution of iris sepal length.
x <- ggplot(iris, aes(Sepal.Length))

# Fixed bin width of 0.1.
x + geom_histogram(binwidth = 0.1)

# Same bins, with each bar split by species through the fill aesthetic.
x + geom_histogram(aes(fill = Species), binwidth = 0.1)

# bins = 30 is the stat_bin() default. Stating it explicitly keeps the plot
# unchanged while avoiding the "Pick better value with `binwidth`" message.
x + geom_histogram(bins = 30, fill = "red", alpha = 0.3)

# 2. Distribution of diamond carat: histogram (explicit default of 30 bins),
# then a filled density estimate of the same variable.
ggplot(diamonds) + geom_histogram(aes(x = carat), bins = 30)

ggplot(diamonds) + geom_density(aes(x = carat), fill = "pink")

# Line Charts

# 1. Time series from the economics data set.
head(x = economics)
## # A tibble: 6 × 6
##         date   pce    pop psavert uempmed unemploy
##       <date> <dbl>  <int>   <dbl>   <dbl>    <int>
## 1 1967-07-01 507.4 198712    12.5     4.5     2944
## 2 1967-08-01 510.5 198911    12.5     4.7     2945
## 3 1967-09-01 516.3 199113    11.7     4.6     2958
## 4 1967-10-01 512.9 199311    12.5     4.9     3143
## 5 1967-11-01 518.1 199498    12.5     4.7     3066
## 6 1967-12-01 525.8 199657    12.1     4.8     3018
# Only the data is bound to the base plot; every line layer below supplies
# its own x/y mapping.
x <- ggplot(data = economics)

# Unemployment over time.
x + geom_line(mapping = aes(x = date, y = unemploy))

# Same series drawn as a thicker blue line.
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
# confirm the target ggplot2 version before switching.
x + geom_line(mapping = aes(x = date, y = unemploy), colour = "blue", size = 2)

# Two series on one panel: unemployment (linetype 2) and pce in a custom
# colour. Both share the same y axis.
x +
  geom_line(mapping = aes(x = date, y = unemploy), linetype = 2) +
  geom_line(mapping = aes(x = date, y = pce), colour = "#CC79A7")

# 2. Growth curves from the Orange data set: circumference against age.
head(x = Orange)
##   Tree  age circumference
## 1    1  118            30
## 2    1  484            58
## 3    1  664            87
## 4    1 1004           115
## 5    1 1231           120
## 6    1 1372           142
x <- ggplot(data = Orange, mapping = aes(x = age, y = circumference))

# One coloured line per tree.
x + geom_line(mapping = aes(colour = Tree))

# Colour is mapped only inside the line layer, so the added points do not
# inherit it.
x + geom_line(mapping = aes(colour = Tree)) + geom_point()

# Colour is mapped at the plot level here, so both the lines and the points
# inherit it.
ggplot(
  data = Orange,
  mapping = aes(x = age, y = circumference, colour = Tree)
) +
  geom_line() +
  geom_point()

# 3. Month-by-month population, one line per year, from 2000 onwards.
library(lubridate)
# Keep only the observations dated 2000 or later.
eco <- economics[which(year(economics$date) >= 2000), ]
head(eco)
## # A tibble: 6 × 6
##         date    pce    pop psavert uempmed unemploy
##       <date>  <dbl>  <int>   <dbl>   <dbl>    <int>
## 1 2000-01-01 6564.7 280976     4.7     5.8     5708
## 2 2000-02-01 6648.7 281190     4.2     6.1     5858
## 3 2000-03-01 6714.8 281409     3.9     6.0     5733
## 4 2000-04-01 6701.0 281653     4.4     6.1     5481
## 5 2000-05-01 6737.2 281877     4.2     5.8     5758
## 6 2000-06-01 6773.6 282126     4.2     5.7     5651
summary(eco)
##       date                 pce             pop            psavert
##  Min.   :2000-01-01   Min.   : 6565   Min.   :280976   Min.   : 1.900
##  1st Qu.:2003-10-24   1st Qu.: 7935   1st Qu.:291749   1st Qu.: 3.800
##  Median :2007-08-16   Median : 9744   Median :302407   Median : 4.700
##  Mean   :2007-08-16   Mean   : 9420   Mean   :301802   Mean   : 4.763
##  3rd Qu.:2011-06-08   3rd Qu.:10689   3rd Qu.:311977   3rd Qu.: 5.600
##  Max.   :2015-04-01   Max.   :12162   Max.   :320887   Max.   :10.500
##     uempmed         unemploy
##  Min.   : 5.20   Min.   : 5481
##  1st Qu.: 8.60   1st Qu.: 7391
##  Median :10.00   Median : 8576
##  Mean   :12.38   Mean   : 9631
##  3rd Qu.:17.00   3rd Qu.:12115
##  Max.   :25.20   Max.   :15352
# Derived columns: the year as a factor (used for colour and grouping) and
# the month as a labelled value (month name) for the x axis.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
eco$year <- factor(year(eco$date))
eco$month <- month(eco$date, label = TRUE)

# pop is recorded in thousands (see the ggplot2 `economics` documentation),
# so pop / 1000 is in millions; the axis label states that unit.
g <- ggplot(eco, aes(x = month, y = pop / 1000))
# group = year draws one line per year across the months.
g +
  geom_line(aes(color = year, group = year)) +
  labs(title = "Population Growth", x = "Month", y = "Population (millions)")

# Bar Charts

# 1. Number of cars per cylinder count in mtcars.
head(x = mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
# cyl is numeric in mtcars, so it is wrapped in factor() to get one discrete
# bar per cylinder count.
x <- ggplot(data = mtcars, mapping = aes(x = factor(cyl)))

# Each bar filled by its own cylinder level.
x + geom_bar(mapping = aes(fill = factor(cyl)), width = 0.5)

# Each bar split by number of gears.
x + geom_bar(mapping = aes(fill = factor(gear)), width = 0.5)

# Same chart with the axes swapped, giving horizontal bars.
x +
  geom_bar(mapping = aes(fill = factor(gear)), width = 0.5) +
  coord_flip()

# 2. Bars whose height comes from the data instead of a row count.
# stat = "identity" uses mpg itself as the bar height; rows sharing a
# cylinder level are stacked on top of each other.
x <- ggplot(data = mtcars, mapping = aes(x = factor(cyl), y = mpg))
x + geom_bar(mapping = aes(fill = factor(cyl)), stat = "identity", width = 0.5)

# 3. Diamond price distribution in 3000-wide bins, filled by cut.
head(diamonds)
## # A tibble: 6 × 10
##   carat       cut color clarity depth table price     x     y     z
##   <dbl>     <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23     Ideal     E     SI2  61.5    55   326  3.95  3.98  2.43
## 2  0.21   Premium     E     SI1  59.8    61   326  3.89  3.84  2.31
## 3  0.23      Good     E     VS1  56.9    65   327  4.05  4.07  2.31
## 4  0.29   Premium     I     VS2  62.4    58   334  4.20  4.23  2.63
## 5  0.31      Good     J     SI2  63.3    58   335  4.34  4.35  2.75
## 6  0.24 Very Good     J    VVS2  62.8    57   336  3.94  3.96  2.48
summary(diamonds)
##      carat               cut        color        clarity
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066
##  Max.   :5.0100                     I: 5422   VVS1   : 3655
##                                     J: 2808   (Other): 2531
##      depth           table           price             x
##  Min.   :43.00   Min.   :43.00   Min.   :  326   Min.   : 0.000
##  1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710
##  Median :61.80   Median :57.00   Median : 2401   Median : 5.700
##  Mean   :61.75   Mean   :57.46   Mean   : 3933   Mean   : 5.731
##  3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540
##  Max.   :79.00   Max.   :95.00   Max.   :18823   Max.   :10.740
##
##        y                z
##  Min.   : 0.000   Min.   : 0.000
##  1st Qu.: 4.720   1st Qu.: 2.910
##  Median : 5.710   Median : 3.530
##  Mean   : 5.735   Mean   : 3.539
##  3rd Qu.: 6.540   3rd Qu.: 4.040
##  Max.   :58.900   Max.   :31.800
##
# price is continuous, so it has to be binned. geom_bar() no longer accepts
# `binwidth` (it warned and ignored the argument), so geom_histogram() is
# used, as that warning recommends.
x <- ggplot(diamonds, aes(price))

# Stacked bars (the default position).
x + geom_histogram(aes(fill = cut), binwidth = 3000)

# Cuts side by side within each price bin.
x + geom_histogram(aes(fill = cut), binwidth = 3000, position = "dodge")

# Each bin rescaled to full height, showing the share of each cut.
x + geom_histogram(aes(fill = cut), binwidth = 3000, position = "fill")

# Pie Charts

# A pie chart is a single stacked bar drawn in polar coordinates:
# x = factor(1) puts every row into one bar, fill splits it by cylinder
# count, and theta = "y" maps the stacked counts to the angle.
x <- ggplot(data = mtcars, mapping = aes(x = factor(1), fill = factor(cyl)))
x +
  geom_bar(width = 1) +
  coord_polar(theta = "y")

# Box / Violin Plots

# 1. Fuel economy (mpg) per cylinder count in mtcars.
x <- ggplot(data = mtcars, mapping = aes(x = factor(cyl), y = mpg))

# Plain box plots.
x + geom_boxplot()

# Box plots with the individual observations jittered on top.
x +
  geom_boxplot() +
  geom_jitter()

# Boxes filled by cylinder level, with outliers drawn as large red points.
x + geom_boxplot(
  mapping = aes(fill = factor(cyl)),
  outlier.colour = "red",
  outlier.size = 4
)

# 2. Carat distribution in the diamonds data set.

# x = 1: a constant x puts the whole data set into a single box.
ggplot(data = diamonds, mapping = aes(x = 1, y = carat)) +
  geom_boxplot()

# x = factor: one box per cut level.
ggplot(data = diamonds, mapping = aes(x = cut, y = carat)) +
  geom_boxplot()

# Violin plots of the same grouping.
ggplot(data = diamonds, mapping = aes(x = cut, y = carat)) +
  geom_violin()

# Violins with the raw observations drawn on top as points.
ggplot(data = diamonds, mapping = aes(x = cut, y = carat)) +
  geom_violin() +
  geom_point()

# Facets

# Inspect the diamonds data before splitting it into panels.
head(x = diamonds)
## # A tibble: 6 × 10
##   carat       cut color clarity depth table price     x     y     z
##   <dbl>     <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23     Ideal     E     SI2  61.5    55   326  3.95  3.98  2.43
## 2  0.21   Premium     E     SI1  59.8    61   326  3.89  3.84  2.31
## 3  0.23      Good     E     VS1  56.9    65   327  4.05  4.07  2.31
## 4  0.29   Premium     I     VS2  62.4    58   334  4.20  4.23  2.63
## 5  0.31      Good     J     SI2  63.3    58   335  4.34  4.35  2.75
## 6  0.24 Very Good     J    VVS2  62.8    57   336  3.94  3.96  2.48
summary(object = diamonds)
##      carat               cut        color        clarity
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066
##  Max.   :5.0100                     I: 5422   VVS1   : 3655
##                                     J: 2808   (Other): 2531
##      depth           table           price             x
##  Min.   :43.00   Min.   :43.00   Min.   :  326   Min.   : 0.000
##  1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710
##  Median :61.80   Median :57.00   Median : 2401   Median : 5.700
##  Mean   :61.75   Mean   :57.46   Mean   : 3933   Mean   : 5.731
##  3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540
##  Max.   :79.00   Max.   :95.00   Max.   :18823   Max.   :10.740
##
##        y                z
##  Min.   : 0.000   Min.   : 0.000
##  1st Qu.: 4.720   1st Qu.: 2.910
##  Median : 5.710   Median : 3.530
##  Mean   : 5.735   Mean   : 3.539
##  3rd Qu.: 6.540   3rd Qu.: 4.040
##  Max.   :58.900   Max.   :31.800
##
# 1. Price histograms split into panels.
x <- ggplot(data = diamonds, mapping = aes(x = price))

# One wrapped panel per cut, 10 bins each.
x +
  geom_histogram(bins = 10) +
  facet_wrap(~ cut)

# One wrapped panel per cut, bins 3000 wide.
x +
  geom_histogram(binwidth = 3000) +
  facet_wrap(~ cut)

# A single row of panels, one column per cut.
x +
  geom_histogram(binwidth = 3000) +
  facet_grid(. ~ cut)

# Full grid: rows by color, columns by cut.
x +
  geom_histogram(binwidth = 3000) +
  facet_grid(color ~ cut)

# 2. Price against carat, coloured by colour grade, then split into panels.
# Assigned with `<-` to match the assignment style used in the rest of the
# file.
d <- ggplot(diamonds, aes(x = carat, y = price))

# All diamonds in one panel.
d + geom_point(aes(color = color))

# One wrapped panel per colour grade.
d + geom_point(aes(color = color)) + facet_wrap(~color)

# Grid of panels: rows by cut, columns by clarity.
d + geom_point(aes(color = color)) + facet_grid(cut ~ clarity)

# Density

# Kernel density estimates of iris petal length.
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Assigned with `<-` to match the assignment style used in the rest of the
# file.
x <- ggplot(iris, aes(Petal.Length))

# One density curve over all observations.
x + geom_density()

# One density curve per species, distinguished by colour.
x + geom_density(aes(color = Species))