# Data Mining - ggplot2
# http://docs.ggplot2.org/current
library(ggplot2)
library(dplyr)
# Preview the first rows of the iris data set.
head(x = iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Scatter plots
# 1. Sepal.Width against Sepal.Length for iris; the base plot carries
#    only the x/y mapping and each statement below adds its own layer.
x <- ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Sepal.Width))
# Plain points.
x + geom_point()
# Points coloured by Species.
x + geom_point(mapping = aes(colour = Species))
# Colour by Species and point size by Petal.Width.
# Colour grouping needs a factor!
x + geom_point(mapping = aes(colour = Species, size = Petal.Width))
# 2. Build the points of y = x / (x + 1) for x = 1, ..., 50.
x <- 1:50
# Arithmetic is vectorised, so the whole sequence is computed in one
# expression instead of an element-by-element sapply() call.
y <- x / (x + 1)
# data.frame() takes the column names "x" and "y" from the variable names.
df <- data.frame(x, y)
head(df)
## x y
## 1 1 0.5000000
## 2 2 0.6666667
## 3 3 0.7500000
## 4 4 0.8000000
## 5 5 0.8333333
## 6 6 0.8571429
# Scatter plot of the x/y columns held in `df`.
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point()
# 3. Diamond price against carat, points coloured by the `color` column.
head(x = diamonds)
## # A tibble: 6 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point(mapping = aes(color = color))
# Histograms
# 1. Distribution of Sepal.Length in iris.
x <- ggplot(data = iris, mapping = aes(x = Sepal.Length))
# Bins of width 0.1.
x + geom_histogram(binwidth = 0.1)
# Same bins, bars filled by Species.
x + geom_histogram(mapping = aes(fill = Species), binwidth = 0.1)
# Semi-transparent red bars; no binwidth is supplied here, which is why
# stat_bin() prints the message recorded below.
x + geom_histogram(fill = "red", alpha = 0.3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# 2. Distribution of carat in diamonds: a histogram, then a density curve.
ggplot(data = diamonds) +
  geom_histogram(mapping = aes(x = carat))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = diamonds) +
  geom_density(mapping = aes(x = carat), fill = "pink")
# Line Charts
# 1. Time series taken from the `economics` data set.
head(x = economics)
## # A tibble: 6 × 6
## date pce pop psavert uempmed unemploy
## <date> <dbl> <int> <dbl> <dbl> <int>
## 1 1967-07-01 507.4 198712 12.5 4.5 2944
## 2 1967-08-01 510.5 198911 12.5 4.7 2945
## 3 1967-09-01 516.3 199113 11.7 4.6 2958
## 4 1967-10-01 512.9 199311 12.5 4.9 3143
## 5 1967-11-01 518.1 199498 12.5 4.7 3066
## 6 1967-12-01 525.8 199657 12.1 4.8 3018
# The base plot only binds the data; every layer supplies its own mapping.
x <- ggplot(data = economics)
# unemploy over date with default line settings.
x + geom_line(mapping = aes(x = date, y = unemploy))
# Same series as a thick blue line.
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
# `size` is kept so the script still works on older ggplot2 versions.
x + geom_line(mapping = aes(x = date, y = unemploy), colour = "blue", size = 2)
# Two series on one plot: unemploy with linetype 2, pce in a custom colour.
x +
  geom_line(mapping = aes(x = date, y = unemploy), linetype = 2) +
  geom_line(mapping = aes(x = date, y = pce), colour = "#CC79A7")
# 2. Circumference against age in the Orange data set, one line per Tree.
head(x = Orange)
## Tree age circumference
## 1 1 118 30
## 2 1 484 58
## 3 1 664 87
## 4 1 1004 115
## 5 1 1231 120
## 6 1 1372 142
x <- ggplot(data = Orange, mapping = aes(x = age, y = circumference))
# Lines coloured by Tree.
x + geom_line(mapping = aes(colour = Tree))
# The colour mapping lives on the line layer only, so the point layer
# does not receive it.
x + geom_line(mapping = aes(colour = Tree)) + geom_point()
# With colour in the plot-level mapping, both layers share it.
ggplot(
  data = Orange,
  mapping = aes(x = age, y = circumference, colour = Tree)
) +
  geom_line() +
  geom_point()
# 3. Population by month with one line per year, for 2000 onwards.
library(lubridate)
# Keep only the rows dated 2000 or later.
eco <- economics[which(year(economics$date) >= 2000), ]
head(eco)
## # A tibble: 6 × 6
## date pce pop psavert uempmed unemploy
## <date> <dbl> <int> <dbl> <dbl> <int>
## 1 2000-01-01 6564.7 280976 4.7 5.8 5708
## 2 2000-02-01 6648.7 281190 4.2 6.1 5858
## 3 2000-03-01 6714.8 281409 3.9 6.0 5733
## 4 2000-04-01 6701.0 281653 4.4 6.1 5481
## 5 2000-05-01 6737.2 281877 4.2 5.8 5758
## 6 2000-06-01 6773.6 282126 4.2 5.7 5651
summary(eco)
## date pce pop psavert
## Min. :2000-01-01 Min. : 6565 Min. :280976 Min. : 1.900
## 1st Qu.:2003-10-24 1st Qu.: 7935 1st Qu.:291749 1st Qu.: 3.800
## Median :2007-08-16 Median : 9744 Median :302407 Median : 4.700
## Mean :2007-08-16 Mean : 9420 Mean :301802 Mean : 4.763
## 3rd Qu.:2011-06-08 3rd Qu.:10689 3rd Qu.:311977 3rd Qu.: 5.600
## Max. :2015-04-01 Max. :12162 Max. :320887 Max. :10.500
## uempmed unemploy
## Min. : 5.20 Min. : 5481
## 1st Qu.: 8.60 1st Qu.: 7391
## Median :10.00 Median : 8576
## Mean :12.38 Mean : 9631
## 3rd Qu.:17.00 3rd Qu.:12115
## Max. :25.20 Max. :15352
# Year as a factor so it can drive a discrete colour and the line grouping.
eco$year <- factor(year(eco$date))
# Month as a label; TRUE is spelled out because T can be reassigned.
eco$month <- month(eco$date, label = TRUE)
g <- ggplot(eco, aes(x = month, y = pop / 1000))
# group = year draws one line per year across the month axis.
g + geom_line(aes(color = year, group = year)) +
  labs(title = "Population Growth", x = "Month", y = "Population")
# Bar Charts
# 1. Row counts per cyl value in mtcars.
head(x = mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# cyl is wrapped in factor() so it is treated as discrete.
x <- ggplot(data = mtcars, mapping = aes(x = factor(cyl)))
# Bars filled by the same cyl factor.
x + geom_bar(mapping = aes(fill = factor(cyl)), width = 0.5)
# Bars filled by gear instead.
x + geom_bar(mapping = aes(fill = factor(gear)), width = 0.5)
# The gear-filled chart with the axes flipped.
x +
  geom_bar(mapping = aes(fill = factor(gear)), width = 0.5) +
  coord_flip()
# 2. Bars that use the mpg values themselves (stat = "identity") rather
#    than row counts.
x <- ggplot(data = mtcars, mapping = aes(x = factor(cyl), y = mpg))
x +
  geom_bar(mapping = aes(fill = factor(cyl)), stat = "identity", width = 0.5)
# 3. Binned price distribution of diamonds, filled by cut.
head(diamonds)
## # A tibble: 6 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
summary(diamonds)
## carat cut color clarity
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066
## Max. :5.0100 I: 5422 VVS1 : 3655
## J: 2808 (Other): 2531
## depth table price x
## Min. :43.00 Min. :43.00 Min. : 326 Min. : 0.000
## 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710
## Median :61.80 Median :57.00 Median : 2401 Median : 5.700
## Mean :61.75 Mean :57.46 Mean : 3933 Mean : 5.731
## 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540
## Max. :79.00 Max. :95.00 Max. :18823 Max. :10.740
##
## y z
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 4.720 1st Qu.: 2.910
## Median : 5.710 Median : 3.530
## Mean : 5.735 Mean : 3.539
## 3rd Qu.: 6.540 3rd Qu.: 4.040
## Max. :58.900 Max. :31.800
##
x <- ggplot(diamonds, aes(price))
# geom_bar() no longer has a `binwidth` parameter and warns to use
# geom_histogram() instead, so the binned bars are drawn with
# geom_histogram() directly.
# Default position: cuts stacked within each 3000-wide price bin.
x + geom_histogram(aes(fill = cut), binwidth = 3000)
# position = "dodge": cuts placed side by side within each bin.
x + geom_histogram(aes(fill = cut), binwidth = 3000, position = "dodge")
# position = "fill": each bin scaled to full height to show proportions.
x + geom_histogram(aes(fill = cut), binwidth = 3000, position = "fill")
# Pie Charts
# One bar at the constant x = factor(1), filled by cyl, then drawn in
# polar coordinates with theta mapped to y.
x <- ggplot(data = mtcars, mapping = aes(x = factor(1), fill = factor(cyl)))
x +
  geom_bar(width = 1) +
  coord_polar(theta = "y")
# Box / Violin Plots
# 1. mpg per cyl value in mtcars.
x <- ggplot(data = mtcars, mapping = aes(x = factor(cyl), y = mpg))
# Plain box plot.
x + geom_boxplot()
# Box plot with the jittered observations drawn on top.
x + geom_boxplot() + geom_jitter()
# Boxes filled by cyl, with outliers shown as size-4 red points.
x + geom_boxplot(
  mapping = aes(fill = factor(cyl)),
  outlier.colour = "red",
  outlier.size = 4
)
# 2. carat in diamonds.
# x = 1: a single box covering the whole data set.
ggplot(data = diamonds, mapping = aes(x = 1, y = carat)) +
  geom_boxplot()
# x = a factor: one box per cut.
ggplot(data = diamonds, mapping = aes(x = cut, y = carat)) +
  geom_boxplot()
# The same grouping drawn as violins.
ggplot(data = diamonds, mapping = aes(x = cut, y = carat)) +
  geom_violin()
# Violins with the individual observations added as points.
ggplot(data = diamonds, mapping = aes(x = cut, y = carat)) +
  geom_violin() +
  geom_point()
# Facets
# Preview and summarise diamonds before splitting plots into panels.
head(x = diamonds)
## # A tibble: 6 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
summary(object = diamonds)
## carat cut color clarity
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066
## Max. :5.0100 I: 5422 VVS1 : 3655
## J: 2808 (Other): 2531
## depth table price x
## Min. :43.00 Min. :43.00 Min. : 326 Min. : 0.000
## 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710
## Median :61.80 Median :57.00 Median : 2401 Median : 5.700
## Mean :61.75 Mean :57.46 Mean : 3933 Mean : 5.731
## 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540
## Max. :79.00 Max. :95.00 Max. :18823 Max. :10.740
##
## y z
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 4.720 1st Qu.: 2.910
## Median : 5.710 Median : 3.530
## Mean : 5.735 Mean : 3.539
## 3rd Qu.: 6.540 3rd Qu.: 4.040
## Max. :58.900 Max. :31.800
##
# 1. Price histograms split into panels.
x <- ggplot(data = diamonds, mapping = aes(x = price))
# Ten bins, one wrapped panel per cut.
x +
  geom_histogram(bins = 10) +
  facet_wrap(~ cut)
# Bins of width 3000, one wrapped panel per cut.
x +
  geom_histogram(binwidth = 3000) +
  facet_wrap(~ cut)
# Grid layout with cut on the column side of the formula.
x +
  geom_histogram(binwidth = 3000) +
  facet_grid(. ~ cut)
# Grid layout with color on the row side and cut on the column side.
x +
  geom_histogram(binwidth = 3000) +
  facet_grid(color ~ cut)
# 2. Price against carat, coloured by the `color` column and split into panels.
# `<-` is used for assignment, matching the rest of the script.
d <- ggplot(diamonds, aes(x = carat, y = price))
# All points in a single panel.
d + geom_point(aes(color = color))
# One wrapped panel per color value.
d + geom_point(aes(color = color)) + facet_wrap(~color)
# Grid of panels: cut on the row side, clarity on the column side.
d + geom_point(aes(color = color)) + facet_grid(cut ~ clarity)
# Density
# Density curves of Petal.Length in iris.
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# `<-` is used for assignment, matching the rest of the script.
x <- ggplot(iris, aes(Petal.Length))
# A single curve over all rows.
x + geom_density()
# One curve per Species, distinguished by line colour.
x + geom_density(aes(color = Species))