# 정형 데이터 분석 (Structured data analysis)
# 1. Barplot
# 1-1. Comparing land area and population density by country

# Load the per-country table used throughout section 1-1.
d <- read.csv(file = "data/nations_land_area.csv", sep = ",")

# Structural overview: column names, dimensions, column types.
colnames(d)
## [1] "Nation"     "LandArea"   "PopDensity"
dim(d)   # number of rows, then number of columns
## [1] 17  3
str(d)
## 'data.frame':    17 obs. of  3 variables:
##  $ Nation    : Factor w/ 17 levels "Austrailia","Bangladesh",..: 11 2 16 10 17 3 5 7 8 9 ...
##  $ LandArea  : int  99828 144000 36190 377910 9632030 9984670 43090 551500 357050 301340 ...
##  $ PopDensity: int  485 985 632 339 31 3 126 110 232 193 ...
# Rows reordered by the Nation column (ascending).
d[order(d[["Nation"]]), ]
##         Nation LandArea PopDensity
## 15  Austrailia  7741220          3
## 2   Bangladesh   144000        985
## 6       Canada  9984670          3
## 17      Chaina  9598088        140
## 7      Denmark    43090        126
## 14     England   243610        246
## 8       France   551500        110
## 9      Germany   357050        232
## 10       Italy   301340        193
## 4        Japan   377910        339
## 1        Korea    99828        485
## 11 Netherlands    41530        392
## 16  NewZealand   267710         15
## 12      Sweden   450290         20
## 13       Swiss    41280        176
## 3       Taiwan    36190        632
## 5          USA  9632030         31
# Rows reordered by LandArea, largest first.
# TRUE is spelled out because the alias T is an ordinary, reassignable variable.
d[order(d$LandArea, decreasing = TRUE), ]
##         Nation LandArea PopDensity
## 6       Canada  9984670          3
## 5          USA  9632030         31
## 17      Chaina  9598088        140
## 15  Austrailia  7741220          3
## 8       France   551500        110
## 12      Sweden   450290         20
## 4        Japan   377910        339
## 9      Germany   357050        232
## 10       Italy   301340        193
## 16  NewZealand   267710         15
## 14     England   243610        246
## 2   Bangladesh   144000        985
## 1        Korea    99828        485
## 7      Denmark    43090        126
## 11 Netherlands    41530        392
## 13       Swiss    41280        176
## 3       Taiwan    36190        632
d[["Nation"]]
##  [1] Korea       Bangladesh  Taiwan      Japan       USA        
##  [6] Canada      Denmark     France      Germany     Italy      
## [11] Netherlands Sweden      Swiss       England     Austrailia 
## [16] NewZealand  Chaina     
## 17 Levels: Austrailia Bangladesh Canada Chaina Denmark England ... USA
# gsub(pattern = " ", replacement = "", x = d$Nation)   # (disabled) replace text matching a pattern

# Add a Group column: rows whose LandArea exceeds 500000 become "Group_A",
# all other rows become "Group_B".
d[["Group"]] <- with(d, ifelse(LandArea > 500000, "Group_A", "Group_B"))
# d[d$PopDensity > 500, ]$PopDensity <- 500   # (disabled) overwrite a column value

# Frequency of each distinct PopDensity value.
table(d[["PopDensity"]])
## 
##   3  15  20  31 110 126 140 176 193 232 246 339 392 485 632 985 
##   2   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
# Number of countries in each land-area group.
counts <- table(d$Group)
counts
## 
## Group_A Group_B 
##       5      12
barplot(counts, main = "Land Area Group", xlab = "500,000(km2) break")

# Population density per country, in file order. barplot() returns the bar
# midpoints, which are reused as x positions for the value labels.
bp <- barplot(d$PopDensity, names.arg = d$Nation, main = "Population Density",
              col = "lightcyan", ylim = c(0, 1000), cex.names = 0.7, las = 2)
# Labels are placed at 90% of the bar height, i.e. just inside the top of each bar.
text(x = bp, y = d$PopDensity * 0.9, labels = d$PopDensity, col = "red", cex = 0.7)

# Sort : PopDensity (highest density first)
data <- d[order(d$PopDensity, decreasing = TRUE), ]
data
##         Nation LandArea PopDensity   Group
## 2   Bangladesh   144000        985 Group_B
## 3       Taiwan    36190        632 Group_B
## 1        Korea    99828        485 Group_B
## 11 Netherlands    41530        392 Group_B
## 4        Japan   377910        339 Group_B
## 14     England   243610        246 Group_B
## 9      Germany   357050        232 Group_B
## 10       Italy   301340        193 Group_B
## 13       Swiss    41280        176 Group_B
## 17      Chaina  9598088        140 Group_A
## 7      Denmark    43090        126 Group_B
## 8       France   551500        110 Group_A
## 5          USA  9632030         31 Group_A
## 12      Sweden   450290         20 Group_B
## 16  NewZealand   267710         15 Group_B
## 6       Canada  9984670          3 Group_A
## 15  Austrailia  7741220          3 Group_A
# Same density chart as before, now using the rows sorted by PopDensity.
# bp holds the bar midpoints returned by barplot(), used to position the labels.
bp <- barplot(data$PopDensity, names.arg = data$Nation, main = "Population Density",
              col = "lightcyan", ylim = c(0, 1000), cex.names = 0.7, las = 2)
# Labels are placed at 90% of the bar height, i.e. just inside the top of each bar.
text(x = bp, y = data$PopDensity * 0.9, labels = data$PopDensity, col = "red", cex = 0.7)

# 1-2. Public transportation usage by mode

# Text columns are kept as character vectors rather than factors.
d <- read.csv(
  file = "data/usage_public_transportation.csv",
  sep = ",",
  stringsAsFactors = FALSE
)

colnames(d)
##  [1] "YEAR" "SEP"  "LINE" "M01"  "M02"  "M03"  "M04"  "M05"  "M06"  "M07" 
## [11] "M08"  "M09"  "M10"  "M11"  "M12"
dim(d)
## [1]  9 15
# Rows reordered by the LINE column.
d[order(d[["LINE"]]), ]
##   YEAR    SEP  LINE      M01      M02      M03      M04      M05      M06
## 1 2012 subway LINE1  8341518  8255019  8830598  8306279  8856979  8208757
## 2 2012 subway LINE2 41414219 42585260 46188844 43828003 46251012 43411868
## 3 2012 subway LINE3 14661057 15119468 16547899 15623045 16595063 15635350
## 4 2012 subway LINE4 16402892 16762228 18931723 17958485 19240358 17674972
## 5 2012 subway LINE5 15534075 16002917 17586360 17084626 17854814 16618881
## 6 2012 subway LINE6  8409574  8699279 10071527  9644924 10368294  9505216
## 7 2012 subway LINE7 16873347 17260174 19304031 18509664 19527496 18149425
## 8 2012 subway LINE8  4294660  4394818  4917996  4729974  4945049  4577395
## 9 2012 subway LINE9  5810404  5940340  6471277  6536712  6600419  6337783
##        M07      M08      M09      M10      M11      M12
## 1  8387807  7879764  8145123  8645140  8827242  8605526
## 2 43857337 41621530 42609915 43897523 45093740 44722411
## 3 15707681 14685419 15346680 16089309 16384142 16265496
## 4 17218709 15850045 16850098 18155851 18494284 18242950
## 5 16558607 15480280 16041029 17104350 17352632 17250110
## 6  9223713  8666136  9343660 10002422 10010472  9596428
## 7 17872236 16833751 17708993 18880841 20721677 20374095
## 8  4485859  4207592  4456877  4651580  4746982  4794026
## 9  6491018  6177376  6359540  6801879  6853405  6814260
# Divisor used to express the raw M01 values in millions on the plots.
unitNumber <- 1000000

# M01 column scaled to millions; computed once and shared by both charts.
m01_scaled <- d$M01 / unitNumber


# Vertical Barplot

# bp receives the bar midpoints, used as x positions for the value labels.
bp <- barplot(m01_scaled, ylim = c(0, 50),
              names.arg = d$LINE, main = "Seoul Subway Usage per Line(January)",
              col = "lightcyan", cex.names = 0.7, las = 3,
              ylab = "people(/million)", xlab = "<Line>")

# Value labels sit at 95% of each bar's height.
text(x = bp, y = m01_scaled * 0.95, labels = round(m01_scaled, 2),
     col = "red", cex = 0.7)

# Horizontal Barplot

# With horiz = TRUE the bars run along x, so bp now holds y positions.
bp <- barplot(m01_scaled, xlim = c(0, 50),
              names.arg = d$LINE, main = "Seoul Subway Usage per Line(January)",
              col = "gray", cex.names = 0.7, las = 1,
              xlab = "people(/million)", ylab = "<Line>",
              horiz = TRUE)

# Value labels are offset 2 units to the right of each bar's end.
text(y = bp, x = m01_scaled + 2, labels = round(m01_scaled, 2),
     col = "red", cex = 0.7)

# 3. Line Chart : plot / type="o"
# The workspace is deliberately NOT cleared with rm(list = ls()) here: wiping
# the global environment from inside a script silently deletes the caller's
# objects when the file is sourced, and nothing below needs an empty workspace.

# The CSV is read with fileEncoding = "CP949"; strings stay as character vectors.
f <- read.csv("data/passengers_line2_station.csv", sep = ",",
              stringsAsFactors = FALSE, fileEncoding = "CP949")
head(f)
##   rank  station  y2013  y2012  y2011
## 1    1     강남 214355 207475 206712
## 2    2     잠실 150136 148506 149580
## 3    3     신림 146107 145983 147919
## 4    4 홍대입구 137019 126994 123841
## 5    5   신도림 130389 125671 117425
## 6    6     삼성 128295 138099 143675
# Full yearly columns.
# NOTE(review): a, b and c are only used just below in the visible script; they
# are kept in case later code relies on them. `c` also shadows base::c as a
# variable name (function calls to c() still resolve to the function).
a <- f$y2011
b <- f$y2012
c <- f$y2013

# Only the first n_top rows of the table are charted.
n_top <- 25
l <- head(f$station, n_top)
x <- head(a, n_top)
y <- head(b, n_top)
z <- head(c, n_top)

# Axes are suppressed here and drawn by hand so the x axis can carry station names.
plot(x, xlab = "", ylab = "", ylim = c(0, 250000), axes = FALSE, type = "o",
     col = "red", main = "2호선역 일평균 승객수(명)")
# seq_along(l) keeps tick positions in step with the number of labels even if
# the file has fewer than n_top rows; `labels` is spelled out in full.
axis(1, at = seq_along(l), labels = l, las = 2)
axis(2, las = 1)

# Grid: a horizontal line every 50000 and one vertical line per plotted station.
abline(h = seq(50000, 250000, by = 50000), v = seq_along(l), lty = 2)

lines(y, col = "blue", type = "o")
lines(z, col = "green", type = "o")

# Legend position and colours; the colour order matches the series drawn above
# (y2011 red, y2012 blue, y2013 green).
colors <- c("red", "blue", "green")
legend(5, 220000, c("2011년", "2012년", "2013년"), cex = 0.8, col = colors, lty = 1, lwd = 2)

# 4. Barplot : as.matrix
# read.csv() returns a data frame; it is shown as-is first, then as a matrix,
# which is the form passed to barplot() for grouped bars.
gangnam <- read.csv(file = "data/line2_gangnam.csv")
gangnam
##   T05.06 T06.07 T07.08 T08.09 T09.10 T10.11 T11.12 T12.13 T13.14 T14.15
## 1  15916  29785  59817  88993  71391  73463  95645 111468 145307 141569
## 2  13122  90820 216887 412832 304197 157500 134063 127664 153040 149648
##   T15.16 T16.17 T17.18 T18.19 T19.20 T20.21 T21.22 T22.23 T23.24 T24.01
## 1 162306 174384 228344 343805 265809 209983 258682 284400 130027  19676
## 2 141022 151285 187498 270297 240648 127347  85620  73533  50950  17279
mode(x = gangnam)   # a data frame reports storage mode "list"
## [1] "list"
as.matrix(x = gangnam)
##      T05.06 T06.07 T07.08 T08.09 T09.10 T10.11 T11.12 T12.13 T13.14 T14.15
## [1,]  15916  29785  59817  88993  71391  73463  95645 111468 145307 141569
## [2,]  13122  90820 216887 412832 304197 157500 134063 127664 153040 149648
##      T15.16 T16.17 T17.18 T18.19 T19.20 T20.21 T21.22 T22.23 T23.24 T24.01
## [1,] 162306 174384 228344 343805 265809 209983 258682 284400 130027  19676
## [2,] 141022 151285 187498 270297 240648 127347  85620  73533  50950  17279
# Grouped bar chart of the gangnam table, with values scaled to thousands.
# The bar colours are set explicitly and shared with the legend. For matrix
# input barplot() defaults to a grey palette with one colour per row, so the
# previous black/white legend swatches did not match the bars. Using
# gray.colors() keeps the bars looking exactly as before.
bar_cols <- gray.colors(nrow(gangnam))
barplot(as.matrix(gangnam) / 1000, main = "강남역 시간대별 승하차 현황",
        ylab = "인원수(천명)", beside = TRUE, las = 2, ylim = c(0, 500),
        col = bar_cols)

#abline(h=seq(3,400,10), col="white", lwd=2)
# Dashed horizontal guide lines every 50 (thousand) from 50 to 450.
abline(h = seq(50, 450, by = 50), lty = 2)

# Legend entries follow row order: first row, then second row.
legend("topright", c("승차", "하차"), cex = 0.8, fill = bar_cols)

# 5. Line Chart : plot / type="o"
# Per-station table with station, getIn and getOut columns; read as CP949,
# with strings kept as character vectors.
f <- read.csv(
  file = "data/passengers_seoulmetro_line2_total.csv",
  sep = ",",
  stringsAsFactors = FALSE,
  fileEncoding = "CP949"
)
f
##       station   getIn  getOut
## 1       시 청  780471  865364
## 2    을지입구 1425862 1438643
## 3       을지3  449109  480325
## 4     을지4가  411232  419762
## 5        동운  623871  707384
## 6        신당  442928  463831
## 7    상왕십리  352160  332642
## 8      왕십리  495261  419840
## 9      한양대  522266  599709
## 10      뚝 섬  532206  544641
## 11      성 수  642477  707396
## 12      건 대 1428078 1520336
## 13      구 의  780889  762523
## 14      강 변 1958187 1900567
## 15      성 내  499059  480137
## 16      잠 실 2491504 2255165
## 17      신 천  665070  619381
## 18 종합운동장  605066  661105
## 19      삼 성 2153134 2247358
## 20      선 릉 1893532 1814455
## 21      역 삼 1465665 1631149
## 22      강 남 2910770 3105252
## 23       교대 1272399 1404885
## 24      서 초  536997  570435
## 25      방 배  676591  683174
## 26       사당 1289718 1414463
## 27     낙성대  950148  911596
## 28     서울대 1617695 1624029
## 29      봉 천  776388  696755
## 30      신 림 2317361 2211588
## 31     신대방  757285  742847
## 32   구로공단 1730707 1756161
## 33      대 림 1020034 1041978
## 34     신도림 1560014 1495022
## 35      문 래  574743  569716
## 36   영등포구  667523  670321
## 37      당 산 1247059 1189170
## 38      합 정  846368  744507
## 39      홍 대 1628211 1718136
## 40      신 촌 1710438 1842452
## 41      이 대  830310  844680
## 42      아 현  429476  405023
## 43     충정로  352046  375926
## 44      용 답   89195   95077
## 45      신 답   64567   71482
## 46     신설동  156180  146884
## 47     도림천   19925   24224
## 48   양천구청  268598  274738
## 49   신정네거  414133  408989
## 50     용두역   63309   74553
# Scale the raw counts to units of 10,000 for plotting.
getIn <- f$getIn / 10000
getOut <- f$getOut / 10000

# y-axis limits: from 0 up to the largest value in either series.
yrange <- range(0, getIn, getOut)
yrange
## [1]   0.0000 310.5252
# Axes are suppressed here and drawn by hand so the x axis can carry station names.
plot(getIn, xlab = "", ylab = "", ylim = yrange, axes = FALSE, type = "o",
     col = "red", main = "2호선 역별 승하차인원(만명)")

# Tick positions are derived from the data instead of a hard-coded 1:50, so
# axis() does not fail with an at/labels length mismatch if the row count
# changes; `labels` is spelled out rather than partially matched via `lab`.
axis(1, at = seq_along(f$station), labels = f$station, las = 2)
axis(2, las = 1)

# Dashed grid: horizontal every 50, vertical every 5th station.
abline(h = seq(50, 300, by = 50), v = seq(5, 45, by = 5), lty = 2)

lines(getOut, col = "blue", type = "o")

# Legend colours follow the series order drawn above: getIn red, getOut blue.
colors <- c("red", "blue")
legend(35, 300, c("승차", "하차"), cex = 0.8, col = colors, lty = 1, lwd = 2)

# 6. Line chart: two cumulative-rate series from the noodle table

noodle <- read.csv("data/inflation_ramyon_rate.csv", header = TRUE, sep = ",",
                   fileEncoding = "CP949")

# ann = FALSE suppresses the default main/axis titles; they are added with
# title() below.
plot(noodle$년도, noodle$누적물가상승율, type = "o", ylim = c(-3, 1200),
     ann = FALSE, col = "red", lwd = 2)
# The second series is added to the existing plot with lines(). It shares the
# same x values and y scale, so no par(new = TRUE) + second plot() overlay is needed.
lines(noodle$년도, noodle$누적상승율, type = "o", col = "blue", lwd = 2)

title(main = "물가상승률 및 라면값 상승율 비교")
title(xlab = "년도", col.lab = "blue")
title(ylab = "누적상승율(단위:%)", col.lab = "red")

# Light dashed grid: horizontal every 50, vertical at every year 1980-2015.
abline(h = seq(50, 1200, 50), col = "gray", lty = 2, lwd = 0.5)
abline(v = seq(1980, 2015, 1), col = "gray", lty = 2, lwd = 0.5)

# Legend colours follow the drawing order above (red first, blue second).
# bg = "white" gives the opaque background over the grid; the former
# fill = "white" is dropped because `fill` draws a box beside every entry,
# which put an empty square next to each line key.
colors <- c("red", "blue")
legend(1982, 1150, c("물가상승율", "라면값상승율"), cex = 0.8, col = colors,
       lty = 1, lwd = 2, bg = "white")