# 정형 데이터 분석 (Structured data analysis)
# 1. Barplot
# 1-1. Comparing land area and population density by nation
# Load the per-nation table (comma-separated).
d <- read.csv(file = "data/nations_land_area.csv", sep = ",")
# Column names of the loaded table.
colnames(d)
## [1] "Nation" "LandArea" "PopDensity"
# Number of rows followed by number of columns.
dim(d)
## [1] 17 3
# Compact per-column summary of types and leading values.
str(d)
## 'data.frame': 17 obs. of 3 variables:
## $ Nation : Factor w/ 17 levels "Austrailia","Bangladesh",..: 11 2 16 10 17 3 5 7 8 9 ...
## $ LandArea : int 99828 144000 36190 377910 9632030 9984670 43090 551500 357050 301340 ...
## $ PopDensity: int 485 985 632 339 31 3 126 110 232 193 ...
# Show the rows ordered by nation name.
d[order(d[["Nation"]]), ]
## Nation LandArea PopDensity
## 15 Austrailia 7741220 3
## 2 Bangladesh 144000 985
## 6 Canada 9984670 3
## 17 Chaina 9598088 140
## 7 Denmark 43090 126
## 14 England 243610 246
## 8 France 551500 110
## 9 Germany 357050 232
## 10 Italy 301340 193
## 4 Japan 377910 339
## 1 Korea 99828 485
## 11 Netherlands 41530 392
## 16 NewZealand 267710 15
## 12 Sweden 450290 20
## 13 Swiss 41280 176
## 3 Taiwan 36190 632
## 5 USA 9632030 31
# Show the rows ordered by land area, largest first.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
d[order(d$LandArea, decreasing = TRUE), ]
## Nation LandArea PopDensity
## 6 Canada 9984670 3
## 5 USA 9632030 31
## 17 Chaina 9598088 140
## 15 Austrailia 7741220 3
## 8 France 551500 110
## 12 Sweden 450290 20
## 4 Japan 377910 339
## 9 Germany 357050 232
## 10 Italy 301340 193
## 16 NewZealand 267710 15
## 14 England 243610 246
## 2 Bangladesh 144000 985
## 1 Korea 99828 485
## 7 Denmark 43090 126
## 11 Netherlands 41530 392
## 13 Swiss 41280 176
## 3 Taiwan 36190 632
# Print the nation column as loaded.
d$Nation
## [1] Korea Bangladesh Taiwan Japan USA
## [6] Canada Denmark France Germany Italy
## [11] Netherlands Sweden Swiss England Austrailia
## [16] NewZealand Chaina
## 17 Levels: Austrailia Bangladesh Canada Chaina Denmark England ... USA
# gsub(pattern = " ", replacement = "", x = d$Nation)  # replace characters matching a pattern
# Add a new column: nations above 500,000 land-area units go to Group_A, the rest to Group_B.
d$Group <- ifelse(d$LandArea > 500000, "Group_A", "Group_B")
# d[d$PopDensity > 500, ]$PopDensity <- 500  # example: overwrite column values (disabled)
# Frequency of each distinct population-density value.
table(d$PopDensity)
##
## 3 15 20 31 110 126 140 176 193 232 246 339 392 485 632 985
## 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# Number of nations per land-area group.
counts <- table(d$Group)
counts
##
## Group_A Group_B
## 5 12
barplot(counts, main = "Land Area Group", xlab = "500,000(km2) break")
# barplot() returns the bar midpoints; they are reused to position the value labels.
# as.character() keeps the bar labels correct whether Nation was read as factor or character.
bp <- barplot(d$PopDensity, names.arg = as.character(d$Nation),
              main = "Population Density", col = "lightcyan",
              ylim = c(0, 1000), cex.names = 0.7, las = 2)
# Place each value slightly below the top of its bar.
text(x = bp, y = d$PopDensity * 0.9, labels = d$PopDensity, col = "red", cex = 0.7)
# Sort : PopDensity (highest density first)
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
data <- d[order(d$PopDensity, decreasing = TRUE), ]
data
## Nation LandArea PopDensity Group
## 2 Bangladesh 144000 985 Group_B
## 3 Taiwan 36190 632 Group_B
## 1 Korea 99828 485 Group_B
## 11 Netherlands 41530 392 Group_B
## 4 Japan 377910 339 Group_B
## 14 England 243610 246 Group_B
## 9 Germany 357050 232 Group_B
## 10 Italy 301340 193 Group_B
## 13 Swiss 41280 176 Group_B
## 17 Chaina 9598088 140 Group_A
## 7 Denmark 43090 126 Group_B
## 8 France 551500 110 Group_A
## 5 USA 9632030 31 Group_A
## 12 Sweden 450290 20 Group_B
## 16 NewZealand 267710 15 Group_B
## 6 Canada 9984670 3 Group_A
## 15 Austrailia 7741220 3 Group_A
# Bar plot of population density using the rows of `data` in their current order.
# barplot() returns the bar midpoints, reused below to position the value labels.
bp <- barplot(data$PopDensity, names.arg = as.character(data$Nation),
              main = "Population Density", col = "lightcyan",
              ylim = c(0, 1000), cex.names = 0.7, las = 2)
# Place each value slightly below the top of its bar.
text(x = bp, y = data$PopDensity * 0.9, labels = data$PopDensity, col = "red", cex = 0.7)
# 1-2. Public transportation usage by mode
# Load the usage table, keeping text columns as plain character vectors.
d <- read.csv(
  file = "data/usage_public_transportation.csv",
  sep = ",",
  stringsAsFactors = FALSE
)
# Column names of the loaded table.
colnames(d)
## [1] "YEAR" "SEP" "LINE" "M01" "M02" "M03" "M04" "M05" "M06" "M07"
## [11] "M08" "M09" "M10" "M11" "M12"
# Number of rows followed by number of columns.
dim(d)
## [1] 9 15
# Show the rows ordered by the LINE column.
d[order(d[["LINE"]]), ]
## YEAR SEP LINE M01 M02 M03 M04 M05 M06
## 1 2012 subway LINE1 8341518 8255019 8830598 8306279 8856979 8208757
## 2 2012 subway LINE2 41414219 42585260 46188844 43828003 46251012 43411868
## 3 2012 subway LINE3 14661057 15119468 16547899 15623045 16595063 15635350
## 4 2012 subway LINE4 16402892 16762228 18931723 17958485 19240358 17674972
## 5 2012 subway LINE5 15534075 16002917 17586360 17084626 17854814 16618881
## 6 2012 subway LINE6 8409574 8699279 10071527 9644924 10368294 9505216
## 7 2012 subway LINE7 16873347 17260174 19304031 18509664 19527496 18149425
## 8 2012 subway LINE8 4294660 4394818 4917996 4729974 4945049 4577395
## 9 2012 subway LINE9 5810404 5940340 6471277 6536712 6600419 6337783
## M07 M08 M09 M10 M11 M12
## 1 8387807 7879764 8145123 8645140 8827242 8605526
## 2 43857337 41621530 42609915 43897523 45093740 44722411
## 3 15707681 14685419 15346680 16089309 16384142 16265496
## 4 17218709 15850045 16850098 18155851 18494284 18242950
## 5 16558607 15480280 16041029 17104350 17352632 17250110
## 6 9223713 8666136 9343660 10002422 10010472 9596428
## 7 17872236 16833751 17708993 18880841 20721677 20374095
## 8 4485859 4207592 4456877 4651580 4746982 4794026
## 9 6491018 6177376 6359540 6801879 6853405 6814260
# Values are divided by one million so the axes and labels stay readable.
unitNumber <- 1000000
# M01 column scaled to millions; computed once and reused by both plots.
jan_usage <- d$M01 / unitNumber
# Vertical Barplot
# barplot() returns the bar midpoints, reused to position the value labels.
bp <- barplot(jan_usage, ylim = c(0, 50),
              names.arg = d$LINE, main = "Seoul Subway Usage per Line(January)",
              col = "lightcyan", cex.names = 0.7, las = 3,
              ylab = "people(/million)", xlab = "<Line>")
# Labels sit just below the top of each bar.
text(x = bp, y = jan_usage * 0.95, labels = round(jan_usage, 2), col = "red", cex = 0.7)
# Horizontal Barplot
bp <- barplot(jan_usage, xlim = c(0, 50),
              names.arg = d$LINE, main = "Seoul Subway Usage per Line(January)",
              col = "gray", cex.names = 0.7, las = 1,
              xlab = "people(/million)", ylab = "<Line>",
              horiz = TRUE)
# With horizontal bars the midpoints are y positions; labels sit just past the bar ends.
text(y = bp, x = jan_usage + 2, labels = round(jan_usage, 2), col = "red", cex = 0.7)
# 3. Line Chart : plot / type="o"
# The workspace is deliberately not cleared here: `rm(list = ls())` would delete
# every object in the user's session, not just the ones this script created.
# The CSV is read with fileEncoding = "CP949" so the Korean text decodes correctly.
f <- read.csv("data/passengers_line2_station.csv", sep = ",",
              stringsAsFactors = FALSE, fileEncoding = "CP949")
# Preview the first rows.
head(f)
## rank station y2013 y2012 y2011
## 1 1 강남 214355 207475 206712
## 2 2 잠실 150136 148506 149580
## 3 3 신림 146107 145983 147919
## 4 4 홍대입구 137019 126994 123841
## 5 5 신도림 130389 125671 117425
## 6 6 삼성 128295 138099 143675
# Full-length yearly columns (not used by the plot below; kept in case later code reads them).
a <- f$y2011
b <- f$y2012
c <- f$y2013
# The chart is restricted to the first 25 rows of the table.
l <- head(f$station, 25)
x <- head(f$y2011, 25)
y <- head(f$y2012, 25)
z <- head(f$y2013, 25)
# Positions follow the actual number of rows taken, so a shorter table still lines up.
station_idx <- seq_along(l)
# Draw the y2011 series without axes; custom axes are added afterwards.
plot(x, xlab = "", ylab = "", ylim = c(0, 250000), axes = FALSE, type = "o",
     col = "red", main = "2호선역 일평균 승객수(명)")
# `labels` is spelled out in full rather than relying on partial matching of `lab`.
axis(1, at = station_idx, labels = l, las = 2)
axis(2, las = 1)
# Dashed grid: one horizontal line per 50,000 and one vertical line per station.
abline(h = seq(50000, 250000, by = 50000), v = station_idx, lty = 2)
lines(y, col = "blue", type = "o")
lines(z, col = "green", type = "o")
colors <- c("red", "blue", "green")
# Legend position (user coordinates) and colours, in the order the series were assigned above.
legend(5, 220000, c("2011년", "2012년", "2013년"), cex = 0.8, col = colors, lty = 1, lwd = 2)
# 4. Barplot : as.matrix
gangnam <- read.csv("data/line2_gangnam.csv")
gangnam
## T05.06 T06.07 T07.08 T08.09 T09.10 T10.11 T11.12 T12.13 T13.14 T14.15
## 1 15916 29785 59817 88993 71391 73463 95645 111468 145307 141569
## 2 13122 90820 216887 412832 304197 157500 134063 127664 153040 149648
## T15.16 T16.17 T17.18 T18.19 T19.20 T20.21 T21.22 T22.23 T23.24 T24.01
## 1 162306 174384 228344 343805 265809 209983 258682 284400 130027 19676
## 2 141022 151285 187498 270297 240648 127347 85620 73533 50950 17279
# A data frame is stored as a list, so it is converted to a matrix before plotting.
mode(gangnam)
## [1] "list"
as.matrix(gangnam)
## T05.06 T06.07 T07.08 T08.09 T09.10 T10.11 T11.12 T12.13 T13.14 T14.15
## [1,] 15916 29785 59817 88993 71391 73463 95645 111468 145307 141569
## [2,] 13122 90820 216887 412832 304197 157500 134063 127664 153040 149648
## T15.16 T16.17 T17.18 T18.19 T19.20 T20.21 T21.22 T22.23 T23.24 T24.01
## [1,] 162306 174384 228344 343805 265809 209983 258682 284400 130027 19676
## [2,] 141022 151285 187498 270297 240648 127347 85620 73533 50950 17279
# One colour per matrix row, shared by the bars and the legend so they always agree.
# gray.colors(nrow) is the palette barplot() uses by default for a matrix, so the
# bars look the same as before; previously the legend showed black/white swatches
# that did not match them.
bar_colors <- gray.colors(nrow(gangnam))
# Grouped bars (beside = TRUE), values scaled to thousands.
barplot(as.matrix(gangnam) / 1000, main = "강남역 시간대별 승하차 현황",
        ylab = "인원수(천명)", beside = TRUE, las = 2, ylim = c(0, 500),
        col = bar_colors)
#abline(h=seq(3,400,10), col="white", lwd=2)
# Dashed horizontal guides every 50 units.
abline(h = seq(50, 450, by = 50), lty = 2)
# NOTE(review): assumes row 1 is boarding and row 2 is alighting - confirm against the CSV.
legend("topright", c("승차", "하차"), cex = 0.8, fill = bar_colors)
# 5. Line Chart : plot / type="o"
# Load the per-station totals; the file is decoded as CP949 and text stays character.
f <- read.csv(
  file = "data/passengers_seoulmetro_line2_total.csv",
  sep = ",",
  stringsAsFactors = FALSE,
  fileEncoding = "CP949"
)
# Show the whole table.
f
## station getIn getOut
## 1 시 청 780471 865364
## 2 을지입구 1425862 1438643
## 3 을지3 449109 480325
## 4 을지4가 411232 419762
## 5 동운 623871 707384
## 6 신당 442928 463831
## 7 상왕십리 352160 332642
## 8 왕십리 495261 419840
## 9 한양대 522266 599709
## 10 뚝 섬 532206 544641
## 11 성 수 642477 707396
## 12 건 대 1428078 1520336
## 13 구 의 780889 762523
## 14 강 변 1958187 1900567
## 15 성 내 499059 480137
## 16 잠 실 2491504 2255165
## 17 신 천 665070 619381
## 18 종합운동장 605066 661105
## 19 삼 성 2153134 2247358
## 20 선 릉 1893532 1814455
## 21 역 삼 1465665 1631149
## 22 강 남 2910770 3105252
## 23 교대 1272399 1404885
## 24 서 초 536997 570435
## 25 방 배 676591 683174
## 26 사당 1289718 1414463
## 27 낙성대 950148 911596
## 28 서울대 1617695 1624029
## 29 봉 천 776388 696755
## 30 신 림 2317361 2211588
## 31 신대방 757285 742847
## 32 구로공단 1730707 1756161
## 33 대 림 1020034 1041978
## 34 신도림 1560014 1495022
## 35 문 래 574743 569716
## 36 영등포구 667523 670321
## 37 당 산 1247059 1189170
## 38 합 정 846368 744507
## 39 홍 대 1628211 1718136
## 40 신 촌 1710438 1842452
## 41 이 대 830310 844680
## 42 아 현 429476 405023
## 43 충정로 352046 375926
## 44 용 답 89195 95077
## 45 신 답 64567 71482
## 46 신설동 156180 146884
## 47 도림천 19925 24224
## 48 양천구청 268598 274738
## 49 신정네거 414133 408989
## 50 용두역 63309 74553
# Scale both series by 10,000 (the plot title labels the unit as 만명).
getIn <- f$getIn / 10000
getOut <- f$getOut / 10000
# Y-axis range: from 0 up to the largest value in either series.
yrange <- range(0, getIn, getOut)
yrange
## [1] 0.0000 310.5252
# Draw the getIn series without axes; custom axes are added afterwards.
plot(getIn, xlab = "", ylab = "", ylim = yrange, axes = FALSE, type = "o",
     col = "red", main = "2호선 역별 승하차인원(만명)")
# Tick positions follow the number of stations instead of a hard-coded 1:50, and
# `labels` is spelled out in full rather than relying on partial matching of `lab`.
axis(1, at = seq_along(f$station), labels = f$station, las = 2)
axis(2, las = 1)
# Dashed grid lines.
abline(h = c(50, 100, 150, 200, 250, 300), v = c(5, 10, 15, 20, 25, 30, 35, 40, 45), lty = 2)
lines(getOut, col = "blue", type = "o")
colors <- c("red", "blue")
# Legend order matches the series colours: getIn is red, getOut is blue.
legend(35, 300, c("승차", "하차"), cex = 0.8, col = colors, lty = 1, lwd = 2)
# 6. Line chart: two cumulative rate columns from the inflation/ramyon table, by year
noodle <- read.csv("data/inflation_ramyon_rate.csv", header = TRUE, sep = ",",
                   fileEncoding = "CP949")
# ann = FALSE suppresses the default axis titles; they are added with title() below.
plot(noodle$년도, noodle$누적물가상승율, type = "o", ylim = c(-3, 1200),
     ann = FALSE, col = "red", lwd = 2)
# The second series is added to the existing plot with lines(). Both series share
# the same x values and y limits, so this draws the same picture as the earlier
# par(new = ...) overlay without re-initialising the plot region.
lines(noodle$년도, noodle$누적상승율, type = "o", col = "blue", lwd = 2)
title(main = "물가상승률 및 라면값 상승율 비교")
title(xlab = "년도", col.lab = "blue")
title(ylab = "누적상승율(단위:%)", col.lab = "red")
# Light dashed grid: horizontal every 50, vertical at every x value from 1980 to 2015.
abline(h = seq(50, 1200, 50), col = "gray", lty = 2, lwd = 0.5)
abline(v = seq(1980, 2015, 1), col = "gray", lty = 2, lwd = 0.5)
colors <- c("red", "blue")
# NOTE(review): `fill = "white"` adds a white box beside each legend line in addition
# to the white background set by `bg`; confirm whether those boxes are intended.
legend(1982, 1150, c("물가상승율", "라면값상승율"), cex = 0.8, col = colors,
       lty = 1, lwd = 2, fill = "white", bg = "white")