# Load the MVA package and run its chapter demo script.
# NOTE(review): later code uses `measure` and `USairpollution` without
# defining them; presumably the demo / MVA's dependencies provide them - confirm.
library(MVA)
demo("Ch-MVA")
# Reset the graphics device to a single plotting panel.
par(mfcol=c(1,1))
# Covariance, correlation, and distance
# Can structure or patterns in the data emerge from the relationships between
# variables, or from the relative distances between observations?

# The correlation coefficient measures the linear dependence between two
# random variables.
# If the relationship between two variables is not linear, the correlation
# coefficient can be misleading.

# Print the raw body-measurement data.
measure
##    chest waist hips gender
## 1     34    30   32   male
## 2     37    32   37   male
## 3     38    30   36   male
## 4     36    33   39   male
## 5     38    29   33   male
## 6     43    32   38   male
## 7     40    33   42   male
## 8     38    30   40   male
## 9     40    30   37   male
## 10    41    32   39   male
## 11    36    24   35 female
## 12    36    25   37 female
## 13    34    24   37 female
## 14    33    22   34 female
## 15    36    26   38 female
## 16    37    26   37 female
## 17    34    25   38 female
## 18    36    26   37 female
## 19    38    28   40 female
## 20    35    23   35 female
# Per-column summaries of the measurements plus the gender counts.
summary(measure)
##      chest           waist           hips          gender  
##  Min.   :33.00   Min.   :22.0   Min.   :32.00   male  :10  
##  1st Qu.:35.75   1st Qu.:25.0   1st Qu.:35.75   female:10  
##  Median :36.50   Median :28.5   Median :37.00              
##  Mean   :37.00   Mean   :28.0   Mean   :37.05              
##  3rd Qu.:38.00   3rd Qu.:30.5   3rd Qu.:38.25              
##  Max.   :43.00   Max.   :33.0   Max.   :42.00
# Keep the three numeric measurements, then split the rows by gender.
df <- measure[, c("chest", "waist", "hips")]
male <- df[measure$gender == "male", ]
female <- df[measure$gender == "female", ]

# Pearson correlation matrices: everyone, men only, women only.
cor(df, method = "pearson")
##           chest     waist      hips
## chest 1.0000000 0.6987336 0.4778004
## waist 0.6987336 1.0000000 0.4147413
## hips  0.4778004 0.4147413 1.0000000
cor(male, method = "pearson")
##           chest     waist      hips
## chest 1.0000000 0.2513682 0.4976828
## waist 0.2513682 1.0000000 0.6947857
## hips  0.4976828 0.6947857 1.0000000
cor(female, method = "pearson")
##           chest     waist      hips
## chest 1.0000000 0.8303889 0.5885679
## waist 0.8303889 1.0000000 0.9101668
## hips  0.5885679 0.9101668 1.0000000
# Scatterplot matrices for each gender.
library(psych)
pairs.panels(male)

pairs.panels(female)

# The notion of distance between observations in the data is therefore
# important.
# Euclidean distance is the usual choice.
# The variables are on different scales (units), so standardise them before
# computing distances.

# Standardise every column to mean 0 and standard deviation 1.
# Only this centred-and-scaled version is used downstream, so it is the only
# one computed.
scale_measure <- scale(df)

scale_measure
##            chest      waist        hips
##  [1,] -1.1649647  0.5650909 -2.07121442
##  [2,]  0.0000000  1.1301818 -0.02050707
##  [3,]  0.3883216  0.5650909 -0.43064854
##  [4,] -0.3883216  1.4127273  0.79977587
##  [5,]  0.3883216  0.2825455 -1.66107295
##  [6,]  2.3299295  1.1301818  0.38963440
##  [7,]  1.1649647  1.4127273  2.03020027
##  [8,]  0.3883216  0.5650909  1.20991733
##  [9,]  1.1649647  0.5650909 -0.02050707
## [10,]  1.5532863  1.1301818  0.79977587
## [11,] -0.3883216 -1.1301818 -0.84079001
## [12,] -0.3883216 -0.8476364 -0.02050707
## [13,] -1.1649647 -1.1301818 -0.02050707
## [14,] -1.5532863 -1.6952727 -1.25093148
## [15,] -0.3883216 -0.5650909  0.38963440
## [16,]  0.0000000 -0.5650909 -0.02050707
## [17,] -1.1649647 -0.8476364  0.38963440
## [18,] -0.3883216 -0.5650909 -0.02050707
## [19,]  0.3883216  0.0000000  1.20991733
## [20,] -0.7766432 -1.4127273 -0.84079001
## attr(,"scaled:center")
## chest waist  hips 
## 37.00 28.00 37.05 
## attr(,"scaled:scale")
##    chest    waist     hips 
## 2.575185 3.539254 2.438183
# Column summaries after standardisation.
summary(scale_measure)
##      chest             waist              hips         
##  Min.   :-1.5533   Min.   :-1.6953   Min.   :-2.07121  
##  1st Qu.:-0.4854   1st Qu.:-0.8476   1st Qu.:-0.53318  
##  Median :-0.1942   Median : 0.1413   Median :-0.02051  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.3883   3rd Qu.: 0.7064   3rd Qu.: 0.49217  
##  Max.   : 2.3299   Max.   : 1.4127   Max.   : 2.03020
# Scatterplot matrix of the standardised measurements.
pairs.panels(scale_measure)

# Pairwise Euclidean distance matrix, rounded to two decimals.
round(dist(scale_measure, method = "euclidean"), digits = 2)
##       1    2    3    4    5    6    7    8    9   10   11   12   13   14
## 2  2.43                                                                 
## 3  2.26 0.80                                                            
## 4  3.09 0.95 1.68                                                       
## 5  1.63 1.89 1.26 2.82                                                  
## 6  4.31 2.37 2.18 2.76 2.95                                             
## 7  4.79 2.38 2.72 1.98 3.94 2.03                                        
## 8  3.63 1.41 1.64 1.22 2.88 2.18 1.41                                   
## 9  3.10 1.29 0.88 1.95 1.84 1.36 2.22 1.46                              
## 10 3.99 1.76 1.79 1.96 2.85 0.88 1.32 1.36 1.07                         
## 11 2.23 2.44 1.91 3.03 1.81 3.74 4.14 2.77 2.44 3.40                    
## 12 2.61 2.02 1.66 2.40 2.14 3.39 3.42 2.03 2.10 2.89 0.87               
## 13 2.66 2.54 2.34 2.78 2.66 4.18 4.01 2.61 2.88 3.63 1.13 0.83          
## 14 2.44 3.45 3.09 3.90 2.80 5.07 5.27 3.86 3.74 4.67 1.36 1.89 1.41     
## 15 2.82 1.79 1.60 2.02 2.35 3.20 3.00 1.60 1.96 2.61 1.35 0.50 1.04 2.31
## 16 2.62 1.70 1.26 2.18 1.89 2.91 3.08 1.72 1.62 2.44 1.07 0.48 1.29 2.28
## 17 2.84 2.33 2.25 2.43 2.81 4.02 3.64 2.25 2.76 3.39 1.48 0.88 0.50 1.89
## 18 2.47 1.74 1.43 2.14 2.00 3.23 3.24 1.84 1.92 2.70 1.00 0.28 0.96 2.04
## 19 3.67 1.72 1.74 1.66 2.88 2.39 1.81 0.57 1.56 1.67 2.47 1.68 2.28 3.56
## 20 2.36 2.78 2.33 3.29 2.21 4.20 4.47 3.08 2.89 3.82 0.48 1.07 0.95 0.92
##      15   16   17   18   19
## 16 0.56
## 17 0.83 1.27               
## 18 0.41 0.39 0.92          
## 19 1.26 1.41 1.95 1.56     
## 20 1.54 1.41 1.41 1.24 2.75
# Finding outliers
df <- USairpollution

head(df, n = 6L)
##             SO2 temp manu popul wind precip predays
## Albany       46 47.6   44   116  8.8  33.36     135
## Albuquerque  11 56.8   46   244  8.9   7.77      58
## Atlanta      24 61.5  368   497  9.1  48.34     115
## Baltimore    47 55.0  625   905  9.6  41.31     111
## Buffalo      11 47.1  391   463 12.4  36.11     166
## Charleston   31 55.2   35    71  6.5  40.75     148
# Scatterplot matrix of all pollution variables.
pairs.panels(df)

library(DMwR)
## Loading required package: grid
# One outlier score per row (city), computed with k = 5.
# NOTE(review): the scores are computed on the raw, unscaled columns; if
# lofactor() is distance based, large-valued columns such as manu and popul
# would dominate - confirm whether scaling is wanted here.
outlier.score <- lofactor(df, k = 5)
outlier.score
##  [1] 1.0185135 0.9947553 0.9767661 1.0744590 1.0025476 1.1018627 6.7308161
##  [8] 1.0631965 1.4999312 1.0524985 1.0591652 1.0515998 0.9684696 2.4217025
## [15] 2.1712077 1.8285197 1.1545934 1.3550799 1.0211214 1.0560957 1.1030826
## [22] 1.1598072 1.0161363 1.0511851 1.0705834 1.1054748 1.0158912 0.9708327
## [29] 0.9904630 3.6374196 1.2199493 0.9784709 1.8761657 0.9828882 0.9796012
## [36] 1.2746045 0.9450678 1.1737022 1.2568747 1.0110542 1.0315732
# Distribution of the outlier scores.
plot(density(outlier.score), main = "outlier score")

# The ten largest scores, in descending order.
head(sort(outlier.score, decreasing = TRUE), 10)
##  [1] 6.730816 3.637420 2.421702 2.171208 1.876166 1.828520 1.499931
##  [8] 1.355080 1.274604 1.256875
# Treat an observation as an outlier when its score exceeds 2.2.
# The set is derived from that cut-off (most extreme first) so the code stays
# in step with the stated rule instead of hard-coding "the top three".
lof_cutoff <- 2.2
score_rank <- order(outlier.score, decreasing = TRUE)
outliers <- score_rank[which(outlier.score[score_rank] > lof_cutoff)]
outliers
## [1]  7 30 14
df[outliers, ]
##              SO2 temp manu popul wind precip predays
## Chicago      110 50.6 3344  3369 10.4  34.44     122
## Philadelphia  69 54.6 1692  1950  9.6  39.93     115
## Detroit       35 49.9 1064  1513 10.1  30.96     129
# Correlations of the data with the outliers removed.
# The rows to keep are selected explicitly: df[-outliers, ] would return zero
# rows if no score exceeded the cut-off.
pairs.panels(df[setdiff(seq_len(nrow(df)), outliers), ])

# Visualising multivariate data
# NOTE(review): attach() puts the columns of USairpollution on the search
# path, making them visible as bare names (SO2, temp, manu, ...) to any later
# code that uses them; it is never detached in this script.
attach(USairpollution)
head(USairpollution)
##             SO2 temp manu popul wind precip predays
## Albany       46 47.6   44   116  8.8  33.36     135
## Albuquerque  11 56.8   46   244  8.9   7.77      58
## Atlanta      24 61.5  368   497  9.1  48.34     115
## Baltimore    47 55.0  625   905  9.6  41.31     111
## Buffalo      11 47.1  391   463 12.4  36.11     166
## Charleston   31 55.2   35    71  6.5  40.75     148
# 1. Scatterplot

plot(popul ~ manu, data = USairpollution)

# 1-1. Bivariate plot with marginal distributions

# Split the device into three regions: scatterplot (1) bottom-left,
# histogram (2) above it, boxplot (3) to its right.
layer <- matrix(c(2, 0, 1, 3), nrow = 2, byrow = TRUE)
layout(layer, widths = c(2, 1), heights = c(1, 2), respect = TRUE)
# Draw into the regions in order. Columns are looked up in the data frame via
# with() instead of relying on attach()'d globals; with() keeps the bare
# column names in the calls, so default axis labels are unchanged.
xlim <- with(USairpollution, range(manu)) * 1.1
plot(popul ~ manu, data = USairpollution, cex = 0.9, type = "n", xlim = xlim)
with(USairpollution,
     text(manu, popul, cex = 0.6, labels = row.names(USairpollution)))
with(USairpollution, hist(manu, main = "", xlim = xlim))
with(USairpollution, boxplot(popul))

# Return to a single plotting panel.
par(mfcol = c(1, 1))

# Compare the correlation with and without the outliers
df <- USairpollution
# Label the rows 1..n so the numbers drawn on the plot can be used as indices.
rownames(df) <- seq_len(nrow(df))
plot(popul ~ manu, data = df, cex = 0.9, type = "n")
# Take the coordinates from df itself so the labels always match the frame
# being plotted, with no dependence on attach()'d copies of the columns.
text(df$manu, df$popul, cex = 0.9, labels = rownames(df))

outlier <- c(7, 30, 14)     # Chicago, Philadelphia, Detroit
cor(df$manu, df$popul)                      # 0.955
## [1] 0.9552693
cor(df$manu[-outlier], df$popul[-outlier])  # 0.769: the correlation actually drops
## [1] 0.7698125
# 2. Box plot

head(chickwts)   # measures the effect of various feed supplements on chick growth
##    weight      feed
## 1     179 horsebean
## 2     160 horsebean
## 3     136 horsebean
## 4     227 horsebean
## 5     217 horsebean
summary(chickwts)
##      weight             feed   
##  Min.   :108.0   casein   :12  
##  1st Qu.:204.5   horsebean:10  
##  Median :258.0   linseed  :12  
##  Mean   :261.3   meatmeal :11  
##  3rd Qu.:323.5   soybean  :14  
##  Max.   :423.0   sunflower:12
# One box per feed type.
boxplot(weight ~ feed, data = chickwts)

# casein vs horsebean: centre and range differ a lot, so the two feeds appear
# to produce clearly different weights.
# linseed vs meatmeal: the centres differ, but the ranges are similar, so it is
# hard to claim a real difference from the plot alone.
# (A box plot compares distributions between groups; it does not by itself
# establish statistical independence.)
# 3. Bubble chart - displays three variables at once

# USairpollution: SO2 is encoded as the circle size.
# Columns are resolved through with() rather than attach()'d globals; the bare
# names inside the calls keep the default axis labels as "temp" and "wind".
with(USairpollution, symbols(temp, wind, circles = SO2, inches = 0.5))
legend(70, 13, "circle : SO2")

# Doubling the size quadruples the area.
# A sqrt transform compensates, but the circles end up similar in size and are
# harder to tell apart.
with(USairpollution, symbols(temp, wind, circles = sqrt(SO2), inches = 0.5))

# Extend the lower y-limit slightly, then overlay the bubbles on the points.
ylim <- with(USairpollution, range(wind)) * c(0.95, 1)
plot(wind ~ temp, data = USairpollution, pch = 10, ylim = ylim)
with(USairpollution,
     symbols(temp, wind, circles = SO2, inches = 0.5, add = TRUE))

# 4. Mosaic plot

UCBAdmissions   # Aggregate data on applicants to the Berkeley graduate school in 1973.
## , , Dept = A
## 
##           Gender
## Admit      Male Female
##   Admitted  512     89
##   Rejected  313     19
## 
## , , Dept = B
## 
##           Gender
## Admit      Male Female
##   Admitted  353     17
##   Rejected  207      8
## 
## , , Dept = C
## 
##           Gender
## Admit      Male Female
##   Admitted  120    202
##   Rejected  205    391
## 
## , , Dept = D
## 
##           Gender
## Admit      Male Female
##   Admitted  138    131
##   Rejected  279    244
## 
## , , Dept = E
## 
##           Gender
## Admit      Male Female
##   Admitted   53     94
##   Rejected  138    299
## 
## , , Dept = F
## 
##           Gender
## Admit      Male Female
##   Admitted   22     24
##   Rejected  351    317
adm <- UCBAdmissions
# Admit x Gender counts for department A only.
adm[, , "A"]
##           Gender
## Admit      Male Female
##   Admitted  512     89
##   Rejected  313     19
# Admit x Dept counts for male applicants only.
adm[, "Male", ]
##           Dept
## Admit        A   B   C   D   E   F
##   Admitted 512 353 120 138  53  22
##   Rejected 313 207 205 279 138 351
# Department/gender mosaics, with either factor as the leading split.
mosaicplot(~ Dept + Gender, data = adm, color = TRUE)

mosaicplot(~ Gender + Dept, data = adm, color = TRUE)

library(reshape)
# Long format: one row per Admit x Gender x Dept cell, count in `value`.
ucb <- melt(UCBAdmissions)
head(ucb)
##      Admit Gender Dept value
## 1 Admitted   Male    A   512
## 2 Rejected   Male    A   313
## 3 Admitted Female    A    89
## 4 Rejected Female    A    19
## 5 Admitted   Male    B   353
## 6 Rejected   Male    B   207
# Total counts by department and gender (summed over Admit).
df <- xtabs(value ~ Dept + Gender, data = ucb)
mosaicplot(~ Dept + Gender, data = df, color = TRUE)

# Total counts by department and admission outcome (summed over Gender).
df2 <- xtabs(value ~ Dept + Admit, data = ucb)
mosaicplot(~ Dept + Admit, data = df2, color = TRUE)

# 5. Star plot

# Default star plot, one star per row.
stars(USairpollution)

stars(USairpollution, key.loc = c(15, 2), cex = 0.8) # show legend

# Same layout drawn with segments instead of stars.
stars(USairpollution, key.loc = c(15, 2), cex = 0.8, draw.segments = TRUE)

# 6. Heat map: a cluster analysis is carried out alongside it.

bball <- read.csv(file = "http://datasets.flowingdata.com/ppg2008.csv")
head(bball, n = 6L)
##             Name  G  MIN  PTS  FGM  FGA   FGP FTM FTA   FTP X3PM X3PA
## 1   Dwyane Wade  79 38.6 30.2 10.8 22.0 0.491 7.5 9.8 0.765  1.1  3.5
## 2  LeBron James  81 37.7 28.4  9.7 19.9 0.489 7.3 9.4 0.780  1.6  4.7
## 3   Kobe Bryant  82 36.2 26.8  9.8 20.9 0.467 5.9 6.9 0.856  1.4  4.1
## 4 Dirk Nowitzki  81 37.7 25.9  9.6 20.0 0.479 6.0 6.7 0.890  0.8  2.1
## 5 Danny Granger  67 36.2 25.8  8.5 19.1 0.447 6.0 6.9 0.878  2.7  6.7
## 6  Kevin Durant  74 39.0 25.3  8.9 18.8 0.476 6.1 7.1 0.863  1.3  3.1
##    X3PP ORB DRB TRB AST STL BLK  TO  PF
## 1 0.317 1.1 3.9 5.0 7.5 2.2 1.3 3.4 2.3
## 2 0.344 1.3 6.3 7.6 7.2 1.7 1.1 3.0 1.7
## 3 0.351 1.1 4.1 5.2 4.9 1.5 0.5 2.6 2.3
## 4 0.359 1.1 7.3 8.4 2.4 0.8 0.8 1.9 2.2
## 5 0.404 0.7 4.4 5.1 2.7 1.0 1.4 2.5 3.1
## 6 0.422 1.0 5.5 6.5 2.8 1.3 0.7 3.0 1.8
# Use the player names as row labels, drop the Name column, and convert the
# remaining statistics to a matrix, which is what heatmap() takes here.
rownames(bball) <- bball$Name
bball <- as.matrix(bball[, -1])

heatmap(bball) # Lighter colours are higher values. Units differ between variables, so scaling is needed.

# Scale within each column and keep the columns in their original order.
heatmap(bball, Colv = NA, scale = "column")

# 7. Visualising a bivariate distribution & clustering

library(MASS)
# NOTE(review): attach() is kept only so that any code relying on the bare
# names `waiting` / `duration` keeps working; the calls below reference the
# geyser columns explicitly and do not need it.
attach(geyser)
head(geyser)
##   waiting duration
## 1      80 4.016667
## 2      71 2.150000
## 3      57 4.000000
## 4      80 4.000000
## 5      75 4.000000
## 6      77 2.000000
dim(geyser)
## [1] 299   2
# 2-D kernel density estimate evaluated on a 25 x 25 grid.
density1 <- kde2d(geyser$waiting, geyser$duration, n = 25)
image(density1, xlab = "waiting", ylab = "duration")

str(density1)
## List of 3
##  $ x: num [1:25] 43 45.7 48.4 51.1 53.8 ...
##  $ y: num [1:25] 0.833 1.026 1.218 1.41 1.603 ...
##  $ z: num [1:25, 1:25] 9.07e-13 1.95e-11 3.10e-10 3.66e-09 3.26e-08 ...
# Same estimate on a finer 100 x 100 grid.
density2 <- kde2d(geyser$waiting, geyser$duration, n = 100)
image(density2, xlab = "waiting", ylab = "duration")

# Contour lines
contour(density2)

# Practice
# US crime-rate data for 2005
# Rates are occurrences per 100,000 population

data <- read.csv(file = "data/crime.csv", header = TRUE)
head(data, n = 10)
##                   state murder forcible_rape robbery aggravated_assault
## 1         United States    5.6          31.7   140.7              291.1
## 2               Alabama    8.2          34.3   141.4              247.8
## 3                Alaska    4.8          81.1    80.9              465.1
## 4               Arizona    7.5          33.8   144.4              327.4
## 5              Arkansas    6.7          42.9    91.1              386.8
## 6            California    6.9          26.0   176.1              317.3
## 7              Colorado    3.7          43.4    84.6              264.7
## 8           Connecticut    2.9          20.0   113.0              138.6
## 9              Delaware    4.4          44.7   154.8              428.2
## 10 District of Columbia   35.4          30.2   672.1              721.3
##    burglary larceny_theft motor_vehicle_theft population
## 1     726.7        2286.3               416.7  295753151
## 2     953.8        2650.0               288.3    4545049
## 3     622.5        2599.1               391.0     669488
## 4     948.4        2965.2               924.4    5974834
## 5    1084.6        2711.2               262.1    2776221
## 6     693.3        1916.5               712.8   35795255
## 7     744.8        2735.2               559.5    4660780
## 8     437.1        1824.1               296.8    3477416
## 9     688.9        2144.0               278.5     839906
## 10    649.7        2694.9              1402.3     582049
# Drop the first row ("United States", the national aggregate).
crime <- data[-1, ]
# Renumber the rows from 1; seq_len() stays valid even for an empty frame,
# where 1:nrow() would yield c(1, 0).
rownames(crime) <- seq_len(nrow(crime))
head(crime, n = 10)
##                   state murder forcible_rape robbery aggravated_assault
## 1               Alabama    8.2          34.3   141.4              247.8
## 2                Alaska    4.8          81.1    80.9              465.1
## 3               Arizona    7.5          33.8   144.4              327.4
## 4              Arkansas    6.7          42.9    91.1              386.8
## 5            California    6.9          26.0   176.1              317.3
## 6              Colorado    3.7          43.4    84.6              264.7
## 7           Connecticut    2.9          20.0   113.0              138.6
## 8              Delaware    4.4          44.7   154.8              428.2
## 9  District of Columbia   35.4          30.2   672.1              721.3
## 10              Florida    5.0          37.1   169.4              496.6
##    burglary larceny_theft motor_vehicle_theft population
## 1     953.8        2650.0               288.3    4545049
## 2     622.5        2599.1               391.0     669488
## 3     948.4        2965.2               924.4    5974834
## 4    1084.6        2711.2               262.1    2776221
## 5     693.3        1916.5               712.8   35795255
## 6     744.8        2735.2               559.5    4660780
## 7     437.1        1824.1               296.8    3477416
## 8     688.9        2144.0               278.5     839906
## 9     649.7        2694.9              1402.3     582049
## 10    926.3        2658.3               423.3   17783868
# A. Draw the scatterplot of murder against burglary together with their
# univariate distributions, and inspect the correlation coefficient as well.

library(psych)
# Restrict the panel to the two variables under study: the full frame also
# holds the `state` identifier and other columns that are not part of this
# question, and the two-variable panel is directly comparable with the one
# drawn after the outlier is removed.
pairs.panels(crime[, c("murder", "burglary")])     # cor = 0.28

cor.test(crime$murder, crime$burglary)
## 
##  Pearson's product-moment correlation
## 
## data:  crime$murder and crime$burglary
## t = 2.0125, df = 49, p-value = 0.04968
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.0007878724 0.5128437338
## sample estimates:
##       cor 
## 0.2763115
# B. Using the above, decide whether outliers exist; if so, identify the
# state concerned and remove it.
# Then examine how the relationship between the variables changes.

cdata <- subset(crime, select = c(murder, burglary))
head(cdata, n = 6L)
##   murder burglary
## 1    8.2    953.8
## 2    4.8    622.5
## 3    7.5    948.4
## 4    6.7   1084.6
## 5    6.9    693.3
## 6    3.7    744.8
# The two rates are on different scales, so standardise them for comparison.
boxplot(scale(cdata))   # murder has one unusually high outlier

# Outlier check based on quartiles (upper fence).
# fivenum : minimum, lower-hinge, median, upper-hinge, maximum
# Both the upper hinge and the spread are taken from fivenum(), so the fence
# uses one quartile definition throughout (hinges, as in the box plot) instead
# of mixing hinges with IQR()'s quantiles.
a <- cdata$murder
hinges <- fivenum(a)
which(a > hinges[4] + 1.5 * (hinges[4] - hinges[2]))   # 9
## [1] 9
# Outlier check via a labelled scatterplot
cdata <- crime[, c("state", "murder", "burglary")]
head(cdata, 10)
##                   state murder burglary
## 1               Alabama    8.2    953.8
## 2                Alaska    4.8    622.5
## 3               Arizona    7.5    948.4
## 4              Arkansas    6.7   1084.6
## 5            California    6.9    693.3
## 6              Colorado    3.7    744.8
## 7           Connecticut    2.9    437.1
## 8              Delaware    4.4    688.9
## 9  District of Columbia   35.4    649.7
## 10              Florida    5.0    926.3
# Empty canvas first, then draw each row's number at its coordinates.
plot(burglary ~ murder, data = cdata, type = "n", cex = 0.8)
text(x = cdata$murder, y = cdata$burglary, labels = rownames(cdata), cex = 0.8)   # outlier = 9

outlier <- 9
cdata[outlier, c("state", "murder", "burglary")]   # District of Columbia
##                  state murder burglary
## 9 District of Columbia   35.4    649.7
# Check the correlation after removing the outlier
cdata <- cdata[-outlier, ]
head(cdata, n = 10)
##          state murder burglary
## 1      Alabama    8.2    953.8
## 2       Alaska    4.8    622.5
## 3      Arizona    7.5    948.4
## 4     Arkansas    6.7   1084.6
## 5   California    6.9    693.3
## 6     Colorado    3.7    744.8
## 7  Connecticut    2.9    437.1
## 8     Delaware    4.4    688.9
## 10     Florida    5.0    926.3
## 11     Georgia    6.2    931.0
# Pearson correlation test on the data without the outlier.
cor.test(cdata$murder, cdata$burglary, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  cdata$murder and cdata$burglary
## t = 5.5205, df = 48, p-value = 1.342e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4171943 0.7682612
## sample estimates:
##       cor 
## 0.6231757
pairs.panels(cdata[, c("murder", "burglary")])          # cor = 0.62: the correlation increased

# C. Draw a bubble plot to look at murder, burglary and population together,
# and describe what it shows.

# Original data
symbols(crime$murder, crime$burglary, inches = 0.5, circles = crime$population)

# With the outlier removed
cdata <- crime[-outlier, c("state", "murder", "burglary", "population")]
symbols(cdata$murder, cdata$burglary, inches = 0.5, circles = cdata$population)

# Murder and burglary rates are correlated to some degree, but neither shows a
# relationship with population.
# D. Show the seven crime rates as a heat map, a star plot and a Nightingale
# chart, and look for states with similar crime patterns.

# Keep only the seven crime columns (drop state and population) and use the
# state names as row labels.
cdata2 <- crime[, !(names(crime) %in% c("state", "population"))]
rownames(cdata2) <- crime$state
head(cdata2, n = 10)
##                      murder forcible_rape robbery aggravated_assault
## Alabama                 8.2          34.3   141.4              247.8
## Alaska                  4.8          81.1    80.9              465.1
## Arizona                 7.5          33.8   144.4              327.4
## Arkansas                6.7          42.9    91.1              386.8
## California              6.9          26.0   176.1              317.3
## Colorado                3.7          43.4    84.6              264.7
## Connecticut             2.9          20.0   113.0              138.6
## Delaware                4.4          44.7   154.8              428.2
## District of Columbia   35.4          30.2   672.1              721.3
## Florida                 5.0          37.1   169.4              496.6
##                      burglary larceny_theft motor_vehicle_theft
## Alabama                 953.8        2650.0               288.3
## Alaska                  622.5        2599.1               391.0
## Arizona                 948.4        2965.2               924.4
## Arkansas               1084.6        2711.2               262.1
## California              693.3        1916.5               712.8
## Colorado                744.8        2735.2               559.5
## Connecticut             437.1        1824.1               296.8
## Delaware                688.9        2144.0               278.5
## District of Columbia    649.7        2694.9              1402.3
## Florida                 926.3        2658.3               423.3
# Star and Nightingale charts take the data frame directly.
# Star plot
stars(cdata2, ncol = 10, key.loc = c(11, 2), cex = 0.8)

# Nightingale chart (segments instead of stars)
stars(cdata2, ncol = 10, key.loc = c(11, 2), cex = 0.8, draw.segments = TRUE)

# The heat map is drawn from a matrix; scale within each column and keep the
# column order.
cdata2 <- as.matrix(cdata2)
heatmap(cdata2, scale = "column", Colv = NA, margins = c(8, 5), cexCol = 0.9)