Titanic

# Load the Titanic passenger data, keeping text columns as character vectors.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
titanic <- read.csv("data/titanic.csv", stringsAsFactors = FALSE)
# Preview the first rows.
head(titanic)
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp
## 1                             Braund, Mr. Owen Harris   male  22     1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1
## 3                              Heikkinen, Miss. Laina female  26     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1
## 5                            Allen, Mr. William Henry   male  35     0
## 6                                    Moran, Mr. James   male  NA     0
##   Parch           Ticket    Fare Cabin Embarked
## 1     0        A/5 21171  7.2500              S
## 2     0         PC 17599 71.2833   C85        C
## 3     0 STON/O2. 3101282  7.9250              S
## 4     0           113803 53.1000  C123        S
## 5     0           373450  8.0500              S
## 6     0           330877  8.4583              Q
# Per-column summary: numeric columns get quantiles and mean, character
# columns only length/class/mode. Also shows that Age contains NA's.
summary(titanic)
##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
## 
# Number of rows and columns.
dim(titanic)
## [1] 891  12
# Turn the categorical columns into factors in one pass.
titanic[c("Survived", "Pclass", "Sex")] <-
    lapply(titanic[c("Survived", "Pclass", "Sex")], factor)

library(ggplot2)

# Passenger age distribution (rows with a missing Age are dropped first)
df <- titanic[!is.na(titanic$Age), ]
ggplot(df, aes(x = Age)) + geom_histogram(binwidth = 5)

ggplot(df, aes(x = Age)) + geom_density(fill = "pink")

# Cabin-class distribution among survivors
df <- titanic[titanic$Survived == 1, ]

x <- ggplot(df, aes(Pclass))
# Bars filled by sex, using the default bar position
x + geom_bar(aes(fill = Sex))

# Same bars with position = "dodge"
x + geom_bar(aes(fill = Sex), position = "dodge")

# Infer gender from name.
# Reference count of male passengers: 577 (presumably from the Sex column).

# First attempt: flag only the names that carry the "Mr." title.
is_man <- grepl(", Mr\\.", titanic$Name)
sum(is_man)
## [1] 517
# Collect the distinct titles that occur in the names.
# Names have the form "last name, title. first name": the regex looks for the
# ", (.*?)\\." part, and "\\1" replaces the whole string with the text
# captured by the parenthesised group.
titles <- unique(
    gsub("^.*, (.*?)\\..*$", "\\1", titanic$Name)
)
titles
##  [1] "Mr"           "Mrs"          "Miss"         "Master"      
##  [5] "Don"          "Rev"          "Dr"           "Mme"         
##  [9] "Ms"           "Major"        "Lady"         "Sir"         
## [13] "Mlle"         "Col"          "Capt"         "the Countess"
## [17] "Jonkheer"
# Regex patterns for the titles treated as male. Every title ends with an
# escaped "." so that only the complete title matches (", Mr\\." must not
# match "Mrs.", and ", Don\\." must not match a longer title starting with
# "Don").
titles <- paste0(
    ", ",
    c("Mr", "Master", "Don", "Rev", "Dr", "Major", "Sir", "Col", "Capt",
      "Jonkheer"),
    "\\."
)

# One logical value per passenger: TRUE when the name carries any of the
# titles. A single alternation regex is used instead of sapply(), which
# returned a names-by-titles matrix rather than a per-passenger vector.
is_man <- grepl(paste(titles, collapse = "|"), titanic$Name)
# NOTE(review): the count below (578) exceeds the 577 males noted earlier,
# presumably because a title such as "Dr." is not sex-specific - confirm
# against the Sex column.
sum(is_man)
## [1] 578
# Extract the plain name, without the title.
#
# name: character vector of names in the form "Last, Title. First ..." which
#       may contain a parenthesised part, e.g.
#       "Cumings, Mrs. John Bradley (Florence Briggs Thayer)".
# Returns a character vector of the same length as `name`:
#   - if the name contains a parenthesised part and ends with ")", the text
#     inside the parentheses;
#   - otherwise "First ... Last" (title removed, last name moved to the end).
# Elements that match neither pattern are returned unchanged.
#
# Vectorised: works on a single string as before, and also on zero-length or
# multi-element input, where the former scalar `if` would fail.
convert_name <- function(name) {
    has_paren <- grepl("\\(.*?\\)", name)
    # Default: swap the order of last name and first name
    out <- gsub("^(.*?),\\s[a-zA-Z\\.]*?\\s(.*?)$", "\\2 \\1", name)
    # Names with parentheses: take the name inside the parentheses instead
    out[has_paren] <- gsub("^.*?\\((.*?)\\)$", "\\1", name[has_paren])
    out
}

# Apply convert_name to every passenger name; the result is an unnamed
# character vector with one cleaned name per passenger.
clean_names <- vapply(
    X = titanic$Name,
    FUN = convert_name,
    FUN.VALUE = character(1),
    USE.NAMES = FALSE
)
head(clean_names)
## [1] "Owen Harris Braund"     "Florence Briggs Thayer"
## [3] "Laina Heikkinen"        "Lily May Peel"         
## [5] "William Henry Allen"    "James Moran"
# Original names, for comparison with the cleaned ones above.
head(titanic$Name)
## [1] "Braund, Mr. Owen Harris"                            
## [2] "Cumings, Mrs. John Bradley (Florence Briggs Thayer)"
## [3] "Heikkinen, Miss. Laina"                             
## [4] "Futrelle, Mrs. Jacques Heath (Lily May Peel)"       
## [5] "Allen, Mr. William Henry"                           
## [6] "Moran, Mr. James"