# Titanic passenger data: exploratory analysis
# Load the passenger data, keeping text columns as character vectors.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved word.
titanic <- read.csv("data/titanic.csv", stringsAsFactors = FALSE)
# Preview the first rows.
head(titanic)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp
## 1 Braund, Mr. Owen Harris male 22 1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1
## 3 Heikkinen, Miss. Laina female 26 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1
## 5 Allen, Mr. William Henry male 35 0
## 6 Moran, Mr. James male NA 0
## Parch Ticket Fare Cabin Embarked
## 1 0 A/5 21171 7.2500 S
## 2 0 PC 17599 71.2833 C85 C
## 3 0 STON/O2. 3101282 7.9250 S
## 4 0 113803 53.1000 C123 S
## 5 0 373450 8.0500 S
## 6 0 330877 8.4583 Q
# Per-column summary; for numeric columns this also reports the NA count.
summary(titanic)
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
# Number of rows and columns.
dim(titanic)
## [1] 891 12
# Convert the coded columns to factors so they are handled as categorical
# variables rather than as numbers or free text.
titanic$Survived <- factor(titanic$Survived)
titanic$Pclass <- factor(titanic$Pclass)
titanic$Sex <- factor(titanic$Sex)
library(ggplot2)
# Age distribution of the passengers.
# Rows with a missing Age are dropped before plotting.
df <- titanic[!is.na(titanic$Age), ]
# Histogram with 5-year-wide bins.
ggplot(df) + geom_histogram(aes(x = Age), binwidth = 5)
# Smoothed density estimate of the same variable.
ggplot(df) + geom_density(aes(x = Age), fill = "pink")
# Passenger-class distribution among the survivors (Survived == 1).
df <- titanic[titanic$Survived == 1, ]
x <- ggplot(df, aes(Pclass))
# One bar per class, filled by sex (geom_bar stacks the groups by default).
x + geom_bar(aes(fill = Sex))
# Same counts with the sexes placed side by side.
x + geom_bar(aes(fill = Sex), position = "dodge")
# Infer gender from the title embedded in each name.
# Reference value from the original note: male = 577.
# First attempt: flag only the names that contain the title "Mr.".
is_man <- grepl(", Mr\\.", titanic$Name)
sum(is_man)
## [1] 517
# Collect the distinct titles that occur in the names.
# ".*" matches any run of characters. Names are written "Last, Title. First",
# so ", (.*?)\\." captures the text between ", " and the following ".".
# "\\1" replaces the whole string with that captured group.
titles <- unique(gsub("^.*, (.*?)\\..*$", "\\1", titanic$Name))
titles
## [1] "Mr" "Mrs" "Miss" "Master"
## [5] "Don" "Rev" "Dr" "Mme"
## [9] "Ms" "Major" "Lady" "Sir"
## [13] "Mlle" "Col" "Capt" "the Countess"
## [17] "Jonkheer"
# Titles treated as male, each turned into a pattern of the form ", Title\\.".
# The trailing "\\." anchors every title so that a pattern cannot match the
# beginning of a longer title (e.g. ", Don" inside ", Dona.").
titles <- paste0(
  ", ",
  c(
    "Mr", "Master", "Don", "Rev", "Dr",
    "Major", "Sir", "Col", "Capt", "Jonkheer"
  ),
  "\\."
)
# A single alternation yields one logical per passenger, so sum() counts
# passengers; a passengers-by-patterns matrix would count every match instead.
# NOTE(review): "Dr" is not gender-specific, so this count may include women.
is_man <- grepl(paste(titles, collapse = "|"), titanic$Name)
sum(is_man)
## [1] 578
# Extract the plain personal name from a "Last, Title. First" string,
# dropping the title.
#
# - If the string contains a parenthesised part, the text between the first
#   "(" and a ")" at the very end of the string is returned.
# - Otherwise "Last, Title. First" is rearranged to "First Last".
# Strings that do not fit the relevant pattern are returned unchanged.
#
# `name` may be a character vector of any length (including zero); the result
# is a character vector of the same length. Working on whole vectors avoids
# the scalar `if`, which fails for inputs longer than one element.
convert_name <- function(name) {
  has_parens <- grepl("\\(.*?\\)", name)
  # Swap the order of the last name and the given names.
  result <- gsub("^(.*?),\\s[a-zA-Z\\.]*?\\s(.*?)$", "\\2 \\1", name)
  # Where a parenthesised name exists, use the name inside the parentheses.
  result[has_parens] <- gsub(
    "^.*?\\((.*?)\\)$", "\\1", name[has_parens]
  )
  result
}
# Clean every passenger name. `FUN.VALUE = character(1)` makes vapply() stop
# with an error if convert_name() returns anything but a single string.
clean_names <- vapply(
  titanic$Name,
  FUN = convert_name,
  FUN.VALUE = character(1),
  USE.NAMES = FALSE
)
head(clean_names)
## [1] "Owen Harris Braund" "Florence Briggs Thayer"
## [3] "Laina Heikkinen" "Lily May Peel"
## [5] "William Henry Allen" "James Moran"
# Original names, for comparison.
head(titanic$Name)
## [1] "Braund, Mr. Owen Harris"
## [2] "Cumings, Mrs. John Bradley (Florence Briggs Thayer)"
## [3] "Heikkinen, Miss. Laina"
## [4] "Futrelle, Mrs. Jacques Heath (Lily May Peel)"
## [5] "Allen, Mr. William Henry"
## [6] "Moran, Mr. James"