R Basics & 정규표현식

# 기초 문법

# NA

# A single NA in the input makes sum() return NA.
a <- c(1, 2, NA)
sum(a)

## [1] NA

# na.rm = TRUE drops missing values before summing. TRUE is spelled out
# because T is an ordinary variable that can be reassigned.
sum(a, na.rm = TRUE)

## [1] 3

# NA : Not Available. logical type.
# NaN : Not a Number 정의되지 않은 숫자. double type.

is.na(NA)                       # TRUE

## [1] TRUE

is.na(NaN)                      # TRUE

## [1] TRUE

is.na(c("NA", NA, NaN))         # FALSE TRUE FALSE / 처음값 때문에 NaN이 문자형으로 형변환.

## [1] FALSE  TRUE FALSE

is.na(c(NaN, NA))               # TRUE TRUE

## [1] TRUE TRUE

# factor : 문자 --> 숫자. 그래프 또는 통계처리에 유리.

a <- c("서울","부산","제주","제주","서울","대전","부산","서울")
fa <- factor(a)
fa

## [1] 서울 부산 제주 제주 서울 대전 부산 서울
## Levels: 대전 부산 서울 제주

mode(fa)

## [1] "numeric"

plot(fa)

# Date

Sys.Date()

## [1] "2017-03-16"

substr(Sys.Date(), 1, 4)

## [1] "2017"

startDate <- as.Date("2016-03-01")
endDate <- as.Date("2016-04-09")
endDate - startDate

## Time difference of 39 days

as.numeric(endDate - startDate)

## [1] 39

as.Date("05072016", format="%d%m%Y")

## [1] "2016-07-05"

as.Date(30, "2016-05-01")

## [1] "2016-05-31"

as.Date(-30, "2016-05-01")

## [1] "2016-04-01"

# 내장 함수

ceiling(1.6)

## [1] 2

floor(1.6)

## [1] 1

exp(2)           # e^2

## [1] 7.389056

factorial(5)

## [1] 120

sqrt(4)          # root

## [1] 2

x <- c(1:10)
max(x)

## [1] 10

min(x)

## [1] 1

length(x)

## [1] 10

mean(x)          # 평균

## [1] 5.5

median(x)        # 중앙값

## [1] 5.5

sd(x)            # 표준편차

## [1] 3.02765

rev(x)

##  [1] 10  9  8  7  6  5  4  3  2  1

# 집합 연산

a <- c(1, 2, 3)
b <- c(2, 3, 4)
# `+` is element-wise addition, not a set operation.
a + b

## [1] 3 5 7

# Named `chars` rather than `c` so the base function c() is not masked.
chars <- c("3", "4", "5")

# union() of a numeric and a character vector yields a character result.
union(a, chars)

## [1] "1" "2" "3" "4" "5"

setdiff(a,b)

## [1] 1

setdiff(b,a)

## [1] 4

intersect(a,b)

## [1] 2 3

# Vector

# vector의 datatype은 모두 동일해야 한다.
# 다를 경우 자동으로 하나의 데이터타입으로 처리됨.
a <- c(1, 2, "3")         
a

## [1] "1" "2" "3"

names(a) <- c("x1", "x2", "x3")   # vector 각 컬럼에 이름 지정 가능
a

##  x1  x2  x3 
## "1" "2" "3"

a["x1"]

##  x1 
## "1"

length(a)

## [1] 3

nrow(a)        # 행렬에만 사용되는 함수

## NULL

NROW(a)

## [1] 3

seq(1, 10)

##  [1]  1  2  3  4  5  6  7  8  9 10

seq(5, -5)

##  [1]  5  4  3  2  1  0 -1 -2 -3 -4 -5

seq(-5, 5, 2)   # 2씩 증가

## [1] -5 -3 -1  1  3  5

rep(1:5, 2)         # 반복

##  [1] 1 2 3 4 5 1 2 3 4 5

rep(1:5, 2, each=2) # 연속해서 반복

##  [1] 1 1 2 2 3 3 4 4 5 5 1 1 2 2 3 3 4 4 5 5

1 %in% a       # vector에 특정 문자 포함 여부

## [1] TRUE

# vector indexing
a <- c(1,2,3,4,5,6)
a[c(1,3)]

## [1] 1 3

a[-1:-3]

## [1] 4 5 6

a[-length(a)]

## [1] 1 2 3 4 5

# 행렬 matrix

# Three ways to build a matrix from the same values; each assignment
# replaces the previous one.
x <- matrix(c(1, 2, 3, 4))                            # single column
x <- matrix(c(1, 2, 3, 4), nrow = 2)                  # 2 rows, filled by column
x <- matrix(c(1, 2, 3, 4), nrow = 2, byrow = TRUE)    # 2 rows, filled by row
x

##      [,1] [,2]
## [1,]    1    2
## [2,]    3    4

x[1, ]     # all values of row 1

## [1] 1 2

x[, 1]     # all values of column 1

## [1] 1 3

x <- matrix(c(1, 2, 3, 4), nrow = 2)
x <- rbind(x, c(77, 88))       # append a row
x <- cbind(x, c(9, 8, 7))      # append a column
x

##      [,1] [,2] [,3]
## [1,]    1    3    9
## [2,]    2    4    8
## [3,]   77   88    7

colnames(x) <- c("no1", "no2", "no3")
x

##      no1 no2 no3
## [1,]   1   3   9
## [2,]   2   4   8
## [3,]  77  88   7

# list

member <- list(name="Kate", address="Seoul", tall=170, pay=10000)
member

## $name
## [1] "Kate"
## 
## $address
## [1] "Seoul"
## 
## $tall
## [1] 170
## 
## $pay
## [1] 10000

member$name

## [1] "Kate"

member[2:3]

## $address
## [1] "Seoul"
## 
## $tall
## [1] 170

member$birth <- "1990-05-07"      # list에 항목 추가
member$birth <- NULL              # list 항목 삭제

length(member)                    # 항목 갯수

## [1] 4

# dataframe

# Create Dataframe

# 1. 직접 생성

no <- c(1:3)
name <- c("xx", "yy", "zz")
price <- c(500,100,300)
qty <- c(2,7,9)

item <- data.frame(NO=no, Name=name, Price=price, QTY=qty)
item

##   NO Name Price QTY
## 1  1   xx   500   2
## 2  2   yy   100   7
## 3  3   zz   300   9

# 2. from Matrix

# Mixing numbers and strings in c() coerces every element to character,
# so the matrix (and the data frame built from it) holds text only.
x <- c(1, "James", 300, 2, "Kate", 500, 3, "Risa", 700, 4, "Liz", 900)
data <- matrix(x, 4, byrow = TRUE)     # 4 rows, filled row by row
data

##      [,1] [,2]    [,3] 
## [1,] "1"  "James" "300"
## [2,] "2"  "Kate"  "500"
## [3,] "3"  "Risa"  "700"
## [4,] "4"  "Liz"   "900"

member <- data.frame(data)
names(member) <- c("NO", "NAME", "PAY")    # replace the default column names
member

##   NO  NAME PAY
## 1  1 James 300
## 2  2  Kate 500
## 3  3  Risa 700
## 4  4   Liz 900

# 3. from Text file

# Read a whitespace-separated text file whose first line holds the column names.
fruitData <- read.table("data/fruits.txt", header = TRUE, sep = "")
fruitData

##   no   name price qty
## 1  1  apple   500   5
## 2  2 orange   300   3
## 3  3  peach   200   7
## 4  4  berry   100  10

# Skip the first 3 lines of the file (the header line plus two data rows).
# header must be FALSE here: with header = TRUE the first remaining data row
# would be consumed as column names.
fruitData <- read.table("data/fruits.txt", header = FALSE, sep = "", skip = 3)
# Read the header plus only the first 3 data rows.
fruitData <- read.table("data/fruits.txt", header = TRUE, sep = "", nrows = 3)


# 4. from csv or Excel file

# The CSV is read with header = FALSE, so column names are supplied explicitly.
cnames <- c("no", "name", "price1", "price2", "qty")
df <- read.csv("data/fruits.csv", header = FALSE, col.names = cnames)
df

##   no   name price1 price2 qty
## 1  1  apple    500    700   5
## 2  2 orange    300    550   3
## 3  3  peach    200    400   7
## 4  4  berry    100    180  10

library("readxl")
df <- read_excel(path = "data/fruits.xlsx", sheet = "Sheet1", col_names = TRUE)
df

##   no   name lowprice highprice qty
## 1  1  apple      500       700   5
## 2  2 orange      600       800   3
## 3  3  peach      200       400   7
## 4  4  berry      100       200  10

# save to xml

# library("XML")
# library("kulife")
# write.xml(df, file="fruits.xml")

# Dataframe 다루기

df <- read_excel(path = "data/fruits.xlsx", sheet = "Sheet1", col_names = TRUE)
df

##   no   name lowprice highprice qty
## 1  1  apple      500       700   5
## 2  2 orange      600       800   3
## 3  3  peach      200       400   7
## 4  4  berry      100       200  10

summary(df)

##        no           name              lowprice     highprice  
##  Min.   :1.00   Length:4           Min.   :100   Min.   :200  
##  1st Qu.:1.75   Class :character   1st Qu.:175   1st Qu.:350  
##  Median :2.50   Mode  :character   Median :350   Median :550  
##  Mean   :2.50                      Mean   :350   Mean   :525  
##  3rd Qu.:3.25                      3rd Qu.:525   3rd Qu.:725  
##  Max.   :4.00                      Max.   :600   Max.   :800  
##       qty       
##  Min.   : 3.00  
##  1st Qu.: 4.50  
##  Median : 6.00  
##  Mean   : 6.25  
##  3rd Qu.: 7.75  
##  Max.   :10.00

str(df)

## Classes 'tbl_df', 'tbl' and 'data.frame':    4 obs. of  5 variables:
##  $ no       : num  1 2 3 4
##  $ name     : chr  "apple" "orange" "peach" "berry"
##  $ lowprice : num  500 600 200 100
##  $ highprice: num  700 800 400 200
##  $ qty      : num  5 3 7 10

df[2,2]

## [1] "orange"

df$lowprice

## [1] 500 600 200 100

df[c(2,3)]                    # columns 2 and 3 (name, lowprice)

##     name lowprice
## 1  apple      500
## 2 orange      600
## 3  peach      200
## 4  berry      100

df[c(2,3),]                   # rows 2 and 3

##   no   name lowprice highprice qty
## 2  2 orange      600       800   3
## 3  3  peach      200       400   7

ncol(df)

## [1] 5

nrow(df)

## [1] 4

names(df)

## [1] "no"        "name"      "lowprice"  "highprice" "qty"

rownames(df)

## [1] "1" "2" "3" "4"

# sort / order / rank

sort(df$highprice)                      # values in ascending order

## [1] 200 400 700 800

sort(df$highprice, decreasing = TRUE)   # values in descending order

## [1] 800 700 400 200

order(df$highprice)           # indices that would sort the vector ascending

## [1] 4 3 1 2

rank(df$highprice)            # rank of each element, in the original order

## [1] 3 4 2 1

# split

split(df, df$name)            # 해당 컬럼 기준으로 분할

## $apple
##   no  name lowprice highprice qty
## 1  1 apple      500       700   5
## 
## $berry
##   no  name lowprice highprice qty
## 4  4 berry      100       200  10
## 
## $orange
##   no   name lowprice highprice qty
## 2  2 orange      600       800   3
## 
## $peach
##   no  name lowprice highprice qty
## 3  3 peach      200       400   7

split(df, df$no > 2)

## $`FALSE`
##   no   name lowprice highprice qty
## 1  1  apple      500       700   5
## 2  2 orange      600       800   3
## 
## $`TRUE`
##   no  name lowprice highprice qty
## 3  3 peach      200       400   7
## 4  4 berry      100       200  10

# merge

x <- data.frame(names = c("A", "B", "C"), address = c("Seoul", "Busan", "Tyokyo"))
y <- data.frame(names = c("A", "B", "D"), telno = c("001", "003", "888"))

merge(x, y)      # keeps only rows whose key exists in both data frames

##   names address telno
## 1     A   Seoul   001
## 2     B   Busan   003

merge(x, y, by = "names")      # same join with the key column named explicitly

##   names address telno
## 1     A   Seoul   001
## 2     B   Busan   003

# all = TRUE keeps unmatched rows from both sides and fills the gaps with NA.
merge(x, y, by = "names", all = TRUE)

##   names address telno
## 1     A   Seoul   001
## 2     B   Busan   003
## 3     C  Tyokyo  <NA>
## 4     D    <NA>   888

# subset : dataframe 에서 조건에 맞는 데이터를 dataframe으로 추출

subset(df, df$qty > 5)

##   no  name lowprice highprice qty
## 3  3 peach      200       400   7
## 4  4 berry      100       200  10

subset(df, highprice > 500)

##   no   name lowprice highprice qty
## 1  1  apple      500       700   5
## 2  2 orange      600       800   3

subset(df, name == "apple")

##   no  name lowprice highprice qty
## 1  1 apple      500       700   5

subset(df, select = c(name,qty), subset = df$qty > 5)

##    name qty
## 3 peach   7
## 4 berry  10

subset(df, select = -no)

##     name lowprice highprice qty
## 1  apple      500       700   5
## 2 orange      600       800   3
## 3  peach      200       400   7
## 4  berry      100       200  10

library(MASS)
str(Cars93)

## 'data.frame':    93 obs. of  27 variables:
##  $ Manufacturer      : Factor w/ 32 levels "Acura","Audi",..: 1 1 2 2 3 4 4 4 4 5 ...
##  $ Model             : Factor w/ 93 levels "100","190E","240",..: 49 56 9 1 6 24 54 74 73 35 ...
##  $ Type              : Factor w/ 6 levels "Compact","Large",..: 4 3 1 3 3 3 2 2 3 2 ...
##  $ Min.Price         : num  12.9 29.2 25.9 30.8 23.7 14.2 19.9 22.6 26.3 33 ...
##  $ Price             : num  15.9 33.9 29.1 37.7 30 15.7 20.8 23.7 26.3 34.7 ...
##  $ Max.Price         : num  18.8 38.7 32.3 44.6 36.2 17.3 21.7 24.9 26.3 36.3 ...
##  $ MPG.city          : int  25 18 20 19 22 22 19 16 19 16 ...
##  $ MPG.highway       : int  31 25 26 26 30 31 28 25 27 25 ...
##  $ AirBags           : Factor w/ 3 levels "Driver & Passenger",..: 3 1 2 1 2 2 2 2 2 2 ...
##  $ DriveTrain        : Factor w/ 3 levels "4WD","Front",..: 2 2 2 2 3 2 2 3 2 2 ...
##  $ Cylinders         : Factor w/ 6 levels "3","4","5","6",..: 2 4 4 4 2 2 4 4 4 5 ...
##  $ EngineSize        : num  1.8 3.2 2.8 2.8 3.5 2.2 3.8 5.7 3.8 4.9 ...
##  $ Horsepower        : int  140 200 172 172 208 110 170 180 170 200 ...
##  $ RPM               : int  6300 5500 5500 5500 5700 5200 4800 4000 4800 4100 ...
##  $ Rev.per.mile      : int  2890 2335 2280 2535 2545 2565 1570 1320 1690 1510 ...
##  $ Man.trans.avail   : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 1 1 1 1 ...
##  $ Fuel.tank.capacity: num  13.2 18 16.9 21.1 21.1 16.4 18 23 18.8 18 ...
##  $ Passengers        : int  5 5 5 6 4 6 6 6 5 6 ...
##  $ Length            : int  177 195 180 193 186 189 200 216 198 206 ...
##  $ Wheelbase         : int  102 115 102 106 109 105 111 116 108 114 ...
##  $ Width             : int  68 71 67 70 69 69 74 78 73 73 ...
##  $ Turn.circle       : int  37 38 37 37 39 41 42 45 41 43 ...
##  $ Rear.seat.room    : num  26.5 30 28 31 27 28 30.5 30.5 26.5 35 ...
##  $ Luggage.room      : int  11 15 14 17 13 16 17 21 14 18 ...
##  $ Weight            : int  2705 3560 3375 3405 3640 2880 3470 4105 3495 3620 ...
##  $ Origin            : Factor w/ 2 levels "USA","non-USA": 2 2 2 2 2 1 1 1 1 1 ...
##  $ Make              : Factor w/ 93 levels "Acura Integra",..: 1 2 4 3 5 6 7 9 8 10 ...

subset(Cars93, select = c(Model, Type, Price), MPG.city > 30)

##      Model  Type Price
## 31 Festiva Small   7.4
## 39   Metro Small   8.4
## 42   Civic Small  12.1
## 73  LeMans Small   9.0
## 80   Justy Small   8.4
## 83   Swift Small   8.6
## 84  Tercel Small   9.8

subset(Cars93, select = c(Manufacturer, Model, Type, Price, Make), 
       MPG.highway > median(MPG.highway) & Manufacturer == "Subaru")

##    Manufacturer  Model    Type Price          Make
## 80       Subaru  Justy   Small   8.4  Subaru Justy
## 81       Subaru Loyale   Small  10.9 Subaru Loyale
## 82       Subaru Legacy Compact  19.5 Subaru Legacy

# Dataframe 내용 저장.

# Write df as plain text: no quotes around strings, overwriting any existing file.
write.table(df, "data/save_fruits.txt", quote = FALSE, append = FALSE)

# apply family

# apply, lapply, sapply, by, tapply, aggregate

# apply     : 행렬의 행(MARGIN = 1) 또는 열(MARGIN = 2) 단위로 함수 적용. 결과는 vector (함수가 여러 값을 반환하면 matrix)
# lapply    : 결과를 list로 반환
# sapply    : 결과를 vector 또는 matrix로 반환

s1 <- c(91, 87, 95, 96, 89, 87, 86, 85, 92, 93)
s2 <- c(89, 86, 85, 92, 93, 91, 90, 95, 87, 89)
s3 <- c(89, 86, 78, 99, 95, 87, 89, 86, 85, 92)


# list
score <- list(korean = s1, english = s2, math = s3)
score

## $korean
##  [1] 91 87 95 96 89 87 86 85 92 93
## 
## $english
##  [1] 89 86 85 92 93 91 90 95 87 89
## 
## $math
##  [1] 89 86 78 99 95 87 89 86 85 92

lapply(score, mean)   # ---> list

## $korean
## [1] 90.1
## 
## $english
## [1] 89.7
## 
## $math
## [1] 88.6

sapply(score, mean)   # ---> vector

##  korean english    math 
##    90.1    89.7    88.6

sapply(score, range)

##      korean english math
## [1,]     85      85   78
## [2,]     96      95   99

sapply(score, t.test)

##             korean              english             math               
## statistic   73.5936             89.65021            48.33526           
## parameter   9                   9                   9                  
## p.value     7.986569e-14        1.354942e-14        3.480671e-12       
## conf.int    Numeric,2           Numeric,2           Numeric,2          
## estimate    90.1                89.7                88.6               
## null.value  0                   0                   0                  
## alternative "two.sided"         "two.sided"         "two.sided"        
## method      "One Sample t-test" "One Sample t-test" "One Sample t-test"
## data.name   "X[[i]]"            "X[[i]]"            "X[[i]]"

# Return the smallest and largest value of `x` as a vector whose
# elements are named "min" and "max".
extremes <- function(x) {
    lowest <- min(x)
    highest <- max(x)
    c(min = lowest, max = highest)
}

sapply(score, extremes)

##     korean english math
## min     85      85   78
## max     96      95   99

# matrix
score <- c(s1, s2, s3)
dim(score) <- c(3, 10)
colnames(score) <- c("t1","t2","t3","t4","t5","t6","t7","t8","t9","t10")
rownames(score) <- c("K", "L", "M")
score

##   t1 t2 t3 t4 t5 t6 t7 t8 t9 t10
## K 91 96 86 93 85 91 87 86 95  86
## L 87 89 85 89 92 90 89 78 87  85
## M 95 87 92 86 93 95 89 99 89  92

apply(score, 1, mean)

##    K    L    M 
## 89.6 87.1 91.7

apply(score, 2, max)

##  t1  t2  t3  t4  t5  t6  t7  t8  t9 t10 
##  95  96  92  93  93  95  89  99  95  92

# dataframe
df2 <- data.frame(score=c(s1, s2, s3))
df2$name <- c("K", "L", "M")
df2

##    score name
## 1     91    K
## 2     87    L
## 3     95    M
## 4     96    K
## 5     89    L
## 6     87    M
## 7     86    K
## 8     85    L
## 9     92    M
## 10    93    K
## 11    89    L
## 12    86    M
## 13    85    K
## 14    92    L
## 15    93    M
## 16    91    K
## 17    90    L
## 18    95    M
## 19    87    K
## 20    89    L
## 21    89    M
## 22    86    K
## 23    78    L
## 24    99    M
## 25    95    K
## 26    87    L
## 27    89    M
## 28    86    K
## 29    85    L
## 30    92    M

apply(df2, 1, mean)   # all NA: apply() coerces df2 to a character matrix (the name column is text), so mean() cannot be computed

##  [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [24] NA NA NA NA NA NA NA

by(df2$score, df2$name, mean)

## df2$name: K
## [1] 89.6
## -------------------------------------------------------- 
## df2$name: L
## [1] 87.1
## -------------------------------------------------------- 
## df2$name: M
## [1] 91.7

tapply(df2$score, df2$name, mean)

##    K    L    M 
## 89.6 87.1 91.7

aggregate(score ~ name, data = df2, mean)

##   name score
## 1    K  89.6
## 2    L  87.1
## 3    M  91.7

# apply examples

# 1
pioneers <- c("GAUSS:1777", "BAYES:1702", "PASCAL:1623", "PEARSON:1857")
# Split each "NAME:YEAR" string at the colon. The result is stored as
# `pioneer_parts` rather than `split` so that base::split() is not masked.
pioneer_parts <- strsplit(pioneers, split = ":")
pioneer_parts

## [[1]]
## [1] "GAUSS" "1777" 
## 
## [[2]]
## [1] "BAYES" "1702" 
## 
## [[3]]
## [1] "PASCAL" "1623"  
## 
## [[4]]
## [1] "PEARSON" "1857"

# Take the first piece (the name) of every pair with an anonymous function.
# Stored as `pioneer_names` rather than `names` so base::names() is not masked.
pioneer_names <- lapply(pioneer_parts, function(x) { x[1] })
pioneer_names

## [[1]]
## [1] "GAUSS"
## 
## [[2]]
## [1] "BAYES"
## 
## [[3]]
## [1] "PASCAL"
## 
## [[4]]
## [1] "PEARSON"

# 2
# Return the element(s) of `x` at position `index`.
select_el <- function(x, index) {
    x[index]
}

# Extra arguments to lapply() (here index = 2) are passed on to select_el.
years <- lapply(pioneer_parts, select_el, index = 2)
years

## [[1]]
## [1] "1777"
## 
## [[2]]
## [1] "1702"
## 
## [[3]]
## [1] "1623"
## 
## [[4]]
## [1] "1857"

# 입출력

scan("data/fruits.txt", what = "")

##  [1] "no"     "name"   "price"  "qty"    "1"      "apple"  "500"   
##  [8] "5"      "2"      "orange" "300"    "3"      "3"      "peach" 
## [15] "200"    "7"      "4"      "berry"  "100"    "10"

readLines("data/fruits.txt")

## [1] "no\tname\tprice\tqty" "1\tapple\t500\t5"     "2\torange\t300\t3"   
## [4] "3\tpeach\t200\t7"     "4\tberry\t100\t10"

read.table("data/fruits.txt")

##   V1     V2    V3  V4
## 1 no   name price qty
## 2  1  apple   500   5
## 3  2 orange   300   3
## 4  3  peach   200   7
## 5  4  berry   100  10

# header = TRUE uses the first line of the file as column names.
read.table("data/fruits.txt", header = TRUE)

##   no   name price qty
## 1  1  apple   500   5
## 2  2 orange   300   3
## 3  3  peach   200   7
## 4  4  berry   100  10

# 문자열 합치기, 나누기, Replacement

paste("abc", "xyz")

## [1] "abc xyz"

paste("abc", "xyz", sep = ":")

## [1] "abc:xyz"

substr("123456789", 3, 5)                  # 시작위치, 끝나는 위치

## [1] "345"

substr("123456789", -2, 2)

## [1] "12"

strsplit("2016-04-19", split = "-")

## [[1]]
## [1] "2016" "04"   "19"

d <- readLines("data/alert_log.txt")
d <- gsub(" ", "_", d)                     # gsub: replace every space with "_"
length(d)

## [1] 15908

# Keep only lines longer than 100 characters (nchar: string length).
# Named `long_lines` rather than `c` so the base function c() is not masked.
long_lines <- subset(d, nchar(d) > 100)
length(long_lines)

## [1] 42

# Regular expressions

# regexpr() returns the position of the first match in each element,
# or -1 where there is no match.
regexpr("ORACLE", long_lines)

##  [1] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [24] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## attr(,"match.length")
##  [1] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [24] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## attr(,"useBytes")
## [1] TRUE

# \\d       숫자
# \\D       숫자 아닌 것
# 
# \\s       공백
# \\S       공백 아닌 것
# 
# \\t       Tab
# \\n       new line (enter)
# \\.       dot
# 
# ^         시작 글자
# $         마지막 글자
# .         임의의 문자 1개
# .*        임의의 문자가 0회 이상 반복
# 
# [ab]      a 또는 b
# [^ab]     a와 b 제외한 모든 문자
# [0-9]     모든 숫자
# [A-Z]     영어 대문자
# [a-z]     영어 소문자
# [A-Za-z]  모든 영문자 ([A-z]는 [ \ ] ^ _ ` 문자도 포함하므로 주의)
# 
# i+        i가 최소 1회 이상 나오는 경우
# i*        i가 최소 0회 이상 나오는 경우
# i?        i가 최소 0회에서 최대 1회만 나오는 경우
# i{n}      i가 연속적으로 n회 나오는 경우
# i{n1,n2}  i가 n1에서 n2회 나오는 경우
# i{n,}     i가 n회 이상 나오는 경우
# 
# 아래 POSIX 문자 클래스는 [[:alnum:]] 처럼 대괄호 안에 넣어서 사용한다.
# [:alnum:] 문자와 숫자가 나오는 경우
# [:alpha:] 문자
# [:blank:] 공백 (space, tab)
# [:cntrl:] 제어 문자
# [:digit:] 0 ~ 9
# [:lower:] 소문자
# [:print:] 숫자, 문자, 특수문자, 공백 모두
# [:punct:] 특수문자
# [:space:] 공백문자 (space, tab, 줄바꿈 등)
# [:upper:] 대문자
# [:xdigit:]    16진수

grep("ORACLE", d)                # vector에서 특정 패턴을 찾아 index 출력

##   [1]     2    69   141   208   455   521   697   762   944  1009  1197
##  [12]  1262  1354  1419  1490  1555  1625  1690  1752  1818  1908  1973
##  [23]  2059  2124  2239  2307  2421  2489  2636  2704  2890  2958  3058
##  [34]  3126  3773  3841  3957  4025  4136  4204  4281  4297  4356  4424
##  [45]  4523  4591  4634  4702  4770  4838  4898  4930  4952  5020  5288
##  [56]  5356  5473  5541  5618  5686  5830  5898  6008  6076  6158  6226
##  [67]  6329  6373  6441  6528  6596  6706  6774  6885  6953  7067  7133
##  [78]  7349  7415  7540  7608  7617  7638  7706  7851  7919  8001  8047
##  [89]  8115  8669  8684 11993 12061 12257 12325 12510 12578 12695 12761
## [100] 12989 13057 13170 13238 13360 13426 13644 13712 13827 13893 14118
## [111] 14186 14378 14446 14505 14573 14706 14774 14944 15012 15499 15567
## [122] 15829

grep("ORACLE_instance", d, value = TRUE)  # return the matching elements instead of their indices

##  [1] "Starting_ORACLE_instance_(normal)"  
##  [2] "Starting_ORACLE_instance_(normal)"  
##  [3] "Starting_ORACLE_instance_(restrict)"
##  [4] "Starting_ORACLE_instance_(normal)"  
##  [5] "Starting_ORACLE_instance_(normal)"  
##  [6] "Starting_ORACLE_instance_(normal)"  
##  [7] "Starting_ORACLE_instance_(normal)"  
##  [8] "Starting_ORACLE_instance_(normal)"  
##  [9] "Starting_ORACLE_instance_(normal)"  
## [10] "Starting_ORACLE_instance_(normal)"  
## [11] "Starting_ORACLE_instance_(normal)"  
## [12] "Starting_ORACLE_instance_(normal)"  
## [13] "Starting_ORACLE_instance_(normal)"  
## [14] "Starting_ORACLE_instance_(normal)"  
## [15] "Starting_ORACLE_instance_(normal)"  
## [16] "Starting_ORACLE_instance_(normal)"  
## [17] "Starting_ORACLE_instance_(normal)"  
## [18] "Starting_ORACLE_instance_(normal)"  
## [19] "Starting_ORACLE_instance_(normal)"  
## [20] "Starting_ORACLE_instance_(normal)"  
## [21] "Starting_ORACLE_instance_(normal)"  
## [22] "Starting_ORACLE_instance_(normal)"  
## [23] "Starting_ORACLE_instance_(normal)"  
## [24] "Starting_ORACLE_instance_(normal)"  
## [25] "Starting_ORACLE_instance_(normal)"  
## [26] "Starting_ORACLE_instance_(normal)"  
## [27] "Starting_ORACLE_instance_(normal)"  
## [28] "Starting_ORACLE_instance_(normal)"  
## [29] "Starting_ORACLE_instance_(normal)"  
## [30] "Starting_ORACLE_instance_(normal)"  
## [31] "Starting_ORACLE_instance_(normal)"  
## [32] "Starting_ORACLE_instance_(normal)"  
## [33] "Starting_ORACLE_instance_(normal)"  
## [34] "Starting_ORACLE_instance_(normal)"  
## [35] "Starting_ORACLE_instance_(normal)"  
## [36] "Starting_ORACLE_instance_(normal)"  
## [37] "Starting_ORACLE_instance_(normal)"  
## [38] "Starting_ORACLE_instance_(normal)"  
## [39] "Starting_ORACLE_instance_(normal)"  
## [40] "Starting_ORACLE_instance_(normal)"  
## [41] "Starting_ORACLE_instance_(normal)"  
## [42] "Starting_ORACLE_instance_(normal)"  
## [43] "Starting_ORACLE_instance_(normal)"  
## [44] "Starting_ORACLE_instance_(normal)"  
## [45] "Starting_ORACLE_instance_(normal)"  
## [46] "Starting_ORACLE_instance_(normal)"  
## [47] "Starting_ORACLE_instance_(normal)"  
## [48] "Starting_ORACLE_instance_(normal)"  
## [49] "Starting_ORACLE_instance_(normal)"  
## [50] "Starting_ORACLE_instance_(normal)"  
## [51] "Starting_ORACLE_instance_(normal)"  
## [52] "Starting_ORACLE_instance_(normal)"  
## [53] "Starting_ORACLE_instance_(normal)"  
## [54] "Starting_ORACLE_instance_(normal)"  
## [55] "Starting_ORACLE_instance_(normal)"  
## [56] "Starting_ORACLE_instance_(normal)"  
## [57] "Starting_ORACLE_instance_(normal)"  
## [58] "Starting_ORACLE_instance_(normal)"  
## [59] "Starting_ORACLE_instance_(normal)"  
## [60] "Starting_ORACLE_instance_(normal)"  
## [61] "Starting_ORACLE_instance_(normal)"

grep("^Setting", d)              # ^ : 첫글자

##  [1]   322   808   809   897   898  1068  1069  1089  1090  1142  1143
## [12]  1606  1607  2012  2013  2160  2161  2181  2182  3264  3265  5154
## [23]  7223  8216  8217 12131 12132 12895 13547 13994 15118 15119 15195
## [34] 15196 15701 15702 15867 15868

grep("ing$", d)                  # $ : 마지막 글자

##  [1]   334  2369  2373  4283  4299  5173  7253  8671  8686 12906 13558
## [12] 14023

emails <- c("john.doe@ivyleague.edu", "education@world.gov", "dalai.lama@peace.org",
            "invalid.edu", "quant@bigdatacollege.edu", "cookie.monster@sesame.tv")

grepl("@.*\\.edu$", emails)     # logical vector: TRUE where the pattern matches

## [1]  TRUE FALSE FALSE FALSE  TRUE FALSE

grep("@.*\\.edu$", emails)      # indices of the matching elements

## [1] 1 5

# Pattern: an "@" followed by anything, ending in ".edu".
hits <- grep("@.*\\.edu$", emails)
emails[hits]

## [1] "john.doe@ivyleague.edu"   "quant@bigdatacollege.edu"

# Replace everything from "@" through the trailing ".edu" with a new domain;
# elements that do not match are returned unchanged.
gsub(pattern = "@.*\\.edu$", replacement = "@datacamp.edu", emails)

## [1] "john.doe@datacamp.edu"    "education@world.gov"     
## [3] "dalai.lama@peace.org"     "invalid.edu"             
## [5] "quant@datacamp.edu"       "cookie.monster@sesame.tv"

R Basics & 정규표현식

woosa7