각 문서(행)에서 20개 토픽(열)이 각각 차지하는 비율을 담은 Document-Topic 행렬.
# Load the saved R data file; the next statement expects it to provide an
# object named `smpl_lda_df`.
load("data/smpl_lda_df.RData")
# Work with a shorter alias for the loaded object.
df <- smpl_lda_df
# Preview the first rows.
head(df)
## Topic_1 Topic_2 Topic_3 Topic_4 Topic_5 Topic_6 Topic_7 Topic_8
## Doc_1 0 0.0000000 0 0 0 0 0 1.0000000
## Doc_2 0 0.0000000 0 0 0 0 0 0.0000000
## Doc_3 0 0.0000000 0 0 0 0 0 0.0000000
## Doc_4 0 0.0000000 0 0 0 0 0 0.3333333
## Doc_5 0 0.0000000 0 0 0 0 0 0.0000000
## Doc_6 0 0.6666667 0 0 0 0 0 0.0000000
## Topic_9 Topic_10 Topic_11 Topic_12 Topic_13 Topic_14 Topic_15
## Doc_1 0 0.0000000 0 0.0000000 0 0 0
## Doc_2 0 0.0000000 0 0.0000000 0 1 0
## Doc_3 1 0.0000000 0 0.0000000 0 0 0
## Doc_4 0 0.3333333 0 0.0000000 0 0 0
## Doc_5 1 0.0000000 0 0.0000000 0 0 0
## Doc_6 0 0.0000000 0 0.3333333 0 0 0
## Topic_16 Topic_17 Topic_18 Topic_19 Topic_20
## Doc_1 0 0 0 0.0000000 0
## Doc_2 0 0 0 0.0000000 0
## Doc_3 0 0 0 0.0000000 0
## Doc_4 0 0 0 0.3333333 0
## Doc_5 0 0 0 0.0000000 0
## Doc_6 0 0 0 0.0000000 0
# Report the dimensions of df (number of rows, number of columns).
dim(df)
## [1] 143 20
# Count how many entries of `x` are strictly greater than zero.
#
# x: a vector of values (e.g. one row of topic proportions).
# Returns a single integer; NA entries are not counted.
fCountTopic <- function(x) {
  is_positive <- x > 0
  # Summing a logical vector counts its TRUE values; NAs are dropped so they
  # are ignored rather than propagated.
  sum(is_positive, na.rm = TRUE)
}
# Build a space-separated string of the names of the entries of `x` that are
# strictly greater than zero.
#
# x: a named vector of values (e.g. one row of topic proportions, named by
#    topic).
# Returns a length-one character vector; it is "" when no entry is positive
# or when `x` carries no names. NA entries are skipped.
fTopicNames <- function(x) {
  # which() keeps the names of the selected elements, so they can be read
  # directly without first converting the index vector to a matrix.
  positive_names <- names(which(x > 0))
  paste(positive_names, collapse = " ")
}
# Locate the topic-proportion columns by their "Topic_" name prefix instead of
# by fixed positions, so the script does not depend on there being exactly 20
# topics in the first 20 columns.
topic_cols <- grep("^Topic_", names(df))
# drop = FALSE keeps a data frame even if only one topic column matches.
topic_props <- df[, topic_cols, drop = FALSE]
# For each document (row): the number of topics with a positive proportion ...
df$Ntopic <- apply(topic_props, 1, fCountTopic)
# ... and the names of those topics joined by spaces.
df$Tname <- apply(topic_props, 1, fTopicNames)
# Pick the two derived columns by name rather than by positions 21:22.
answer <- df[c("Ntopic", "Tname")]
head(answer)
## Ntopic Tname
## Doc_1 1 Topic_8
## Doc_2 1 Topic_14
## Doc_3 1 Topic_9
## Doc_4 3 Topic_8 Topic_10 Topic_19
## Doc_5 1 Topic_9
## Doc_6 2 Topic_2 Topic_12