[문제216] 문재인 대통령의취임사 감성분석하기
스크롤링
library(rvest)
html <- read_html("https://www1.president.go.kr/articles/517")
text <- html_nodes(html,xpath='//*[@id="cont_view"]/div/div/div[1]/div[3]/div')%>%html_text(trim=T)
text <- text[text != ""]
text <- paste(text,collapse = ' ')
KoNLP 설치
#1. JAVA_HOME 위치확인
Sys.getenv('JAVA_HOME')
"C:/Program Files/Java/jdk1.8.0_271"
#2.rJAVA 설치
Sys.setenv(JAVA_HOME="C:/Program Files/Java/jdk1.8.0_271")
install.packages('rJava')
library(rJava)
#3.konlp 프로그램과 관련있는 프로그램을 설치
install.packages(c('stringr', 'hash', 'tau', 'Sejong', 'RSQLite', 'devtools'), type = "binary")
install.packages('remotes')
#4. konlp 설치
remotes::install_github('haven-jeon/KoNLP', upgrade = "never", INSTALL_opts=c("--no-multiarch"))
library(KoNLP) #설치오류(first~) : openNLP, NLP 지우고 KoNLP 설치 또는 그냥 껐다 키기
useSejongDic()
useNIADic()
+) stringr 오류날 때
Sys.setlocale("LC_CTYPE", ".1251") #[가-힣] 오류났을 때
Sys.setlocale("LC_ALL") #한글 깨질 때
word 분류
extractNoun(text)
pos <- SimplePos09(text)
str_match(pos,'([가-힣]+)/N')[,2]
#[가-힣] 오류날 때 : str_match(pos,'([\uac00-\ud7a3]+)/N')[,2]
as.vector(na.omit(str_match(pos,'([가-힣]+)/N')[,2]))
#[가-힣] 오류날 때 : as.vector(na.omit(substr(pos,1,str_locate(pos,'/N')-1)))
nn.words <- function(doc){
doc <- as.character(doc)
pos <- SimplePos09(doc)
as.vector(na.omit(str_match(pos,'([가-힣]+)/N')[,2]))
#[가-힣] 오류날 때 : as.vector(na.omit(substr(pos,1,str_locate(pos,'/N')-1)))
}
word <- nn.words(text)
전처리
#두글자 이상인 단어만 추출
word[nchar(word)>=2]
word <- Filter(function(x) nchar(x) >=2,word)
#불용어 단어 제외(불용어 사전 만들기)
grep('대통령',word,value=T)
grep('문재인',word,value=T)
stop_words <- c("문재인","대통령","문재인정부","대통령의","대통령선거")
word_new <- c()
for(i in word){
if(! i %in% stop_words){
word_new <- c(word_new,i)
}
}
시각화
library(wordcloud2)
wordcloud2(table(word_new))
명사를 기준으로 말뭉치 생성
word_text <- paste(word_new,collapse = ' ')
install.packages("tm")
library(tm)
corpus <- VCorpus(VectorSource(word_text))
lapply(corpus, content)
corpus_dtm <- DocumentTermMatrix(corpus)
inspect(corpus_dtm)
termfreq <- colSums(as.matrix(corpus_dtm))
freq_df <- data.frame(termfreq)
head(freq_df)
freq_df$word <- rownames(freq_df)
rownames(freq_df) <- NULL
head(freq_df)
freq_df <- freq_df[,c(2,1)]
wordcloud2(freq_df)
Document Term Matrix 생성
moon_corpus <- VCorpus(VectorSource(text))
moon_corpus[[1]]$content
lapply(moon_corpus,content)
moon_dtm <- DocumentTermMatrix(moon_corpus,
control = list(tokenize=nn.words,
wordLengths=c(2,Inf),
stopwords=stop_words,
removeNumbers=TRUE,
removePunctuation=TRUE))
head(sort(colSums(as.matrix(moon_dtm)),decreasing = T))
Terms(moon_dtm)
moon_termfreq <- colSums(as.matrix(moon_dtm))
moon_df <- data.frame(word=names(moon_termfreq),freq=moon_termfreq)
head(moon_df)
library(wordcloud2)
wordcloud2(moon_df)
군산대 감성사전으로 감성분석
Sys.setlocale("LC_ALL") #한글 깨질 때
#긍정감성, 부정감성 가져오기
pos <- read.csv("c:/data/pos_pol_word.txt",encoding = 'UTF-8',header = F)
neg <- read.csv("c:/data/neg_pol_word.txt",encoding = 'UTF-8',header = F)
names(pos) <- "word"
names(neg) <- "word"
pos$sentiment <- '긍정'
neg$sentiment <- '부정'
k_sentiment_dic <- rbind(pos,neg)
head(k_sentiment_dic)
tail(k_sentiment_dic)
write.csv(k_sentiment_dic,file="c:/data/k_sentiment_dic.txt")
moon_sentiment <- merge(moon_df,k_sentiment_dic,by='word')
head(moon_sentiment)
aggregate(word~sentiment,moon_sentiment,length)
+)단어를 기반으로
moon_word_dtm <- DocumentTermMatrix(moon_corpus)
inspect(moon_word_dtm)
moon_wordfreq <- colSums(as.matrix(moon_word_dtm))
moon_word_df <- data.frame(word=names(moon_wordfreq),freq=moon_wordfreq)
moon_word_sentiment <- merge(moon_word_df,k_sentiment_dic,by='word')
#k_sentiment_dic : 위에서 저장한 감성사전
head(moon_word_sentiment)
aggregate(word~sentiment,moon_word_sentiment,length)
'R' 카테고리의 다른 글
[R] 머신러닝 - Naive Bayes로 학습 후 분류 (0) | 2022.02.18 |
---|---|
[R] ngram, 토근화 (0) | 2022.02.18 |
[R] 자연어 처리 - NLP (0) | 2022.02.16 |
[R] 감성분석 (0) | 2022.02.16 |
[R] text mining (0) | 2022.02.15 |