-
Notifications
You must be signed in to change notification settings - Fork 0
/
gutenberg_7.r
65 lines (55 loc) · 1.93 KB
/
gutenberg_7.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
library("gutenbergr")
library(tidytext)
library(magrittr)
library(ggplot2)
library(dplyr)
library(stringr)
# Download every Project Gutenberg work by Frank M. Robinson.
# The ids are taken from the gutenberg_id column by name rather than by
# position, so the lookup does not depend on column order.
books_id <- gutenberg_works(author == "Robinson, Frank M.")[["gutenberg_id"]]
books_tbl <- gutenberg_download(books_id, meta_fields = "title")

# Line-numbered copy of the raw text, tokenised by line into column `word`
# with the original casing kept.
# seq_len() is used instead of 1:nrow() so an empty download yields an empty
# table instead of a length-mismatch error.
# NOTE(review): this object is not used by the rest of the visible script and
# its name shadows the tidytext package; it is kept for compatibility only.
tidytext <- tibble(line = seq_len(nrow(books_tbl)), text = books_tbl$text) %>%
  unnest_lines(word, text, to_lower = FALSE)
# Separate chapters.
# Within each title, the chapter number is a running count of heading lines:
# "CHAPTER <roman numeral>", "CHAPTER <digits>", or a line that starts with a
# roman numeral followed by a period. Lines before the first heading of a
# title therefore get chapter 0.
chapters <- books_tbl %>%
  group_by(title) %>%
  mutate(
    chapter = cumsum(
      str_detect(text, regex("CHAPTER [ILVX]+|CHAPTER [1-9]+|^[ILVX]+\\."))
    )
  ) %>%
  group_by(title, chapter) %>%
  # Keep only chapters with more than 50 lines; shorter runs are discarded.
  filter(n() > 50) %>%
  # Collapse every remaining chapter into one space-separated string.
  summarise(text = paste0(text, collapse = " ")) %>%
  # summarise() only peels off the innermost grouping level; drop the rest so
  # the result is a plain one-row-per-chapter table.
  ungroup()
# Unigrams: count every word per (title, chapter), then drop stop words.
# The token column is renamed with rename(); the previous separate() call
# needs tidyr, which this script never loads, and for single-word tokens it
# only renamed the column anyway.
unigrams_per_chapter <- chapters %>%
  unnest_tokens(unigram, text, token = "ngrams", n = 1) %>%
  group_by(title, chapter) %>%
  count(unigram) %>%
  rename(word = unigram) %>%
  filter(!word %in% stop_words$word)

# Relative frequency and rank of each word within its chapter.
#   freq: share of the chapter's retained word occurrences (n / sum(n));
#         dividing by n() would divide by the number of distinct words.
#   rank: 1 for the most frequent word; ties are broken by row order.
#         Ranks start at 1 because the plot puts rank on a log10 axis, where
#         a rank of 0 is non-finite and the point is dropped.
# The result stays grouped by (title, chapter), as before.
freq_rank_uni_per_chapter <- unigrams_per_chapter %>%
  group_by(title, chapter) %>%
  mutate(freq = n / sum(n), rank = row_number(desc(n))) %>%
  arrange(title, chapter, rank)
# Bigrams: count every two-word sequence per (title, chapter), split it into
# its two words, and drop pairs in which either word is a stop word.
# The split uses stringr (already loaded); the previous separate() call needs
# tidyr, which this script never loads. The select() reproduces the column
# layout separate() produced: the combined bigram column is removed and
# word_1 / word_2 sit before the count.
bigrams_per_chapter <- chapters %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  group_by(title, chapter) %>%
  count(bigram) %>%
  mutate(
    word_1 = str_extract(bigram, "^\\S+"),
    word_2 = str_extract(bigram, "\\S+$")
  ) %>%
  select(title, chapter, word_1, word_2, n) %>%
  filter(
    !word_1 %in% stop_words$word,
    !word_2 %in% stop_words$word
  )

# Relative frequency and rank of each bigram within its chapter.
#   freq: share of the chapter's retained bigram occurrences (n / sum(n));
#         dividing by n() would divide by the number of distinct bigrams.
#   rank: 1 for the most frequent bigram; ties are broken by row order.
#         Ranks start at 1 because the plot puts rank on a log10 axis, where
#         a rank of 0 is non-finite and the point is dropped.
# The result stays grouped by (title, chapter), as before.
freq_rank_bi_per_chapter <- bigrams_per_chapter %>%
  group_by(title, chapter) %>%
  mutate(freq = n / sum(n), rank = row_number(desc(n))) %>%
  arrange(title, chapter, rank)
# Stack the bigram and unigram rank/frequency tables into one long table,
# tagging each row with the kind of n-gram it came from.
# Both inputs are ungrouped first and combined with bind_rows(): rbind() on
# grouped tibbles can carry over stale group metadata, and the title column
# (a grouping variable in the inputs) is modified right afterwards.
freqs_by_chapter <- bind_rows(
  freq_rank_bi_per_chapter %>%
    ungroup() %>%
    select(rank, freq, title, chapter) %>%
    mutate(ngram = "bigram"),
  freq_rank_uni_per_chapter %>%
    ungroup() %>%
    select(rank, freq, title, chapter) %>%
    mutate(ngram = "unigram")
) %>%
  # Strip leading/trailing whitespace from titles before they are used as
  # facet labels.
  mutate(title = str_trim(title))
# Draw the plot: frequency against rank as points on log10-log10 axes,
# coloured by n-gram type, with one panel per title.
ggplot(
  data = freqs_by_chapter,
  mapping = aes(x = rank, y = freq, colour = ngram)
) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~title)