-
Notifications
You must be signed in to change notification settings - Fork 5
/
main.R
124 lines (103 loc) · 4.55 KB
/
main.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# REQUIRED LIBRARIES
library(wordcloud)
library(lubridate)
library(rvest)
library(tm)
library(tidyverse)
library(plotly)
# READ DATA
# Path (relative to the working directory) of the exported search-activity
# HTML page.
fileHTML <- "Takeout/My Activity/Search/MyActivity.html"
# Parse the page as UTF-8 HTML; the scraping steps below all query this
# parsed document.
mySearchFile <- fileHTML %>%
  read_html(encoding = "UTF-8")
# SCRAPING SEARCH DATE AND TIME
# Every activity entry <div> is serialised back to its HTML source, and the
# timestamp is taken as the text following a <br> up to and including the
# AM/PM marker. mdy_hms() then parses it; entries where the pattern does not
# match come out as NA.
dateSearch <- mySearchFile %>%
  html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>%
  # Coerce explicitly: str_extract() expects a character vector, and recent
  # stringr releases reject an xml_nodeset that is passed in directly.
  as.character() %>%
  str_extract(pattern = "(?<=<br>)(.*)(?<=PM|AM)") %>%
  mdy_hms()
# Preview the first entries; head() does not pad with NA when fewer than
# five entries were scraped.
head(dateSearch, 5)
# SCRAPING SEARCH TEXT
# The searched text is the content of the entry's link: first isolate what
# lies between "<a" and "</a>", then keep only the part after the closing
# '">' of the opening tag. Entries without a link yield NA.
textSearch <- mySearchFile %>%
  html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>%
  # Coerce explicitly: str_extract() expects a character vector, and recent
  # stringr releases reject an xml_nodeset that is passed in directly.
  as.character() %>%
  str_extract(pattern = '(?<=<a)(.*)(?=</a>)') %>%
  str_extract(pattern = '(?<=\">)(.*)')
# Preview the first entries; head() does not pad with NA when fewer than
# five entries were scraped.
head(textSearch, 5)
# SCRAPING SEARCH TYPE
# The activity type is the first word of the text that sits between the
# 'mdl-typography--body-1">' class marker and the entry's link. Entries that
# do not match either pattern yield NA.
searchType <- mySearchFile %>%
  html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>%
  # Coerce explicitly: str_extract() expects a character vector, and recent
  # stringr releases reject an xml_nodeset that is passed in directly.
  as.character() %>%
  str_extract(pattern = "(?<=mdl-typography--body-1\">)(.*)(?=<a)") %>%
  str_extract(pattern = "(\\w+)(?=\\s)")
# Preview the first entries; head() does not pad with NA when fewer than
# five entries were scraped.
head(searchType, 5)
# CREATE DATA FRAME USING SCRAPED DATA
# One row per scraped entry, with the timestamp broken out into the calendar
# parts used by the plots below.
#
# English weekday names, Sunday first. They are looked up from the numeric
# day of the week instead of weekdays(), whose output follows the session
# locale: in a non-English locale none of the names would match these factor
# levels, every `day` would become NA, and na.omit() would drop all rows.
weekdayLevels <- c("Sunday", "Monday", "Tuesday", "Wednesday",
                   "Thursday", "Friday", "Saturday")
searchedData <- tibble(
  timestamp = dateSearch,
  date = as_date(dateSearch),
  year = year(dateSearch),
  month = month(dateSearch, label = TRUE),
  # week_start = 7 pins Sunday to index 1 regardless of the
  # lubridate.week.start option.
  day = weekdayLevels[wday(dateSearch, week_start = 7)],
  hour = hour(dateSearch),
  type = searchType,
  search = textSearch
)
# Fix the level order so weekday plots run Sunday..Saturday, not alphabetical.
searchedData$day <- factor(searchedData$day, levels = weekdayLevels)
# Drop entries where any field could not be scraped or parsed.
searchedData <- na.omit(searchedData)
head(searchedData)
# PLOT SEARCH VOLUME BY YEAR
# Bar chart with one bar per year; each bar's fill follows its own count on
# a yellow-to-red gradient.
searchByYear <- ggplot(
  data = searchedData,
  mapping = aes(x = year, fill = ..count..)
) +
  geom_bar(width = 0.7) +
  scale_fill_gradient(low = "yellow", high = "red") +
  labs(
    title = "How much your search frequency has changed over time",
    subtitle = "Search activity by year",
    x = "Year",
    y = "Count"
  )
searchByYear
# Interactive rendering of the chart built above.
ggplotly(searchByYear)
# PLOT SEARCH VOLUME BY MONTH
# Keep only the years 2008 through 2020 (the bounds are hard-coded).
searchByMonth <- searchedData[
  searchedData$year >= 2008 & searchedData$year <= 2020,
]
# One facet per year, one bar per month inside it. The plot-level x = year
# mapping is overridden by the layer's x = month; the count-based fill is
# inherited by the layer.
ggplot(data = searchByMonth, mapping = aes(x = year, fill = ..count..)) +
  geom_bar(mapping = aes(x = month, group = year)) +
  facet_grid(. ~ year, scales = "free") +
  scale_fill_gradient(low = "yellow", high = "red") +
  labs(
    title = "How much your search frequency has changed over time",
    subtitle = "Month activity on detail",
    x = "Year / Month",
    y = "Count"
  ) +
  # Rotate the month labels so they fit inside the narrow facets.
  theme(axis.text.x = element_text(angle = 90))
# Interactive rendering of the most recently built plot (the one above).
ggplotly()
# PLOT SEARCH VOLUME BY HOUR
# Bar chart with one bar per hour of the day; each bar's fill follows its own
# count on a yellow-to-red gradient.
seearchByHour <- ggplot(
  data = searchedData,
  mapping = aes(x = hour, fill = ..count..)
) +
  geom_bar() +
  scale_fill_gradient(low = "yellow", high = "red") +
  labs(
    title = "What time of day do you have the highest frequency of searches?",
    subtitle = "Hour activity on detail",
    x = "Hour",
    y = "Count"
  )
seearchByHour
# Interactive rendering of the chart built above.
ggplotly(seearchByHour)
# PLOT SEARCH VOLUME BY WEEKDAY
# Bar chart with one bar per weekday (in the factor's level order); each
# bar's fill follows its own count on a yellow-to-red gradient.
seearchByWeekD <- ggplot(
  data = searchedData,
  mapping = aes(x = day, fill = ..count..)
) +
  geom_bar() +
  scale_fill_gradient(low = "yellow", high = "red") +
  labs(
    title = "What day of the week do you have the highest frequency of searches?",
    subtitle = "Weekday activity on detail",
    x = "Day",
    y = "Count"
  )
seearchByWeekD
# Interactive rendering of the chart built above.
ggplotly(seearchByWeekD)
# PLOT SEARCH VOLUME BY WEEKDAY AND TIME
# One facet per weekday, one bar per hour inside it; each bar's fill follows
# its own count on a yellow-to-red gradient.
searchWdayTime <- ggplot(data = searchedData) +
  geom_bar(mapping = aes(x = hour, group = day, fill = ..count..)) +
  facet_grid(. ~ day, scales = "free") +
  scale_fill_gradient(low = "yellow", high = "red") +
  labs(
    title = "Relationship between day / time you have a higher frequency of searches",
    subtitle = "Weekday/Time activity on detail",
    x = "Hour / Day",
    y = "Count"
  )
searchWdayTime
# Interactive rendering of the chart built above.
ggplotly(searchWdayTime)
# CLEAN AND EXTRACT TEXT TO CREATE A TEXT CORPUS
# NOTE(review): despite the name, this window is fixed to the years 2008 and
# 2009 rather than computed from the most recent data.
lastTwoYears <- searchedData[
  searchedData$year > 2007 & searchedData$year < 2010,
]
# Normalise the search strings: lower-case them, blank out URLs, #tags,
# @mentions, newlines and double quotes, then blank out ".com"-style
# fragments and every remaining non-alphanumeric character.
search <- lastTwoYears$search %>%
  tolower() %>%
  gsub('(http|https)\\S+\\s*|(#|@)\\S+\\s*|\\n|\\"', " ", .) %>%
  gsub("(.*.)\\.com(.*.)\\S+\\s|[^[:alnum:]]", " ", .) %>%
  trimws()
# Build the corpus and strip punctuation and digits from every document.
textCorpus <- search %>%
  VectorSource() %>%
  Corpus() %>%
  tm_map(content_transformer(removePunctuation)) %>%
  tm_map(content_transformer(removeNumbers))
# English stop words plus a handful of extra (mostly Spanish) filler words.
# Note that this vector deliberately keeps the same name as the stopwords()
# function it is built from.
stopwords <- c(
  stopwords("english"),
  "que", "com", "cómo", "como", "para", "con", "qué", "las", "los", "del",
  "can"
)
textCorpus <- tm_map(textCorpus, removeWords, stopwords)
# Term-by-document counts, expanded to a dense matrix for rowSums() below.
searchTDM <- TermDocumentMatrix(textCorpus)
searchMatrix <- as.matrix(searchTDM)
# CREATE DATA FRAME WITH WORDS
# Total occurrences of each term across all documents, most frequent first.
arrange <- searchMatrix %>%
  rowSums() %>%
  sort(decreasing = TRUE)
twNames <- names(arrange)
dataCloud <- data.frame(word = twNames, freq = arrange)
# Draw at most 100 words, and only those seen at least 40 times.
wordcloud(
  words = dataCloud$word,
  freq = dataCloud$freq,
  min.freq = 40,
  scale = c(2, 0.5),
  max.words = 100,
  colors = brewer.pal(9, "Paired")
)