In this project, we will compare bigrams, word frequencies, and sentiment in Twitter messages from Donald Trump and Barack Obama. First we will load the required libraries.
library(dplyr)
library(knitr)
library(purrr)
library(twitteR)
library(tidytext)
library(stringr)
library(wordcloud2)
library(tidyr)
library(ggplot2)
library(igraph)
library(ggraph)
To set up authentication for the twitteR package, please see R Bloggers.
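If you have not authenticated yet, twitteR provides setup_twitter_oauth(); the sketch below assumes you have already created a Twitter app, and the four credential values are placeholders to replace with your own keys.

# Placeholder credentials -- replace with the keys from your own Twitter app
consumer_key    <- "YOUR_CONSUMER_KEY"
consumer_secret <- "YOUR_CONSUMER_SECRET"
access_token    <- "YOUR_ACCESS_TOKEN"
access_secret   <- "YOUR_ACCESS_SECRET"
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)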
Now we’ll request the last 3,200 tweets from each of Trump and Obama. Note that fewer tweets will actually be returned, since retweets are excluded.
trump_tweets <- userTimeline("realDonaldTrump", n=3200)
trump_tweets_df <- tbl_df(map_df(trump_tweets, as.data.frame))
obama_tweets <- userTimeline("BarackObama", n=3200)
obama_tweets_df <- tbl_df(map_df(obama_tweets, as.data.frame))
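As a quick sanity check (an extra step, not required for the analysis), we can confirm how many tweets came back once retweets are dropped and that the text and created columns we rely on below are present.

# Number of tweets actually returned for each account
nrow(trump_tweets_df)
nrow(obama_tweets_df)
# The columns used in the analysis below
trump_tweets_df %>% select(text, created) %>% head()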
First we’ll look at bigrams contained in Trump’s Twitter messages.
count_bigrams <- function(dataset) {
  dataset %>%
    # drop manually quoted retweets (tweets starting with a quotation mark),
    # then strip t.co links and the HTML entity &amp;
    filter(!str_detect(text, '^"')) %>%
    mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>%
    count(word1, word2, sort = TRUE)
}
visualize_bigrams <- function(bigrams) {
  set.seed(5)
  a <- grid::arrow(type = "closed", length = unit(0.1, "inches"))

  bigrams %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    # constant aesthetics belong outside aes()
    geom_edge_link(edge_colour = "blue", show.legend = FALSE, arrow = a) +
    geom_node_point(color = "lightblue", size = 16) +
    geom_node_text(aes(label = name), size = 5, repel = TRUE) +
    theme_void() +
    theme(legend.position = "none")
}
trump_tweets_df %>% count_bigrams() %>% head(50) %>% visualize_bigrams()
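To see the raw counts behind the graph, we can also print the top bigrams as a table; this is just an optional inspection step.

# Top ten Trump bigrams, shown as a table rather than a graph
trump_tweets_df %>% count_bigrams() %>% head(10) %>% kable()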
Now we’ll count word frequencies and create a word cloud.
tweets <- trump_tweets_df
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"

tweet_words <- tweets %>%
  filter(!str_detect(text, '^"')) %>%
  mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  filter(!word %in% stop_words$word,
         str_detect(word, "[a-z]"))

# wordcloud2 expects a data frame with word and freq columns
tweet_words <- tweet_words %>%
  group_by(word) %>%
  tally() %>%
  mutate(freq = n) %>%
  select(word, freq) %>%
  arrange(desc(freq))
tweet_words %>% head(20) %>% kable(row.names=TRUE)
| | word | freq |
|---|---|---|
| 1 | fake | 47 |
| 2 | news | 47 |
| 3 | media | 44 |
| 4 | people | 42 |
| 5 | jobs | 39 |
| 6 | america | 37 |
| 7 | obamacare | 36 |
| 8 | time | 35 |
| 9 | election | 33 |
| 10 | country | 30 |
| 11 | democrats | 30 |
| 12 | bad | 28 |
| 13 | trump | 28 |
| 14 | american | 27 |
| 15 | healthcare | 27 |
| 16 | join | 26 |
| 17 | president | 26 |
| 18 | russia | 25 |
| 19 | honor | 23 |
| 20 | watch | 22 |
wordcloud2(tweet_words, size = 0.6)
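Since wordcloud2() returns an htmlwidget, a copy of the cloud can be saved to a standalone HTML file if you want to share it outside the notebook; the file name here is just an example.

# Optional: save the interactive cloud as a self-contained HTML file
htmlwidgets::saveWidget(wordcloud2(tweet_words, size = 0.6),
                        "trump_wordcloud.html", selfcontained = TRUE)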
Now we’ll analyze sentiment using the Bing lexicon.
trump_sentiment <- tweet_words %>%
  inner_join(get_sentiments("bing"))

trump_sentiment %>%
  group_by(sentiment) %>%
  top_n(10, freq) %>%
  arrange(freq) %>%
  ungroup() %>%
  mutate(word = reorder(word, freq)) %>%
  ggplot(aes(word, freq, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
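A quick way to summarize the plot is to total the word frequencies by sentiment; this small extra step uses only the trump_sentiment table created above.

# Total positive vs. negative word occurrences in Trump's tweets
trump_sentiment %>% count(sentiment, wt = freq)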
Now we’ll look at bigrams contained in Obama’s Twitter messages.
We reuse the count_bigrams() and visualize_bigrams() functions defined above.
obama_tweets_df %>% count_bigrams() %>% head(50) %>% visualize_bigrams()
Now we’ll count word frequencies and create a word cloud.
tweets <- obama_tweets_df
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"

tweet_words <- tweets %>%
  filter(!str_detect(text, '^"')) %>%
  mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  filter(!word %in% stop_words$word,
         str_detect(word, "[a-z]"))

# wordcloud2 expects a data frame with word and freq columns
tweet_words <- tweet_words %>%
  group_by(word) %>%
  tally() %>%
  mutate(freq = n) %>%
  select(word, freq) %>%
  arrange(desc(freq))
tweet_words %>% head(20) %>% kable(row.names=TRUE)
| | word | freq |
|---|---|---|
| 1 | senate | 68 |
| 2 | #doyourjob | 61 |
| 3 | leaders | 57 |
| 4 | president | 50 |
| 5 | obama | 48 |
| 6 | court | 43 |
| 7 | supreme | 42 |
| 8 | #actonclimate | 40 |
| 9 | climate | 39 |
| 10 | change | 35 |
| 11 | judge | 33 |
| 12 | garland | 27 |
| 13 | hearing | 25 |
| 14 | add | 21 |
| 15 | economy | 20 |
| 16 | progress | 20 |
| 17 | agree | 19 |
| 18 | join | 19 |
| 19 | fair | 18 |
| 20 | @ofa | 18 |
wordcloud2(tweet_words, size = 0.6)
Again we’ll analyze sentiment using the Bing lexicon.
obama_sentiment <- tweet_words %>%
  inner_join(get_sentiments("bing"))

obama_sentiment %>%
  group_by(sentiment) %>%
  top_n(10, freq) %>%
  arrange(freq) %>%
  ungroup() %>%
  mutate(word = reorder(word, freq)) %>%
  ggplot(aes(word, freq, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
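Finally, since the point of the project is to compare the two accounts, here is a sketch of a side-by-side summary; it is an added step that assumes trump_sentiment and obama_sentiment are both still in the workspace.

# Combine the two sentiment tables and compare each account's share of
# positive vs. negative word occurrences
bind_rows(trump_sentiment %>% mutate(account = "Trump"),
          obama_sentiment %>% mutate(account = "Obama")) %>%
  group_by(account, sentiment) %>%
  summarise(total = sum(freq)) %>%
  mutate(share = total / sum(total)) %>%
  ungroup()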