In this project, we will compare bigrams, word frequencies, and sentiment in Twitter messages from Donald Trump and Barack Obama. First we will load the required libraries.
library(dplyr)
library(knitr)
library(purrr)
library(twitteR)
library(tidytext)
library(stringr)
library(wordcloud2)
library(tidyr)
library(ggplot2)
library(igraph)
library(ggraph)
To set up authentication for the twitteR package, please see R Bloggers.
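If you have not authenticated yet, twitteR provides setup_twitter_oauth(); the sketch below assumes you have already created a Twitter app, and the four credential values are placeholders to replace with your own keys.

# Placeholder credentials -- replace with the keys from your own Twitter app
consumer_key    <- "YOUR_CONSUMER_KEY"
consumer_secret <- "YOUR_CONSUMER_SECRET"
access_token    <- "YOUR_ACCESS_TOKEN"
access_secret   <- "YOUR_ACCESS_SECRET"
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)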
Now we’ll request the last 3,200 tweets from each of Trump and Obama. Note that fewer tweets will actually be returned, since retweets are excluded.
trump_tweets <- userTimeline("realDonaldTrump", n=3200)
trump_tweets_df <- tbl_df(map_df(trump_tweets, as.data.frame))
obama_tweets <- userTimeline("BarackObama", n=3200)
obama_tweets_df <- tbl_df(map_df(obama_tweets, as.data.frame))
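As a quick sanity check (an extra step, not required for the analysis), we can confirm how many tweets came back once retweets are dropped and that the text and created columns we rely on below are present.

# Number of tweets actually returned for each account
nrow(trump_tweets_df)
nrow(obama_tweets_df)
# The columns used in the analysis below
trump_tweets_df %>% select(text, created) %>% head()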
First we’ll look at bigrams contained in Trump’s Twitter messages.
count_bigrams <- function(dataset) {
  dataset %>%
    # drop manually quoted retweets (tweets starting with a quotation mark),
    # then strip t.co links and the HTML entity &amp;
    filter(!str_detect(text, '^"')) %>%
    mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>%
    count(word1, word2, sort = TRUE)
}
visualize_bigrams <- function(bigrams) {
  set.seed(5)
  a <- grid::arrow(type = "closed", length = unit(0.1, "inches"))

  bigrams %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    # constant aesthetics belong outside aes()
    geom_edge_link(edge_colour = "blue", show.legend = FALSE, arrow = a) +
    geom_node_point(color = "lightblue", size = 16) +
    geom_node_text(aes(label = name), size = 5, repel = TRUE) +
    theme_void() +
    theme(legend.position = "none")
}
trump_tweets_df %>% count_bigrams() %>% head(50) %>% visualize_bigrams()
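To see the raw counts behind the graph, we can also print the top bigrams as a table; this is just an optional inspection step.

# Top ten Trump bigrams, shown as a table rather than a graph
trump_tweets_df %>% count_bigrams() %>% head(10) %>% kable()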
Now we’ll count word frequencies and create a word cloud.
tweets <- trump_tweets_df
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"

tweet_words <- tweets %>%
  filter(!str_detect(text, '^"')) %>%
  mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  filter(!word %in% stop_words$word,
         str_detect(word, "[a-z]"))

# wordcloud2 expects a data frame with word and freq columns
tweet_words <- tweet_words %>%
  group_by(word) %>%
  tally() %>%
  mutate(freq = n) %>%
  select(word, freq) %>%
  arrange(desc(freq))
tweet_words %>% head(20) %>% kable(row.names=TRUE)
| | word | freq |
|---|---|---|
| 1 | fake | 47 |
| 2 | news | 47 |
| 3 | media | 44 |
| 4 | people | 42 |
| 5 | jobs | 39 |
| 6 | america | 37 |
| 7 | obamacare | 36 |
| 8 | time | 35 |
| 9 | election | 33 |
| 10 | country | 30 |
| 11 | democrats | 30 |
| 12 | bad | 28 |
| 13 | trump | 28 |
| 14 | american | 27 |
| 15 | healthcare | 27 |
| 16 | join | 26 |
| 17 | president | 26 |
| 18 | russia | 25 |
| 19 | honor | 23 |
| 20 | watch | 22 |
wordcloud2(tweet_words, size = 0.6)
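Since wordcloud2() returns an htmlwidget, a copy of the cloud can be saved to a standalone HTML file if you want to share it outside the notebook; the file name here is just an example.

# Optional: save the interactive cloud as a self-contained HTML file
htmlwidgets::saveWidget(wordcloud2(tweet_words, size = 0.6),
                        "trump_wordcloud.html", selfcontained = TRUE)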
Now we’ll analyze sentiment using the Bing lexicon.
trump_sentiment <- tweet_words %>%
  inner_join(get_sentiments("bing"))

trump_sentiment %>%
  group_by(sentiment) %>%
  top_n(10, freq) %>%
  arrange(freq) %>%
  ungroup() %>%
  mutate(word = reorder(word, freq)) %>%
  ggplot(aes(word, freq, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
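A quick way to summarize the plot is to total the word frequencies by sentiment; this small extra step uses only the trump_sentiment table created above.

# Total positive vs. negative word occurrences in Trump's tweets
trump_sentiment %>% count(sentiment, wt = freq)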
Now we’ll look at bigrams contained in Obama’s Twitter messages.
We reuse the count_bigrams() and visualize_bigrams() functions defined above.
obama_tweets_df %>% count_bigrams() %>% head(50) %>% visualize_bigrams()
Now we’ll count word frequencies and create a word cloud.
tweets <- obama_tweets_df
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"

tweet_words <- tweets %>%
  filter(!str_detect(text, '^"')) %>%
  mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  filter(!word %in% stop_words$word,
         str_detect(word, "[a-z]"))

# wordcloud2 expects a data frame with word and freq columns
tweet_words <- tweet_words %>%
  group_by(word) %>%
  tally() %>%
  mutate(freq = n) %>%
  select(word, freq) %>%
  arrange(desc(freq))
tweet_words %>% head(20) %>% kable(row.names=TRUE)
| | word | freq |
|---|---|---|
| 1 | senate | 68 |
| 2 | #doyourjob | 61 |
| 3 | leaders | 57 |
| 4 | president | 50 |
| 5 | obama | 48 |
| 6 | court | 43 |
| 7 | supreme | 42 |
| 8 | #actonclimate | 40 |
| 9 | climate | 39 |
| 10 | change | 35 |
| 11 | judge | 33 |
| 12 | garland | 27 |
| 13 | hearing | 25 |
| 14 | add | 21 |
| 15 | economy | 20 |
| 16 | progress | 20 |
| 17 | agree | 19 |
| 18 | join | 19 |
| 19 | fair | 18 |
| 20 | @ofa | 18 |
wordcloud2(tweet_words, size = 0.6)
Again we’ll analyze sentiment using the Bing lexicon.
obama_sentiment <- tweet_words %>%
  inner_join(get_sentiments("bing"))

obama_sentiment %>%
  group_by(sentiment) %>%
  top_n(10, freq) %>%
  arrange(freq) %>%
  ungroup() %>%
  mutate(word = reorder(word, freq)) %>%
  ggplot(aes(word, freq, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
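Finally, since the point of the project is to compare the two accounts, here is a sketch of a side-by-side summary; it is an added step that assumes trump_sentiment and obama_sentiment are both still in the workspace.

# Combine the two sentiment tables and compare each account's share of
# positive vs. negative word occurrences
bind_rows(trump_sentiment %>% mutate(account = "Trump"),
          obama_sentiment %>% mutate(account = "Obama")) %>%
  group_by(account, sentiment) %>%
  summarise(total = sum(freq)) %>%
  mutate(share = total / sum(total)) %>%
  ungroup()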