The purpose of this project is to study which words appear most frequently in street addresses. First we will load the required libraries.
library(dplyr)
library(knitr)
library(tm)
library(wordcloud2)
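If any of these packages are missing, they can first be installed from CRAN:
install.packages(c("dplyr", "knitr", "tm", "wordcloud2"))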
Now we will read in the street addresses contained in the Medicare database found at http://tiny.cc/dcf/CMS_ProvidersSimple.rds, keeping only the address column.
download.file("http://tiny.cc/dcf/CMS_ProvidersSimple.rds", "CMS_ProvidersSimple.rds", mode = "wb")
Addresses <- readRDS("CMS_ProvidersSimple.rds") %>% select(address)
# Treat each address as its own document
docs <- Corpus(VectorSource(Addresses$address))
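Before cleaning, it is worth glancing at a few raw addresses to see what we are working with:
head(Addresses$address, 3)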
Next we will clean the text.
# Helper that replaces every match of a regex pattern with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "-")
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
docs <- tm_map(docs, toSpace, "\\|")
# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
# Remove numbers
docs <- tm_map(docs, removeNumbers)
# Remove common English stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
# Remove punctuation
docs <- tm_map(docs, removePunctuation)
# Eliminate extra white spaces
docs <- tm_map(docs, stripWhitespace)
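To verify that the transformations behaved as expected, we can print the first cleaned address:
writeLines(as.character(docs[[1]]))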
Now we will extract the word frequency information into a data frame and then print the 20 most frequent words.
dtm <- TermDocumentMatrix(docs)
# Each row of the matrix is a term; sum across documents for overall frequencies
# (on very large corpora, slam::row_sums(dtm) avoids materializing the dense matrix)
freqs <- sort(rowSums(as.matrix(dtm)), decreasing = TRUE)
d <- data.frame(word = names(freqs), freq = freqs, row.names = NULL)
d %>% head(20) %>% kable(row.names=TRUE)
  | word | freq |
---|---|---|
1 | ave | 171640 |
2 | blvd | 68294 |
3 | ste | 29424 |
4 | street | 22748 |
5 | main | 21937 |
6 | medical | 17505 |
7 | pkwy | 16500 |
8 | park | 15458 |
9 | center | 14413 |
10 | road | 14291 |
11 | hwy | 14037 |
12 | avenue | 11216 |
13 | way | 11036 |
14 | highway | 10934 |
15 | drive | 10249 |
16 | hospital | 9083 |
17 | state | 8044 |
18 | washington | 7242 |
19 | university | 7102 |
20 | north | 7032 |
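As a quick cross-check, tm can also list every term whose total count exceeds a threshold directly from the matrix; the cutoff of 10000 below is arbitrary:
findFreqTerms(dtm, lowfreq = 10000)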
Finally we will generate a word cloud with the 200 most common words.
d <- d %>% head(200)
wordcloud2(d, size = 2)
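wordcloud2 produces an HTML widget rather than a static image; one way to save a standalone copy (a sketch assuming the htmlwidgets package is installed) is:
library(htmlwidgets)
saveWidget(wordcloud2(d, size = 2), "address_wordcloud.html", selfcontained = TRUE)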