The purpose of this project is to study which words appear most frequently in street addresses. First we will load the required libraries.

library(dplyr)
library(knitr)
library(tm)
library(wordcloud2)
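
If any of these packages are missing, they can be installed from CRAN first:

# One-time setup: all four packages are available on CRAN
install.packages(c("dplyr", "knitr", "tm", "wordcloud2"))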

Now we will read in the street addresses contained in the Medicare database found at http://tiny.cc/dcf/CMS_ProvidersSimple.rds, which we assume has been saved locally as Medicare.rds.

Addresses <- readRDS("Medicare.rds") %>% select(address)
# VectorSource() expects a character vector with one element per document,
# so pass the address column rather than the whole data frame
docs <- Corpus(VectorSource(Addresses$address))
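
If Medicare.rds is not already present, a minimal sketch of fetching it first, assuming the link serves the raw .rds file:

# Download the file once and cache it locally; mode = "wb" keeps the binary intact
if (!file.exists("Medicare.rds")) {
  download.file("http://tiny.cc/dcf/CMS_ProvidersSimple.rds",
                destfile = "Medicare.rds", mode = "wb")
}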

Next we will clean the text.

# Replace characters that act as separators with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "-")
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
docs <- tm_map(docs, toSpace, "\\|")

# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
# Remove numbers
docs <- tm_map(docs, removeNumbers)
# Remove common English stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
# Remove punctuation
docs <- tm_map(docs, removePunctuation)
# Strip extra whitespace
docs <- tm_map(docs, stripWhitespace)
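
Before moving on, it is worth spot-checking that the cleaning behaved as expected, for example by inspecting a few documents:

# Print the first few cleaned documents
inspect(docs[1:3])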

Now we will extract the word frequency information into a data frame and then print the 20 most frequent words.

dtm <- TermDocumentMatrix(docs)
# Sum each term's counts across all documents; slam::row_sums() works on the
# sparse matrix directly (slam is installed as a dependency of tm)
v <- sort(slam::row_sums(dtm), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v, row.names = NULL)
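
As a quick cross-check, tm can also list frequent terms straight from the sparse matrix:

# Terms that appear at least 10,000 times across all addresses
findFreqTerms(dtm, lowfreq = 10000)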

d %>% head(20) %>% kable(row.names = TRUE)
    word          freq
 1  ave         171640
 2  blvd         68294
 3  ste          29424
 4  street       22748
 5  main         21937
 6  medical      17505
 7  pkwy         16500
 8  park         15458
 9  center       14413
10  road         14291
11  hwy          14037
12  avenue       11216
13  way          11036
14  highway      10934
15  drive        10249
16  hospital      9083
17  state         8044
18  washington    7242
19  university    7102
20  north         7032

Finally we will generate a word cloud with the 200 most common words.

d <- d %>% head(200)

wordcloud2(d, size = 2)
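
wordcloud2() returns an interactive htmlwidget. To keep a standalone copy, one option is to save it with htmlwidgets (a dependency of wordcloud2; self-contained output also requires pandoc):

# Capture the widget and write it out as a single HTML file
wc <- wordcloud2(d, size = 2)
htmlwidgets::saveWidget(wc, "address_wordcloud.html", selfcontained = TRUE)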