6.10.2016

Letters in English dictionary

I found a rather big English dictionary to work with (here) and became curious about letter frequencies. One thing to note is that what I count here is not letter frequencies in the language, but letter frequencies in the dictionary, which are, generally speaking, different things.

library(ggplot2)
library(magrittr)
library(dplyr)
## Read the dictionary, one entry per line, from the working directory.
## warn = FALSE silences readLines() warnings such as a missing final newline;
## spelled out as FALSE because the shorthand F is an ordinary, reassignable
## variable.
words.eng <- readLines("words.txt", warn = FALSE)
## first letters
## For every lowercase letter a-z, count the dictionary entries that begin with
## it, then express each count as a percentage of all counted entries.
## Matching is case-sensitive: entries starting with an uppercase letter or a
## non-letter character are not counted and do not enter the percentage base.
firstL <- tibble(
    letter = letters,
    ## vapply() guarantees exactly one integer per letter (sapply() does not),
    ## and building the table directly avoids the deprecated add_rownames().
    count = vapply(
        letters,
        function(x) length(grep(paste0("^", x), words.eng)),
        integer(1),
        USE.NAMES = FALSE
    )
) %>%
    mutate(percent = 100*count/sum(count))

## last letter
## For every lowercase letter a-z, count the dictionary entries that end with
## it, then express each count as a percentage of all counted entries.
## Matching is case-sensitive: entries ending with an uppercase letter or a
## non-letter character are not counted and do not enter the percentage base.
lastL <- tibble(
    letter = letters,
    ## vapply() guarantees exactly one integer per letter (sapply() does not),
    ## and building the table directly avoids the deprecated add_rownames().
    count = vapply(
        letters,
        function(x) length(grep(paste0(x, "$"), words.eng)),
        integer(1),
        USE.NAMES = FALSE
    )
) %>%
    mutate(percent = 100*count/sum(count))

## overall letters count
## Split every dictionary entry into single characters, tabulate them, and keep
## only the lowercase letters a-z. percent is each letter's share of the kept
## characters (uppercase letters, digits and punctuation are filtered out
## before the share is computed).
anyL <- local({
    chars <- unlist(strsplit(words.eng, ""))
    tally <- as.data.frame(table(chars))
    names(tally) <- c('letter','count')
    tally %>%
        filter(letter %in% letters) %>%
        mutate(percent = 100*count/sum(count))
})

## plot
## Draw a bar chart of letter percentages.
##   data  - data frame with columns `letter` (x axis) and `percent` (bar height)
##   title - plot title
## Returns the ggplot object; each bar is labelled with its letter just above
## its top, and the x axis itself carries no title or tick labels.
my_plot <- function(data, title){
    ## leave headroom above the tallest bar so its label stays inside the panel
    y_upper <- max(data$percent) + 0.5

    bars <- geom_bar(stat = "identity", fill = "#C19A6B")
    bar_labels <- geom_text(aes(label = letter), vjust = -0.5, size = 6)
    x_axis <- scale_x_discrete(name = NULL, breaks = NULL)
    y_axis <- scale_y_continuous(name = NULL, limits = c(0, y_upper))
    text_theme <- theme(
        plot.title = element_text(size = 20, face = "bold", vjust = 0.2),
        axis.text.y = element_text(size = 15)
    )

    ggplot(data, aes(x = letter, y = percent)) +
        bars +
        bar_labels +
        x_axis +
        y_axis +
        ggtitle(title) +
        text_theme
}
## One chart per table: first letters, last letters, and all letters.
my_plot(data = firstL, title = "First letters in English dictionary (%)")

my_plot(data = lastL, title = "Last letters in English dictionary (%)")

my_plot(data = anyL, title = "Letters in English dictionary (%)")