I found a rather large English dictionary to work with (here) and became curious about letter frequencies. One thing to note is that what I count here is not letter frequencies in the language, but letter frequencies in the dictionary, which are, generally speaking, different things.
library(ggplot2)
library(magrittr)
library(dplyr)
# Read the dictionary, one entry per line. `warn = FALSE` silences the
# readLines() warnings (e.g. about a missing final end-of-line); it is spelled
# out because the shorthand `F` is an ordinary variable that can be reassigned.
words.eng <- readLines("words.txt", warn = FALSE)
## first letters
# Tally how often each lowercase letter a-z is the first or the last character
# of a word and express every tally as a percentage of all tallied words.
#
# words: character vector of dictionary entries.
# edge:  "first" to inspect the leading character, "last" for the trailing one.
#
# Returns a data frame with one row per entry of `letters` (zero counts
# included) and the columns `letter`, `count` and `percent`.
# Matching is case-sensitive, so entries whose edge character is uppercase or
# not a letter are left out of both the counts and the percentage base.
edge_letter_freq <- function(words, edge = c("first", "last")) {
  edge <- match.arg(edge)
  has_edge <- if (edge == "first") startsWith else endsWith
  # vapply() pins the result to exactly one integer per letter, whereas
  # sapply() would silently change its return type on unexpected input.
  counts <- vapply(
    letters,
    function(x) sum(has_edge(words, x)),
    integer(1),
    USE.NAMES = FALSE
  )
  # Build the table directly instead of going through row names, which avoids
  # the deprecated dplyr::add_rownames().
  freq <- data.frame(
    letter = letters,
    count = counts,
    stringsAsFactors = FALSE
  )
  freq$percent <- 100 * freq$count / sum(freq$count)
  freq
}

firstL <- edge_letter_freq(words.eng, "first")
## last letter
lastL <- edge_letter_freq(words.eng, "last")
## overall letters count
# Split every word into single characters and tabulate them over the fixed
# level set `letters`. Characters outside a-z (uppercase, digits, punctuation)
# become NA in the factor and are dropped by table(), and a letter that never
# occurs still gets a row with count 0, so the result always has 26 rows in
# a-z order.
anyL <- words.eng %>%
  strsplit("") %>%
  unlist(use.names = FALSE) %>%
  factor(levels = letters) %>%
  table() %>%
  as.data.frame(stringsAsFactors = FALSE) %>%
  setNames(c("letter", "count")) %>%
  mutate(percent = 100 * count / sum(count))
## plot
# Draw a bar chart of letter percentages.
#
# data:  data frame with a `letter` column (x axis and bar captions) and a
#        `percent` column (bar heights).
# title: plot title.
#
# Returns the assembled ggplot object.
my_plot <- function(data, title) {
  # Upper y limit: half a percentage point above the tallest bar.
  y_top <- max(data$percent) + 0.5

  bars <- geom_bar(stat = "identity", fill = "#C19A6B")
  # Each bar is captioned with its own letter; the negative vjust lifts the
  # caption above the bar top.
  captions <- geom_text(aes(label = letter), vjust = -0.5, size = 6)

  # The x axis carries neither a title nor breaks.
  x_axis <- scale_x_discrete(name = NULL, breaks = NULL)
  y_axis <- scale_y_continuous(name = NULL, limits = c(0, y_top))

  look <- theme(
    plot.title = element_text(size = 20, face = "bold", vjust = 0.2),
    axis.text.y = element_text(size = 15)
  )

  canvas <- ggplot(data, aes(x = letter, y = percent))
  canvas + bars + captions + x_axis + y_axis + ggtitle(title) + look
}
# ggplot objects are only drawn when printed. Top-level auto-printing does not
# happen when the script is run through source(), so print explicitly.
print(my_plot(firstL, "First letters in English dictionary (%)"))
print(my_plot(lastL, "Last letters in English dictionary (%)"))
print(my_plot(anyL, "Letters in English dictionary (%)"))
No comments:
Post a Comment