CSB_2019

Lecture notes and exercises for Computing Skills for Biologists --- Winter 2019

View project on GitHub

Possible solution to warmup problem Week 8

library(tidyverse)
# read the data
dt <- read_csv("https://tinyurl.com/ycc4ndkd") %>% select(-state)

# 1. Find the number of distinct names for boys and girls.

ex1 <- dt %>% select(sex, name) %>% 
  distinct() %>% group_by(sex) %>% tally()
print(ex1)

# 2. Find the name/year combination with highest `count` for each `sex`.

ex2 <- dt %>% group_by(sex) %>% top_n(1, count)
print(ex2)

# 3. Plot the number of distinct names for `M` or `F` in time.

ex3 <- dt %>% ggplot() + 
  aes(x = year, fill = sex) + 
  geom_bar(position = "dodge")
show(ex3)

# 4. As you can see, the diversity of names has been growing. 
# Plot the number of "effective" names in time. 
# First, compute Shannon's entropy for each year/sex. 
# Then exponentiate to obtain the number of names that 
# would produce the same entropy while having equal frequency. 

ent <- dt %>% group_by(sex, year) %>% 
  mutate(p = count / sum(count)) %>% 
  summarise(entropy = -sum(p * log(p)))

ex4 <- ent %>% ggplot() + 
  aes(x = year, colour = sex, y = exp(entropy)) + 
  geom_point() + geom_line() + 
  ylab("Number of effective names")
show(ex4)

# 5. Plot the difference in number of distinct names between `F` and `M` in time.

diff_unique_names <- dt %>% 
  group_by(sex, year) %>% 
  tally() %>% 
  spread(sex, n) %>% 
  mutate(difference = `F` - `M`)

ex5 <- ggplot(diff_unique_names) + 
  aes(x = year, y = difference) + 
  geom_point() + geom_line()
show(ex5)

# 6. Find the frequency with wich each name ends with a certain letter. 
# Show a barplot for the probability that a name (for boys or girls) 
# ends with a given letter. Use years 1950, 1975, 2000 and 2015.

last_letter <- dt %>% mutate(last_letter = stringr::str_sub(name, -1)) %>% 
  group_by(year, sex, last_letter) %>% 
  summarise(tot = sum(count)) 
last_letter <- last_letter %>% 
  group_by(year, sex) %>% 
  mutate(prob = tot / sum(tot))

ex6 <- ggplot(data = last_letter %>% 
                filter(year %in% c(1950, 1975, 2000, 2015))) + 
  aes(x = last_letter, y = prob) + 
  geom_col() + facet_grid(sex ~ year)
show(ex6)

# 7. Plot the popularity of each letter for ending names in time, for boys and girls. 
# Which letters are growing, which declining?
  
ex7 <- ggplot(data = last_letter) + 
  aes(x = year, y = prob, label = last_letter, colour = last_letter) + 
  geom_text() + geom_line() + facet_wrap(~sex) + scale_y_log10()
show(ex7)

# 8. Find the most popular names today that were not present in the 1950 data, and viceversa.
dt2015 <- dt %>% filter(year == 2015) %>%
  group_by(sex, name) %>%
  summarise(tot = sum(count)) %>%
  arrange(desc(tot))
dt1950 <- dt %>% filter(year == 1950) %>%
  group_by(sex, name) %>%
  summarise(tot = sum(count)) %>%
  arrange(desc(tot))

in_2015_not_1950 <- dt2015 %>% anti_join(dt1950 %>% select(name, sex))
print(in_2015_not_1950 %>% group_by(sex) %>% top_n(10, tot) %>% arrange(sex, desc(tot)))

in_1950_not_2015 <- dt1950 %>% anti_join(dt2015 %>% select(name, sex))
print(in_1950_not_2015 %>% group_by(sex) %>% top_n(10, tot) %>% arrange(sex, desc(tot)))

# 9. Track particular names through time
plot_frequency_name <- function(dt, myname = "Elvis", highlight = c(1956, 1977)){
  tmp <- dt %>% filter(name == myname) 
  pl <- tmp %>% ggplot() + aes(x = year, y = per_100k_within_sex, colour = sex, group = sex) + geom_line()
  pl <- pl + geom_point(data = tmp %>% filter(year %in% highlight), colour = "black") + ggtitle(myname)
  show(pl)
}

plot_frequency_name(dt, "Elvis", c(1956, 1977)) # debut, death
plot_frequency_name(dt, "Neo", c(1999)) # The Matrix
plot_frequency_name(dt, "Madonna", c(1983)) # debut
plot_frequency_name(dt, "Hermione", c(2001)) # first HP movie
plot_frequency_name(dt, "Diana", c(1981, 1997)) # wedding, death
plot_frequency_name(dt, "Rocky", c(1976)) # first movie of the series
plot_frequency_name(dt, "Amelie", c(2001)) # movie
plot_frequency_name(dt, "Osama", c(2001)) # 9/11
plot_frequency_name(dt, "Daenerys", c(2012)) # Game of thrones
plot_frequency_name(dt, "Tiger", c(1997)) # first masters tournament
plot_frequency_name(dt, "Scarlett", c(2003)) # lost in translation
plot_frequency_name(dt, "Gwyneth", c(1996)) # Emma
plot_frequency_name(dt, "Kanye", c(2004)) # College dropout
plot_frequency_name(dt, "Alanis", c(1995)) # Jagged little pill
plot_frequency_name(dt, "Macaulay", c(1990)) # Home alone
plot_frequency_name(dt, "Miley", c(2006)) # Hannah Montana