TidyTuesday Starbucks Data

2021-12-21 7 min read R

Data

The data I use are available here. Let’s go ✌

I had no initial idea of what I wanted to present, so I made several exploratory plots to see what I was dealing with.

library(tidyverse)
library(magrittr)
library(hermitage)

# Read the Starbucks drinks data for TidyTuesday 2021-12-21 directly from GitHub.
sb <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-12-21/starbucks.csv"
)

## Rows: 1147 Columns: 15

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): product_name, size, trans_fat_g, fiber_g
## dbl (11): milk, whip, serv_size_m_l, calories, total_fat_g, saturated_fat_g,...

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Show the column names of the data set.
names(sb)

##  [1] "product_name"    "size"            "milk"            "whip"           
##  [5] "serv_size_m_l"   "calories"        "total_fat_g"     "saturated_fat_g"
##  [9] "trans_fat_g"     "cholesterol_mg"  "sodium_mg"       "total_carbs_g"  
## [13] "fiber_g"         "sugar_g"         "caffeine_mg"

# Summarise every column with skimr (piped, so the data is reported as piped input).
sb %>%
  skimr::skim()

Table: Table 1: Data summary


Name	Piped data
Number of rows	1147
Number of columns	15
_______________________
Column type frequency:
character	4
numeric	11
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
product_name	1	8	47	93
size	1	4	7	11
trans_fat_g	1	1	3	7
fiber_g	1	1	2	12

Variable type: numeric

skim_variable	complete_rate	mean	sd	p25	p50	p75	p100	hist
milk	1	2.51	1.68	1.0	2.0	4	5	▇▃▃▃▃
whip	1	0.25	0.43	0.0	0.0	0	1	▇▁▁▁▂
serv_size_m_l	1	461.34	172.18	354.0	473.0	591	887	▁▇▆▆▁
calories	1	228.39	137.67	130.0	220.0	320	640	▆▇▆▃▁
total_fat_g	1	6.19	5.97	1.0	4.5	10	28	▇▃▂▁▁
saturated_fat_g	1	3.88	4.01	0.2	2.5	7	20	▇▃▂▁▁
cholesterol_mg	1	15.24	17.97	0.0	5.0	30	75	▇▂▂▁▁
sodium_mg	1	139.65	93.09	70.0	135.0	200	370	▇▇▇▃▂
total_carbs_g	1	37.72	23.26	20.0	37.0	53	96	▇▇▇▅▂
sugar_g	1	34.99	22.46	18.0	34.0	49	89	▇▇▇▅▂
caffeine_mg	1	91.86	78.11	30.0	75.0	150	475	▇▃▁▁▁

Exploratory plots

# Calories by drink size: one point per drink, with the sizes ordered along
# the x axis by calories (fct_reorder()'s default summary, the median).
sb %>%
  mutate(
    # The former `alpha = scales::rescale(calories, to = 0, 1)` column was
    # removed: the call passed `1` as `from` and a scalar `to`, so it did not
    # rescale to [0, 1], and the column was never mapped in the plot anyway.
    size = factor(size),
    size = fct_reorder(size, calories)
  ) %>%
  ggplot(aes(x = size, y = calories, color = size)) +
  geom_point(size = 2, alpha = 0.7, shape = 5) +
  # theme_void() strips axes and grid; the theme() call below adds back only
  # the axis text/title needed to read the plot.
  theme_void(base_size = 15, base_family = "Varela Round") +
  theme(
    plot.background = element_rect(fill = "#FCD3B7"),
    text = element_text(color = "#50270A"),
    axis.title.y = element_text(color = "#50270A", angle = 90),
    axis.text.y = element_text(color = "#50270A"),
    # Colour duplicates the x axis, so the legend is hidden.
    legend.position = "none",
    legend.box.margin = margin(2, 2, 2, 2),
    axis.text.x = element_text(color = "#50270A"),
    panel.spacing = unit(5, "lines"),
    plot.margin = margin(15, 15, 15, 15),
    plot.caption = element_text(hjust = 0)
  ) +
  scale_color_manual(values = hermitage_palette("cottages_vincent")) +
  labs(
    title = "Energy (kcal) by drink size",
    y = "energy, kcal",
    caption = "Source | Starbucks data | for TidyTuesday\nplot | Elena Dudukina | @evpatora\ncolors | package{hermitage}"
  )

# Total fat against sugar for drinks above 200 kcal, one facet per drink size.
# The size levels are ordered by calories on the full data, i.e. before the
# > 200 kcal filter is applied.
ggplot(
  data = sb %>%
    mutate(size = fct_reorder(factor(size), calories)) %>%
    filter(calories > 200),
  mapping = aes(x = sugar_g, y = total_fat_g, color = size)
) +
  geom_point(shape = 5, size = 2, alpha = 0.9) +
  facet_wrap(~size) +
  scale_color_manual(values = hermitage_palette("cottages_vincent")) +
  labs(
    title = "Energy (kcal) by drink total fat (g) and sugar (g) added",
    x = "sugar, g",
    y = "total fat, g",
    caption = "Source | Starbucks data | for TidyTuesday\nplot | Elena Dudukina | @evpatora\ncolors | package{hermitage}"
  ) +
  # Start from an empty theme, then switch the axis text and titles back on.
  theme_void(base_family = "Varela Round", base_size = 13) +
  theme(
    text = element_text(color = "#50270A"),
    axis.title.x = element_text(color = "#50270A"),
    axis.title.y = element_text(color = "#50270A", angle = 90),
    axis.text.x = element_text(color = "#50270A"),
    axis.text.y = element_text(color = "#50270A"),
    plot.background = element_rect(fill = "#FCD3B7"),
    plot.caption = element_text(hjust = 0),
    plot.margin = margin(15, 15, 15, 15),
    panel.spacing = unit(5, "lines"),
    # Facets already identify the size, so no legend is drawn.
    legend.position = "none",
    legend.box.margin = margin(2, 2, 2, 2)
  )

Another exploratory graph.

# Calories against sugar for beverages above 200 kcal, one facet per product.
# Product levels are ordered by calories on the full data, before filtering.
ggplot(
  data = sb %>%
    mutate(product_name = fct_reorder(factor(product_name), calories)) %>%
    filter(calories > 200),
  mapping = aes(x = sugar_g, y = calories, color = product_name)
) +
  geom_point(shape = 5, size = 1) +
  # Wrap long product names so the facet strips stay narrow.
  facet_wrap(~product_name, labeller = label_wrap_gen(width = 15)) +
  scale_color_manual(values = hermitage_palette("parsons_2")) +
  labs(
    title = "Energy (kcal) by product and sugar (g) added",
    x = "sugar, g",
    y = "energy, kcal",
    caption = "Source | Starbucks data | for TidyTuesday\nplot | Elena Dudukina | @evpatora\ncolors | package{hermitage}"
  ) +
  # Start from an empty theme, then switch the axis text and titles back on.
  theme_void(base_family = "Varela Round", base_size = 13) +
  theme(
    text = element_text(color = "#50270A"),
    axis.title.x = element_text(color = "#50270A"),
    axis.title.y = element_text(color = "#50270A", angle = 90),
    axis.text.x = element_text(color = "#50270A"),
    axis.text.y = element_text(color = "#50270A"),
    plot.background = element_rect(fill = "#FFEDE1"),
    plot.caption = element_text(hjust = 0),
    plot.margin = margin(15, 15, 15, 15),
    panel.spacing = unit(3, "lines"),
    legend.position = "none"
  )

It should come as no surprise that the amount and type of milk, the amount of sugar, and the beverage size determine the overall kcal content of the cup. Why not leverage the power of ggplot to make a graph that tells me which hot lattes I can get, given that I want the maximum amount of caffeine and the minimum amount of sugar? This is how I like my lattes. In the winter season, iced coffee is a deal-breaker, and > 15 g of sugar per beverage is also a deal-breaker. Don’t worry, I will compensate for that by having more cookies 🍪

Final graph

# Most caffeinated and not-so-sweet hot lattes.
#
# Builds the data behind the final plot from `sb`, adding two columns:
#   label - annotation text; NA for rows that stay unlabeled
#   point - "4" for rows labeled in the grouped step, "3" for all others
sb_caf_rank <- sb %>%
  mutate(
    product_name = fct_reorder(product_name, caffeine_mg)
  ) %>%
  group_by(product_name, milk, calories) %>%
  arrange(caffeine_mg) %>%
  mutate(
    # A row is labeled only when it is a "Latte" whose name does not contain
    # "Iced", has at most 15 g of sugar, and carries the highest caffeine
    # within its group; every other row gets NA. This single condition
    # replaces the earlier case_when(), whose extra branches all returned NA.
    # NOTE(review): `calories` is a grouping variable, so
    # `calories == min(calories)` is always TRUE within a group. Confirm
    # whether grouping by product_name and milk only was intended.
    label = if_else(
      caffeine_mg == max(caffeine_mg) &
        calories == min(calories) &
        sugar_g <= 15 &
        str_detect(product_name, "Iced", negate = TRUE) &
        str_detect(product_name, "Latte"),
      paste0(product_name, " | ", size, " | ", sugar_g, " g sugar | ", total_fat_g, " g fat"),
      NA_character_
    ),
    # `point` is derived before the overall-maximum label is added below, so
    # that row only gets "4" if it was already labeled here.
    point = if_else(!is.na(label), "4", "3")
  ) %>%
  ungroup() %>%
  # Across the whole data set, overwrite the label of the row(s) with the
  # highest caffeine content with a dedicated headline label.
  mutate(label = case_when(
    caffeine_mg == max(caffeine_mg) ~ paste0("MOST CAFFEINATED BEVERAGE IN STARBUCKS\n", product_name, " | ", size, " | ", sugar_g, " g sugar | ", total_fat_g, " g fat"),
    TRUE ~ label
  ))

# Final graph: caffeine against calories, one facet per `milk` value, with
# the rows selected in `sb_caf_rank` annotated by repelled text labels.
sb_plot <- sb_caf_rank %>%
  ggplot(aes(x = calories, y = caffeine_mg, color = calories, fill = calories)) +
  # `point` is a character column, so shape and size use discrete scales;
  # ggplot2 warns that size on a discrete variable is not advised.
  geom_point(aes(shape = point, size = point), alpha = 0.55) +
  facet_wrap(~milk) +
  theme_void(base_size = 14, base_family = "Varela Round") +
  theme(
    plot.background = element_rect(fill = "#FFEDE1"),
    text = element_text(color = "#50270A"),
    axis.title.y = element_text(color = "#50270A", angle = 90),
    axis.title.x = element_text(color = "#50270A"),
    axis.text.y = element_text(color = "#50270A"),
    axis.text.x = element_text(color = "#50270A"),
    legend.position = "bottom",
    legend.margin = margin(10, 10, 10, 10),
    panel.spacing = unit(3, "lines"),
    plot.margin = margin(15, 15, 15, 15),
    plot.caption = element_text(hjust = 0)
  ) +
  # Rows with an NA label are dropped by geom_text_repel (with a warning).
  ggrepel::geom_text_repel(
    mapping = aes(label = label), color = "#00704A", segment.curvature = -0.6,
    nudge_x = 500, nudge_y = 20, point.size = 10, fontface = "bold",
    segment.linetype = 6, direction = "y", hjust = "left", size = 2
  ) +
  scale_color_gradientn(colours = hermitage_palette("faberge", "continuous", n = 71)) +
  scale_fill_gradientn(colours = hermitage_palette("faberge", "continuous", n = 71)) +
  # Only the fill colour bar is shown; the other legends are suppressed.
  guides(color = "none", shape = "none", size = "none", fill = guide_colorbar(title = "Energy, kcal")) +
  labs(
    title = "Caffeinated & Not So Sweet Hot Lattes You Can Get in Starbucks",
    subtitle = "by milk type",
    y = "caffeine, mg", x = "energy, kcal",
    # Caption typo fixed: "labeleld" -> "labeled".
    caption = "Source | Starbucks data | for TidyTuesday\nplot | Elena Dudukina | @evpatora\ncolors | package{hermitage}\nIced lattes and any lattes with > 15 g of sugar are not labeled"
  )

# Render the final plot.
print(sb_plot)

## Warning: Using size for a discrete variable is not advised.

## Warning: Removed 1102 rows containing missing values (geom_text_repel).

# Save the final plot as a 30 x 31 cm JPEG at 400 dpi.
# NOTE(review): `path` is not defined in the visible code; presumably it is
# set elsewhere (e.g. a hidden setup chunk) - confirm before running.
ggsave(
  filename = "starbucks.jpeg",
  plot = sb_plot,
  path = path,
  width = 30,
  height = 31,
  units = "cm",
  dpi = 400
)

## Warning: Using size for a discrete variable is not advised.

## Warning: Removed 1102 rows containing missing values (geom_text_repel).

Contents

dataviz Rstats tidyverse TidyTuesday ggplot2

TidyTuesday Starbucks Data

Data

Exploratory plots

Final graph

Elena Dudukina, MD, MSc, PhD

Senior Epidemiologist

Related