Explore the data through visualising the seasonality.
[R/project-au-births.git] / Australian-births.R
1 # Analyse births in Australia
2 library(tidyverse)
3 library(ggthemes)
4 library(forecast)
5 AU_births <- read_csv("data/BIRTHS_MONTH_OCCURRENCE_17102017230358770.csv")
6 # Exclude 2015 as data seems incomplete
7 AU_births <- filter(AU_births, !(TIME == 2015))
8 # Remove subtotals in rows
9 AU_births <- filter(AU_births, MONTH_OCCURENCE != "TOT")
10 # Make it a number as it was meant to be
11 AU_births$MONTH_OCCURENCE <- as.numeric(AU_births$MONTH_OCCURENCE)
12 # Add median by month to df
13 AU_births_medians <- AU_births %>%
14   group_by(Region, MONTH_OCCURENCE) %>%
15   summarise(median = median(Value))
16 AU_births <- merge(AU_births, AU_births_medians)
18 # Create a seasonal subseries plot
19 ggplot(filter(AU_births, Region == "Australia"), aes(x= Time, y = Value)) + 
20   geom_line() +
21   geom_line(aes(y = median), color = "red") +
22   facet_grid( ~ MONTH_OCCURENCE) +
23   labs(x = paste("\nFrom", min(AU_births$Time), "to",max(AU_births$Time)),
24        caption = "Data: Australian Bureau of Statistics",
25        y = "Births per month",
26        title = "Births in Australia",
27        subtitle = "Bottoms out in summer months; boomed from 2005/2007") +
28   theme_economist() +
29   theme(axis.text.x = element_blank()) 
31 # Let's check the seasonality
32 # Uses some base R graphics subtleties to right align the caption...
33 AU_births_ts <- as.ts(arrange(filter(AU_births, Region == "Australia"), Time, MONTH_OCCURENCE)$Value)
34 Acf(AU_births_ts, lag.max = 48, main = "")
35     title(main = "ACF plot of births in Australia by month")
36     title(sub = "Data: Australian Bureau of Statistics", adj = 1)
38 # The acf peaks at multiples of 12, showing a yearly seasonality
40 # Seasonality doesn't appear to increase over time
41 #plot(AU_births_ts)
42 #trend_AU_births <- ma(AU_births_ts, order = 12, centre = T) 
43 #plot(trend_AU_births)
44 #detrend_AU_births <- AU_births_ts - trend_AU_births
46 AU_births_decomposed <- decompose(ts(AU_births_ts, frequency = 12))
47 years <- rep(c(min(AU_births$TIME):max(AU_births$TIME)), each = 12)
49 # For ggplot, we prefer a long dataframe over a list of timeseries
50 AU_births_decomposed_df <- as.tibble(cbind(observation = c(1:length(years)),
51                                  years,
52                                  observed = as.vector(AU_births_ts), 
53                                  seasonal = as.vector(AU_births_decomposed$seasonal),
54                                  random = as.vector(AU_births_decomposed$random),
55                                  trend = as.vector(AU_births_decomposed_trend)))
56 # Add a character vector only after the above cbind, to avoid everything becoming a chr or a factor
57 AU_births_decomposed_df <- mutate(AU_births_decomposed_df,
58                                   seastrend = seasonal + trend,
59                                   month = rep(c("01","02","03","04","05","06","07","08","09","10","11","12"), times = 19))  
60 AU_births_decomposed_df <- gather(AU_births_decomposed_df, key = component, value = value, -observation, -years, - month)
62 # Plot the various components of the observed births underneath each other
63 ggplot(filter(AU_births_decomposed_df, component != "seastrend"), aes(x = as.factor(observation), y = as.numeric(value), group  = 1)) +
64   geom_line() +
65   facet_grid(component ~ ., scales = "free_y") +
66   # Hackish way of showing only the year
67   scale_x_discrete(breaks = filter(AU_births_decomposed_df, month == "01")$observation,
68                    labels = filter(AU_births_decomposed_df, month == "01")$years) +
69   theme_fivethirtyeight() +
70   theme(axis.text.x = element_text(angle = 45)) +
71   labs(title = "Births in Australia",
72        subtitle = "Seasonally decomposed",
73        caption = "Data: Australian Bureau of Statistics",
74        x = "",
75        y = "")
77 # Let's look at everything in one plot instead
78 ggplot(AU_births_decomposed_df, aes(x = yearmonth, y = as.numeric(value), group  = 1)) +
79   geom_line(data = dplyr::filter(AU_births_decomposed_df, component =="trend"), color = "orange", size = 2) +
80   geom_line(data = dplyr::filter(AU_births_decomposed_df, component =="observed"), color = "grey50") +
81   geom_line(data = dplyr::filter(AU_births_decomposed_df, component =="seastrend"), color = "black") +
82   scale_x_discrete(breaks = filter(AU_births_decomposed_df, month == "01")$yearmonth,
83                    labels = filter(AU_births_decomposed_df, month == "01")$years) +
84   theme_economist_white() +
85   theme(axis.text.x = element_text(angle = 45)) +
86   labs(title = "Births in Australia",
87        subtitle = "Seasonally decomposed",
88        caption = "Data: Australian Bureau of Statistics",
89        x = "",
90        y = "")
92 # TODO Horizon plot