Explore the data through visualising the seasonality.
[R/project-au-births.git] / Australian-births.R
diff --git a/Australian-births.R b/Australian-births.R
new file mode 100644 (file)
index 0000000..c810586
--- /dev/null
@@ -0,0 +1,92 @@
+# Analyse births in Australia
+library(tidyverse)
+library(ggthemes)
+library(forecast)
+AU_births <- read_csv("data/BIRTHS_MONTH_OCCURRENCE_17102017230358770.csv")
+# Exclude 2015 as data seems incomplete
+AU_births <- filter(AU_births, !(TIME == 2015))
+# Remove subtotals in rows
+AU_births <- filter(AU_births, MONTH_OCCURENCE != "TOT")
+# Make it a number as it was meant to be
+AU_births$MONTH_OCCURENCE <- as.numeric(AU_births$MONTH_OCCURENCE)
+# Add median by month to df
+AU_births_medians <- AU_births %>%
+  group_by(Region, MONTH_OCCURENCE) %>%
+  summarise(median = median(Value))
+AU_births <- merge(AU_births, AU_births_medians)
+
+# Create a seasonal subseries plot
+ggplot(filter(AU_births, Region == "Australia"), aes(x= Time, y = Value)) + 
+  geom_line() +
+  geom_line(aes(y = median), color = "red") +
+  facet_grid( ~ MONTH_OCCURENCE) +
+  labs(x = paste("\nFrom", min(AU_births$Time), "to",max(AU_births$Time)),
+       caption = "Data: Australian Bureau of Statistics",
+       y = "Births per month",
+       title = "Births in Australia",
+       subtitle = "Bottoms out in summer months; boomed from 2005/2007") +
+  theme_economist() +
+  theme(axis.text.x = element_blank()) 
+
+# Let's check the seasonality
+# Uses some base R graphics subtleties to right align the caption...
+AU_births_ts <- as.ts(arrange(filter(AU_births, Region == "Australia"), Time, MONTH_OCCURENCE)$Value)
+Acf(AU_births_ts, lag.max = 48, main = "")
+    title(main = "ACF plot of births in Australia by month")
+    title(sub = "Data: Australian Bureau of Statistics", adj = 1)
+
+# The acf peaks at multiples of 12, showing a yearly seasonality
+
+# Seasonality doesn't appear to increase over time
+#plot(AU_births_ts)
+#trend_AU_births <- ma(AU_births_ts, order = 12, centre = T) 
+#plot(trend_AU_births)
+#detrend_AU_births <- AU_births_ts - trend_AU_births
+
+AU_births_decomposed <- decompose(ts(AU_births_ts, frequency = 12))
+years <- rep(c(min(AU_births$TIME):max(AU_births$TIME)), each = 12)
+
+# For ggplot, we prefer a long dataframe over a list of timeseries
+AU_births_decomposed_df <- as.tibble(cbind(observation = c(1:length(years)),
+                                 years,
+                                 observed = as.vector(AU_births_ts), 
+                                 seasonal = as.vector(AU_births_decomposed$seasonal),
+                                 random = as.vector(AU_births_decomposed$random),
+                                 trend = as.vector(AU_births_decomposed_trend)))
+# Add a character vector only after the above cbind, to avoid everything becoming a chr or a factor
+AU_births_decomposed_df <- mutate(AU_births_decomposed_df,
+                                  seastrend = seasonal + trend,
+                                  month = rep(c("01","02","03","04","05","06","07","08","09","10","11","12"), times = 19))  
+AU_births_decomposed_df <- gather(AU_births_decomposed_df, key = component, value = value, -observation, -years, - month)
+
+# Plot the various components of the observed births underneath each other
+ggplot(filter(AU_births_decomposed_df, component != "seastrend"), aes(x = as.factor(observation), y = as.numeric(value), group  = 1)) +
+  geom_line() +
+  facet_grid(component ~ ., scales = "free_y") +
+  # Hackish way of showing only the year
+  scale_x_discrete(breaks = filter(AU_births_decomposed_df, month == "01")$observation,
+                   labels = filter(AU_births_decomposed_df, month == "01")$years) +
+  theme_fivethirtyeight() +
+  theme(axis.text.x = element_text(angle = 45)) +
+  labs(title = "Births in Australia",
+       subtitle = "Seasonally decomposed",
+       caption = "Data: Australian Bureau of Statistics",
+       x = "",
+       y = "")
+
+# Let's look at everything in one plot instead
+ggplot(AU_births_decomposed_df, aes(x = yearmonth, y = as.numeric(value), group  = 1)) +
+  geom_line(data = dplyr::filter(AU_births_decomposed_df, component =="trend"), color = "orange", size = 2) +
+  geom_line(data = dplyr::filter(AU_births_decomposed_df, component =="observed"), color = "grey50") +
+  geom_line(data = dplyr::filter(AU_births_decomposed_df, component =="seastrend"), color = "black") +
+  scale_x_discrete(breaks = filter(AU_births_decomposed_df, month == "01")$yearmonth,
+                   labels = filter(AU_births_decomposed_df, month == "01")$years) +
+  theme_economist_white() +
+  theme(axis.text.x = element_text(angle = 45)) +
+  labs(title = "Births in Australia",
+       subtitle = "Seasonally decomposed",
+       caption = "Data: Australian Bureau of Statistics",
+       x = "",
+       y = "")
+
+# TODO Horizon plot