# Analyse births in Australia library(tidyverse) library(ggthemes) library(forecast) AU_births <- read_csv("data/BIRTHS_MONTH_OCCURRENCE_17102017230358770.csv") # Exclude 2015 as data seems incomplete AU_births <- filter(AU_births, !(TIME == 2015)) # Remove subtotals in rows AU_births <- filter(AU_births, MONTH_OCCURENCE != "TOT") # Make it a number as it was meant to be AU_births$MONTH_OCCURENCE <- as.numeric(AU_births$MONTH_OCCURENCE) # Add median by month to df AU_births_medians <- AU_births %>% group_by(Region, MONTH_OCCURENCE) %>% summarise(median = median(Value)) AU_births <- merge(AU_births, AU_births_medians) # Create a seasonal subseries plot ggplot(filter(AU_births, Region == "Australia"), aes(x= Time, y = Value)) + geom_line() + geom_line(aes(y = median), color = "red") + facet_grid( ~ MONTH_OCCURENCE) + labs(x = paste("\nFrom", min(AU_births$Time), "to",max(AU_births$Time)), caption = "Data: Australian Bureau of Statistics", y = "Births per month", title = "Births in Australia", subtitle = "Bottoms out in summer months; boomed from 2005/2007") + theme_economist() + theme(axis.text.x = element_blank()) # Let's check the seasonality # Uses some base R graphics subtleties to right align the caption... AU_births_ts <- as.ts(arrange(filter(AU_births, Region == "Australia"), Time, MONTH_OCCURENCE)$Value) Acf(AU_births_ts, lag.max = 48, main = "") title(main = "ACF plot of births in Australia by month") title(sub = "Data: Australian Bureau of Statistics", adj = 1) # The acf peaks at multiples of 12, showing a yearly seasonality # Seasonality doesn't appear to increase over time #plot(AU_births_ts) #trend_AU_births <- ma(AU_births_ts, order = 12, centre = T) #plot(trend_AU_births) #detrend_AU_births <- AU_births_ts - trend_AU_births AU_births_decomposed <- decompose(ts(AU_births_ts, frequency = 12)) years <- rep(c(min(AU_births$TIME):max(AU_births$TIME)), each = 12) # For ggplot, we prefer a long dataframe over a list of timeseries AU_births_decomposed_df <- as.tibble(cbind(observation = c(1:length(years)), years, observed = as.vector(AU_births_ts), seasonal = as.vector(AU_births_decomposed$seasonal), random = as.vector(AU_births_decomposed$random), trend = as.vector(AU_births_decomposed_trend))) # Add a character vector only after the above cbind, to avoid everything becoming a chr or a factor AU_births_decomposed_df <- mutate(AU_births_decomposed_df, seastrend = seasonal + trend, month = rep(c("01","02","03","04","05","06","07","08","09","10","11","12"), times = 19)) AU_births_decomposed_df <- gather(AU_births_decomposed_df, key = component, value = value, -observation, -years, - month) # Plot the various components of the observed births underneath each other ggplot(filter(AU_births_decomposed_df, component != "seastrend"), aes(x = as.factor(observation), y = as.numeric(value), group = 1)) + geom_line() + facet_grid(component ~ ., scales = "free_y") + # Hackish way of showing only the year scale_x_discrete(breaks = filter(AU_births_decomposed_df, month == "01")$observation, labels = filter(AU_births_decomposed_df, month == "01")$years) + theme_fivethirtyeight() + theme(axis.text.x = element_text(angle = 45)) + labs(title = "Births in Australia", subtitle = "Seasonally decomposed", caption = "Data: Australian Bureau of Statistics", x = "", y = "") # Let's look at everything in one plot instead ggplot(AU_births_decomposed_df, aes(x = yearmonth, y = as.numeric(value), group = 1)) + geom_line(data = dplyr::filter(AU_births_decomposed_df, component =="trend"), color = "orange", size = 2) + geom_line(data = dplyr::filter(AU_births_decomposed_df, component =="observed"), color = "grey50") + geom_line(data = dplyr::filter(AU_births_decomposed_df, component =="seastrend"), color = "black") + scale_x_discrete(breaks = filter(AU_births_decomposed_df, month == "01")$yearmonth, labels = filter(AU_births_decomposed_df, month == "01")$years) + theme_economist_white() + theme(axis.text.x = element_text(angle = 45)) + labs(title = "Births in Australia", subtitle = "Seasonally decomposed", caption = "Data: Australian Bureau of Statistics", x = "", y = "") # TODO Horizon plot