From f8ac8fb21d5bf5f1f8185909885fb907f60d3bac Mon Sep 17 00:00:00 2001 From: Frederik Vanrenterghem Date: Thu, 26 Oct 2017 10:54:16 +0800 Subject: [PATCH] Explore the data through visualising the seasonality. --- Australian-births.R | 92 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 Australian-births.R diff --git a/Australian-births.R b/Australian-births.R new file mode 100644 index 0000000..c810586 --- /dev/null +++ b/Australian-births.R @@ -0,0 +1,92 @@ +# Analyse births in Australia +library(tidyverse) +library(ggthemes) +library(forecast) +AU_births <- read_csv("data/BIRTHS_MONTH_OCCURRENCE_17102017230358770.csv") +# Exclude 2015 as data seems incomplete +AU_births <- filter(AU_births, !(TIME == 2015)) +# Remove subtotals in rows +AU_births <- filter(AU_births, MONTH_OCCURENCE != "TOT") +# Make it a number as it was meant to be +AU_births$MONTH_OCCURENCE <- as.numeric(AU_births$MONTH_OCCURENCE) +# Add median by month to df +AU_births_medians <- AU_births %>% + group_by(Region, MONTH_OCCURENCE) %>% + summarise(median = median(Value)) +AU_births <- merge(AU_births, AU_births_medians) + +# Create a seasonal subseries plot +ggplot(filter(AU_births, Region == "Australia"), aes(x= Time, y = Value)) + + geom_line() + + geom_line(aes(y = median), color = "red") + + facet_grid( ~ MONTH_OCCURENCE) + + labs(x = paste("\nFrom", min(AU_births$Time), "to",max(AU_births$Time)), + caption = "Data: Australian Bureau of Statistics", + y = "Births per month", + title = "Births in Australia", + subtitle = "Bottoms out in summer months; boomed from 2005/2007") + + theme_economist() + + theme(axis.text.x = element_blank()) + +# Let's check the seasonality +# Uses some base R graphics subtleties to right align the caption... +AU_births_ts <- as.ts(arrange(filter(AU_births, Region == "Australia"), Time, MONTH_OCCURENCE)$Value) +Acf(AU_births_ts, lag.max = 48, main = "") + title(main = "ACF plot of births in Australia by month") + title(sub = "Data: Australian Bureau of Statistics", adj = 1) + +# The acf peaks at multiples of 12, showing a yearly seasonality + +# Seasonality doesn't appear to increase over time +#plot(AU_births_ts) +#trend_AU_births <- ma(AU_births_ts, order = 12, centre = T) +#plot(trend_AU_births) +#detrend_AU_births <- AU_births_ts - trend_AU_births + +AU_births_decomposed <- decompose(ts(AU_births_ts, frequency = 12)) +years <- rep(c(min(AU_births$TIME):max(AU_births$TIME)), each = 12) + +# For ggplot, we prefer a long dataframe over a list of timeseries +AU_births_decomposed_df <- as.tibble(cbind(observation = c(1:length(years)), + years, + observed = as.vector(AU_births_ts), + seasonal = as.vector(AU_births_decomposed$seasonal), + random = as.vector(AU_births_decomposed$random), + trend = as.vector(AU_births_decomposed_trend))) +# Add a character vector only after the above cbind, to avoid everything becoming a chr or a factor +AU_births_decomposed_df <- mutate(AU_births_decomposed_df, + seastrend = seasonal + trend, + month = rep(c("01","02","03","04","05","06","07","08","09","10","11","12"), times = 19)) +AU_births_decomposed_df <- gather(AU_births_decomposed_df, key = component, value = value, -observation, -years, - month) + +# Plot the various components of the observed births underneath each other +ggplot(filter(AU_births_decomposed_df, component != "seastrend"), aes(x = as.factor(observation), y = as.numeric(value), group = 1)) + + geom_line() + + facet_grid(component ~ ., scales = "free_y") + + # Hackish way of showing only the year + scale_x_discrete(breaks = filter(AU_births_decomposed_df, month == "01")$observation, + labels = filter(AU_births_decomposed_df, month == "01")$years) + + theme_fivethirtyeight() + + theme(axis.text.x = element_text(angle = 45)) + + labs(title = "Births in Australia", + subtitle = "Seasonally decomposed", + caption = "Data: Australian Bureau of Statistics", + x = "", + y = "") + +# Let's look at everything in one plot instead +ggplot(AU_births_decomposed_df, aes(x = yearmonth, y = as.numeric(value), group = 1)) + + geom_line(data = dplyr::filter(AU_births_decomposed_df, component =="trend"), color = "orange", size = 2) + + geom_line(data = dplyr::filter(AU_births_decomposed_df, component =="observed"), color = "grey50") + + geom_line(data = dplyr::filter(AU_births_decomposed_df, component =="seastrend"), color = "black") + + scale_x_discrete(breaks = filter(AU_births_decomposed_df, month == "01")$yearmonth, + labels = filter(AU_births_decomposed_df, month == "01")$years) + + theme_economist_white() + + theme(axis.text.x = element_text(angle = 45)) + + labs(title = "Births in Australia", + subtitle = "Seasonally decomposed", + caption = "Data: Australian Bureau of Statistics", + x = "", + y = "") + +# TODO Horizon plot -- 2.30.2