library(tidyverse)
# sometimes stats::filter is used instead of dplyr::filter
# so we explictly define which `filter` here
filter <- dplyr::filter
A <- c("F_300_399_55_64_yrs",
"F_1_149_35_44_yrs",
"F_150_299_25_34_yrs",
"M_800_999_75_84_yrs",
"M_400_499_65_74_yrs")
B <- c(A,
"F_1_149_15_19_yrs",
"F_150_299_85ov",
"M_1500_1749_85ov",
"M_Neg_Nil_income_20_24_yrs",
"M_150_299_25_34_yrs")
str_split(A, pattern = "_")
## [[1]]
## [1] "F" "300" "399" "55" "64" "yrs"
##
## [[2]]
## [1] "F" "1" "149" "35" "44" "yrs"
##
## [[3]]
## [1] "F" "150" "299" "25" "34" "yrs"
##
## [[4]]
## [1] "M" "800" "999" "75" "84" "yrs"
##
## [[5]]
## [1] "M" "400" "499" "65" "74" "yrs"
# Not exactly what we want
str_split(B, pattern = "_")
## [[1]]
## [1] "F" "300" "399" "55" "64" "yrs"
##
## [[2]]
## [1] "F" "1" "149" "35" "44" "yrs"
##
## [[3]]
## [1] "F" "150" "299" "25" "34" "yrs"
##
## [[4]]
## [1] "M" "800" "999" "75" "84" "yrs"
##
## [[5]]
## [1] "M" "400" "499" "65" "74" "yrs"
##
## [[6]]
## [1] "F" "1" "149" "15" "19" "yrs"
##
## [[7]]
## [1] "F" "150" "299" "85ov"
##
## [[8]]
## [1] "M" "1500" "1749" "85ov"
##
## [[9]]
## [1] "M" "Neg" "Nil" "income" "20" "24" "yrs"
##
## [[10]]
## [1] "M" "150" "299" "25" "34" "yrs"
# Massage the string into the right format first
B <- str_replace(B, "Neg_Nil", "-Inf_0")
B <- str_replace(B, "ov", "_110_yrs")
B <- str_remove(B, "_income")
# Now it's ready to split
str_split(B, pattern = "_")
## [[1]]
## [1] "F" "300" "399" "55" "64" "yrs"
##
## [[2]]
## [1] "F" "1" "149" "35" "44" "yrs"
##
## [[3]]
## [1] "F" "150" "299" "25" "34" "yrs"
##
## [[4]]
## [1] "M" "800" "999" "75" "84" "yrs"
##
## [[5]]
## [1] "M" "400" "499" "65" "74" "yrs"
##
## [[6]]
## [1] "F" "1" "149" "15" "19" "yrs"
##
## [[7]]
## [1] "F" "150" "299" "85" "110" "yrs"
##
## [[8]]
## [1] "M" "1500" "1749" "85" "110" "yrs"
##
## [[9]]
## [1] "M" "-Inf" "0" "20" "24" "yrs"
##
## [[10]]
## [1] "M" "150" "299" "25" "34" "yrs"
# ADVANCED SOLUTION:
# There are other ways to do this with regular expressions
data.frame(A = A) %>%
extract(A, c("sex", "income_min", "income_max", "age_min", "age_max"), "([FM])_(\\d+)_(\\d+)_(\\d+)_(\\d+)_yrs")
## sex income_min income_max age_min age_max
## 1 F 300 399 55 64
## 2 F 1 149 35 44
## 3 F 150 299 25 34
## 4 M 800 999 75 84
## 5 M 400 499 65 74
data.frame(B = B) %>%
mutate(B = str_replace(B, "Neg_Nil_income", "0_0"),
B = str_replace(B, "85ov", "85_110_yrs")) %>%
extract(B, c("sex", "income_min", "income_max", "age_min", "age_max"), "([FM])_(\\d+)_(\\d+)_(\\d+)_(\\d+)_yrs")
## sex income_min income_max age_min age_max
## 1 F 300 399 55 64
## 2 F 1 149 35 44
## 3 F 150 299 25 34
## 4 M 800 999 75 84
## 5 M 400 499 65 74
## 6 F 1 149 15 19
## 7 F 150 299 85 110
## 8 M 1500 1749 85 110
## 9 <NA> <NA> <NA> <NA> <NA>
## 10 M 150 299 25 34
# Set the paths to the data
# Remeber to use here::here you must define a project first!
census_path <- here::here("data/2021_datapackage_VIC/")
SA1_paths <- glue::glue(census_path, "2021Census_G17{alpha}_VIC_SA1.csv",
geo = "SA1", alpha = c("A", "B", "C"))
STE_paths <- glue::glue(census_path, "2021Census_G17{alpha}_VIC_STE.csv",
geo = "SA1", alpha = c("A", "B", "C"))
data_paths = STE_paths
# Read in each of the three tables
tbl_G17A <- read_csv(data_paths[1])
tbl_G17B <- read_csv(data_paths[2])
tbl_G17C <- read_csv(data_paths[3])
# Have a look at what variables are stored in each table
names(tbl_G17A)
## [1] "STE_CODE_2021" "M_Neg_Nil_income_15_19_yrs"
## [3] "M_Neg_Nil_income_20_24_yrs" "M_Neg_Nil_income_25_34_yrs"
## [5] "M_Neg_Nil_income_35_44_yrs" "M_Neg_Nil_income_45_54_yrs"
## [7] "M_Neg_Nil_income_55_64_yrs" "M_Neg_Nil_income_65_74_yrs"
## [9] "M_Neg_Nil_income_75_84_yrs" "M_Negtve_Nil_incme_85_yrs_ovr"
## [11] "M_Neg_Nil_income_Tot" "M_1_149_15_19_yrs"
## [13] "M_1_149_20_24_yrs" "M_1_149_25_34_yrs"
## [15] "M_1_149_35_44_yrs" "M_1_149_45_54_yrs"
## [17] "M_1_149_55_64_yrs" "M_1_149_65_74_yrs"
## [19] "M_1_149_75_84_yrs" "M_1_149_85ov"
## [21] "M_1_149_Tot" "M_150_299_15_19_yrs"
## [23] "M_150_299_20_24_yrs" "M_150_299_25_34_yrs"
## [25] "M_150_299_35_44_yrs" "M_150_299_45_54_yrs"
## [27] "M_150_299_55_64_yrs" "M_150_299_65_74_yrs"
## [29] "M_150_299_75_84_yrs" "M_150_299_85ov"
## [31] "M_150_299_Tot" "M_300_399_15_19_yrs"
## [33] "M_300_399_20_24_yrs" "M_300_399_25_34_yrs"
## [35] "M_300_399_35_44_yrs" "M_300_399_45_54_yrs"
## [37] "M_300_399_55_64_yrs" "M_300_399_65_74_yrs"
## [39] "M_300_399_75_84_yrs" "M_300_399_85ov"
## [41] "M_300_399_Tot" "M_400_499_15_19_yrs"
## [43] "M_400_499_20_24_yrs" "M_400_499_25_34_yrs"
## [45] "M_400_499_35_44_yrs" "M_400_499_45_54_yrs"
## [47] "M_400_499_55_64_yrs" "M_400_499_65_74_yrs"
## [49] "M_400_499_75_84_yrs" "M_400_499_85ov"
## [51] "M_400_499_Tot" "M_500_649_15_19_yrs"
## [53] "M_500_649_20_24_yrs" "M_500_649_25_34_yrs"
## [55] "M_500_649_35_44_yrs" "M_500_649_45_54_yrs"
## [57] "M_500_649_55_64_yrs" "M_500_649_65_74_yrs"
## [59] "M_500_649_75_84_yrs" "M_500_649_85ov"
## [61] "M_500_649_Tot" "M_650_799_15_19_yrs"
## [63] "M_650_799_20_24_yrs" "M_650_799_25_34_yrs"
## [65] "M_650_799_35_44_yrs" "M_650_799_45_54_yrs"
## [67] "M_650_799_55_64_yrs" "M_650_799_65_74_yrs"
## [69] "M_650_799_75_84_yrs" "M_650_799_85ov"
## [71] "M_650_799_Tot" "M_800_999_15_19_yrs"
## [73] "M_800_999_20_24_yrs" "M_800_999_25_34_yrs"
## [75] "M_800_999_35_44_yrs" "M_800_999_45_54_yrs"
## [77] "M_800_999_55_64_yrs" "M_800_999_65_74_yrs"
## [79] "M_800_999_75_84_yrs" "M_800_999_85ov"
## [81] "M_800_999_Tot" "M_1000_1249_15_19_yrs"
## [83] "M_1000_1249_20_24_yrs" "M_1000_1249_25_34_yrs"
## [85] "M_1000_1249_35_44_yrs" "M_1000_1249_45_54_yrs"
## [87] "M_1000_1249_55_64_yrs" "M_1000_1249_65_74_yrs"
## [89] "M_1000_1249_75_84_yrs" "M_1000_1249_85ov"
## [91] "M_1000_1249_Tot" "M_1250_1499_15_19_yrs"
## [93] "M_1250_1499_20_24_yrs" "M_1250_1499_25_34_yrs"
## [95] "M_1250_1499_35_44_yrs" "M_1250_1499_45_54_yrs"
## [97] "M_1250_1499_55_64_yrs" "M_1250_1499_65_74_yrs"
## [99] "M_1250_1499_75_84_yrs" "M_1250_1499_85ov"
## [101] "M_1250_1499_Tot" "M_1500_1749_15_19_yrs"
## [103] "M_1500_1749_20_24_yrs" "M_1500_1749_25_34_yrs"
## [105] "M_1500_1749_35_44_yrs" "M_1500_1749_45_54_yrs"
## [107] "M_1500_1749_55_64_yrs" "M_1500_1749_65_74_yrs"
## [109] "M_1500_1749_75_84_yrs" "M_1500_1749_85ov"
## [111] "M_1500_1749_Tot" "M_1750_1999_15_19_yrs"
## [113] "M_1750_1999_20_24_yrs" "M_1750_1999_25_34_yrs"
## [115] "M_1750_1999_35_44_yrs" "M_1750_1999_45_54_yrs"
## [117] "M_1750_1999_55_64_yrs" "M_1750_1999_65_74_yrs"
## [119] "M_1750_1999_75_84_yrs" "M_1750_1999_85ov"
## [121] "M_1750_1999_Tot" "M_2000_2999_15_19_yrs"
## [123] "M_2000_2999_20_24_yrs" "M_2000_2999_25_34_yrs"
## [125] "M_2000_2999_35_44_yrs" "M_2000_2999_45_54_yrs"
## [127] "M_2000_2999_55_64_yrs" "M_2000_2999_65_74_yrs"
## [129] "M_2000_2999_75_84_yrs" "M_2000_2999_85ov"
## [131] "M_2000_2999_Tot" "M_3000_3499_15_19_yrs"
## [133] "M_3000_3499_20_24_yrs" "M_3000_3499_25_34_yrs"
## [135] "M_3000_3499_35_44_yrs" "M_3000_3499_45_54_yrs"
## [137] "M_3000_3499_55_64_yrs" "M_3000_3499_65_74_yrs"
## [139] "M_3000_3499_75_84_yrs" "M_3000_3499_85ov"
## [141] "M_3000_3499_Tot" "M_3500_more_15_19_yrs"
## [143] "M_3500_more_20_24_yrs" "M_3500_more_25_34_yrs"
## [145] "M_3500_more_35_44_yrs" "M_3500_more_45_54_yrs"
## [147] "M_3500_more_55_64_yrs" "M_3500_more_65_74_yrs"
## [149] "M_3500_more_75_84_yrs" "M_3500_more_85ov"
## [151] "M_3500_more_Tot" "M_PI_NS_15_19_yrs"
## [153] "M_PI_NS_ns_20_24_yrs" "M_PI_NS_ns_25_34_yrs"
## [155] "M_PI_NS_ns_35_44_yrs" "M_PI_NS_ns_45_54_yrs"
## [157] "M_PI_NS_ns_55_64_yrs" "M_PI_NS_ns_65_74_yrs"
## [159] "M_PI_NS_ns_75_84_yrs" "M_PI_NS_ns_85_yrs_ovr"
## [161] "M_PI_NS_ns_Tot" "M_Tot_15_19_yrs"
## [163] "M_Tot_20_24_yrs" "M_Tot_25_34_yrs"
## [165] "M_Tot_35_44_yrs" "M_Tot_45_54_yrs"
## [167] "M_Tot_55_64_yrs" "M_Tot_65_74_yrs"
## [169] "M_Tot_75_84_yrs" "M_Tot_85ov"
## [171] "M_Tot_Tot" "F_Neg_Nil_income_15_19_yrs"
## [173] "F_Neg_Nil_income_20_24_yrs" "F_Neg_Nil_income_25_34_yrs"
## [175] "F_Neg_Nil_income_35_44_yrs" "F_Neg_Nil_income_45_54_yrs"
## [177] "F_Neg_Nil_income_55_64_yrs" "F_Neg_Nil_income_65_74_yrs"
## [179] "F_Neg_Nil_income_75_84_yrs" "F_Neg_Nil_incme_85_yrs_ovr"
## [181] "F_Neg_Nil_income_Tot" "F_1_149_15_19_yrs"
## [183] "F_1_149_20_24_yrs" "F_1_149_25_34_yrs"
## [185] "F_1_149_35_44_yrs" "F_1_149_45_54_yrs"
## [187] "F_1_149_55_64_yrs" "F_1_149_65_74_yrs"
## [189] "F_1_149_75_84_yrs" "F_1_149_85ov"
## [191] "F_1_149_Tot" "F_150_299_15_19_yrs"
## [193] "F_150_299_20_24_yrs" "F_150_299_25_34_yrs"
## [195] "F_150_299_35_44_yrs" "F_150_299_45_54_yrs"
## [197] "F_150_299_55_64_yrs" "F_150_299_65_74_yrs"
## [199] "F_150_299_75_84_yrs" "F_150_299_85ov"
## [201] "F_150_299_Tot"
names(tbl_G17B)
## [1] "STE_CODE_2021" "F_300_399_15_19_yrs"
## [3] "F_300_399_20_24_yrs" "F_300_399_25_34_yrs"
## [5] "F_300_399_35_44_yrs" "F_300_399_45_54_yrs"
## [7] "F_300_399_55_64_yrs" "F_300_399_65_74_yrs"
## [9] "F_300_399_75_84_yrs" "F_300_399_85ov"
## [11] "F_300_399_Tot" "F_400_499_15_19_yrs"
## [13] "F_400_499_20_24_yrs" "F_400_499_25_34_yrs"
## [15] "F_400_499_35_44_yrs" "F_400_499_45_54_yrs"
## [17] "F_400_499_55_64_yrs" "F_400_499_65_74_yrs"
## [19] "F_400_499_75_84_yrs" "F_400_499_85ov"
## [21] "F_400_499_Tot" "F_500_649_15_19_yrs"
## [23] "F_500_649_20_24_yrs" "F_500_649_25_34_yrs"
## [25] "F_500_649_35_44_yrs" "F_500_649_45_54_yrs"
## [27] "F_500_649_55_64_yrs" "F_500_649_65_74_yrs"
## [29] "F_500_649_75_84_yrs" "F_500_649_85ov"
## [31] "F_500_649_Tot" "F_650_799_15_19_yrs"
## [33] "F_650_799_20_24_yrs" "F_650_799_25_34_yrs"
## [35] "F_650_799_35_44_yrs" "F_650_799_45_54_yrs"
## [37] "F_650_799_55_64_yrs" "F_650_799_65_74_yrs"
## [39] "F_650_799_75_84_yrs" "F_650_799_85ov"
## [41] "F_650_799_Tot" "F_800_999_15_19_yrs"
## [43] "F_800_999_20_24_yrs" "F_800_999_25_34_yrs"
## [45] "F_800_999_35_44_yrs" "F_800_999_45_54_yrs"
## [47] "F_800_999_55_64_yrs" "F_800_999_65_74_yrs"
## [49] "F_800_999_75_84_yrs" "F_800_999_85ov"
## [51] "F_800_999_Tot" "F_1000_1249_15_19_yrs"
## [53] "F_1000_1249_20_24_yrs" "F_1000_1249_25_34_yrs"
## [55] "F_1000_1249_35_44_yrs" "F_1000_1249_45_54_yrs"
## [57] "F_1000_1249_55_64_yrs" "F_1000_1249_65_74_yrs"
## [59] "F_1000_1249_75_84_yrs" "F_1000_1249_85ov"
## [61] "F_1000_1249_Tot" "F_1250_1499_15_19_yrs"
## [63] "F_1250_1499_20_24_yrs" "F_1250_1499_25_34_yrs"
## [65] "F_1250_1499_35_44_yrs" "F_1250_1499_45_54_yrs"
## [67] "F_1250_1499_55_64_yrs" "F_1250_1499_65_74_yrs"
## [69] "F_1250_1499_75_84_yrs" "F_1250_1499_85ov"
## [71] "F_1250_1499_Tot" "F_1500_1749_15_19_yrs"
## [73] "F_1500_1749_20_24_yrs" "F_1500_1749_25_34_yrs"
## [75] "F_1500_1749_35_44_yrs" "F_1500_1749_45_54_yrs"
## [77] "F_1500_1749_55_64_yrs" "F_1500_1749_65_74_yrs"
## [79] "F_1500_1749_75_84_yrs" "F_1500_1749_85ov"
## [81] "F_1500_1749_Tot" "F_1750_1999_15_19_yrs"
## [83] "F_1750_1999_20_24_yrs" "F_1750_1999_25_34_yrs"
## [85] "F_1750_1999_35_44_yrs" "F_1750_1999_45_54_yrs"
## [87] "F_1750_1999_55_64_yrs" "F_1750_1999_65_74_yrs"
## [89] "F_1750_1999_75_84_yrs" "F_1750_1999_85ov"
## [91] "F_1750_1999_Tot" "F_2000_2999_15_19_yrs"
## [93] "F_2000_2999_20_24_yrs" "F_2000_2999_25_34_yrs"
## [95] "F_2000_2999_35_44_yrs" "F_2000_2999_45_54_yrs"
## [97] "F_2000_2999_55_64_yrs" "F_2000_2999_65_74_yrs"
## [99] "F_2000_2999_75_84_yrs" "F_2000_2999_85ov"
## [101] "F_2000_2999_Tot" "F_3000_3499_15_19_yrs"
## [103] "F_3000_3499_20_24_yrs" "F_3000_3499_25_34_yrs"
## [105] "F_3000_3499_35_44_yrs" "F_3000_3499_45_54_yrs"
## [107] "F_3000_3499_55_64_yrs" "F_3000_3499_65_74_yrs"
## [109] "F_3000_3499_75_84_yrs" "F_3000_3499_85ov"
## [111] "F_3000_3499_Tot" "F_3500_more_15_19_yrs"
## [113] "F_3500_more_20_24_yrs" "F_3500_more_25_34_yrs"
## [115] "F_3500_more_35_44_yrs" "F_3500_more_45_54_yrs"
## [117] "F_3500_more_55_64_yrs" "F_3500_more_65_74_yrs"
## [119] "F_3500_more_75_84_yrs" "F_3500_more_85ov"
## [121] "F_3500_more_Tot" "F_PI_NS_15_19_yrs"
## [123] "F_PI_NS_ns_20_24_yrs" "F_PI_NS_ns_25_34_yrs"
## [125] "F_PI_NS_ns_35_44_yrs" "F_PI_NS_ns_45_54_yrs"
## [127] "F_PI_NS_ns_55_64_yrs" "F_PI_NS_ns_65_74_yrs"
## [129] "F_PI_NS_ns_75_84_yrs" "F_PI_NS_ns_85_yrs_ovr"
## [131] "F_PI_NS_ns_Tot" "F_Tot_15_19_yrs"
## [133] "F_Tot_20_24_yrs" "F_Tot_25_34_yrs"
## [135] "F_Tot_35_44_yrs" "F_Tot_45_54_yrs"
## [137] "F_Tot_55_64_yrs" "F_Tot_65_74_yrs"
## [139] "F_Tot_75_84_yrs" "F_Tot_85ov"
## [141] "F_Tot_Tot" "P_Neg_Nil_income_15_19_yrs"
## [143] "P_Neg_Nil_income_20_24_yrs" "P_Neg_Nil_income_25_34_yrs"
## [145] "P_Neg_Nil_income_35_44_yrs" "P_Neg_Nil_income_45_54_yrs"
## [147] "P_Neg_Nil_income_55_64_yrs" "P_Neg_Nil_income_65_74_yrs"
## [149] "P_Neg_Nil_income_75_84_yrs" "P_Negtve_Nil_incme_85_yrs_ovr"
## [151] "P_Neg_Nil_income_Tot" "P_1_149_15_19_yrs"
## [153] "P_1_149_20_24_yrs" "P_1_149_25_34_yrs"
## [155] "P_1_149_35_44_yrs" "P_1_149_45_54_yrs"
## [157] "P_1_149_55_64_yrs" "P_1_149_65_74_yrs"
## [159] "P_1_149_75_84_yrs" "P_1_149_85ov"
## [161] "P_1_149_Tot" "P_150_299_15_19_yrs"
## [163] "P_150_299_20_24_yrs" "P_150_299_25_34_yrs"
## [165] "P_150_299_35_44_yrs" "P_150_299_45_54_yrs"
## [167] "P_150_299_55_64_yrs" "P_150_299_65_74_yrs"
## [169] "P_150_299_75_84_yrs" "P_150_299_85ov"
## [171] "P_150_299_Tot" "P_300_399_15_19_yrs"
## [173] "P_300_399_20_24_yrs" "P_300_399_25_34_yrs"
## [175] "P_300_399_35_44_yrs" "P_300_399_45_54_yrs"
## [177] "P_300_399_55_64_yrs" "P_300_399_65_74_yrs"
## [179] "P_300_399_75_84_yrs" "P_300_399_85ov"
## [181] "P_300_399_Tot" "P_400_499_15_19_yrs"
## [183] "P_400_499_20_24_yrs" "P_400_499_25_34_yrs"
## [185] "P_400_499_35_44_yrs" "P_400_499_45_54_yrs"
## [187] "P_400_499_55_64_yrs" "P_400_499_65_74_yrs"
## [189] "P_400_499_75_84_yrs" "P_400_499_85ov"
## [191] "P_400_499_Tot" "P_500_649_15_19_yrs"
## [193] "P_500_649_20_24_yrs" "P_500_649_25_34_yrs"
## [195] "P_500_649_35_44_yrs" "P_500_649_45_54_yrs"
## [197] "P_500_649_55_64_yrs" "P_500_649_65_74_yrs"
## [199] "P_500_649_75_84_yrs" "P_500_649_85ov"
## [201] "P_500_649_Tot"
names(tbl_G17C)
## [1] "STE_CODE_2021" "P_650_799_15_19_yrs" "P_650_799_20_24_yrs"
## [4] "P_650_799_25_34_yrs" "P_650_799_35_44_yrs" "P_650_799_45_54_yrs"
## [7] "P_650_799_55_64_yrs" "P_650_799_65_74_yrs" "P_650_799_75_84_yrs"
## [10] "P_650_799_85ov" "P_650_799_Tot" "P_800_999_15_19_yrs"
## [13] "P_800_999_20_24_yrs" "P_800_999_25_34_yrs" "P_800_999_35_44_yrs"
## [16] "P_800_999_45_54_yrs" "P_800_999_55_64_yrs" "P_800_999_65_74_yrs"
## [19] "P_800_999_75_84_yrs" "P_800_999_85ov" "P_800_999_Tot"
## [22] "P_1000_1249_15_19_yrs" "P_1000_1249_20_24_yrs" "P_1000_1249_25_34_yrs"
## [25] "P_1000_1249_35_44_yrs" "P_1000_1249_45_54_yrs" "P_1000_1249_55_64_yrs"
## [28] "P_1000_1249_65_74_yrs" "P_1000_1249_75_84_yrs" "P_1000_1249_85ov"
## [31] "P_1000_1249_Tot" "P_1250_1499_15_19_yrs" "P_1250_1499_20_24_yrs"
## [34] "P_1250_1499_25_34_yrs" "P_1250_1499_35_44_yrs" "P_1250_1499_45_54_yrs"
## [37] "P_1250_1499_55_64_yrs" "P_1250_1499_65_74_yrs" "P_1250_1499_75_84_yrs"
## [40] "P_1250_1499_85ov" "P_1250_1499_Tot" "P_1500_1749_15_19_yrs"
## [43] "P_1500_1749_20_24_yrs" "P_1500_1749_25_34_yrs" "P_1500_1749_35_44_yrs"
## [46] "P_1500_1749_45_54_yrs" "P_1500_1749_55_64_yrs" "P_1500_1749_65_74_yrs"
## [49] "P_1500_1749_75_84_yrs" "P_1500_1749_85ov" "P_1500_1749_Tot"
## [52] "P_1750_1999_15_19_yrs" "P_1750_1999_20_24_yrs" "P_1750_1999_25_34_yrs"
## [55] "P_1750_1999_35_44_yrs" "P_1750_1999_45_54_yrs" "P_1750_1999_55_64_yrs"
## [58] "P_1750_1999_65_74_yrs" "P_1750_1999_75_84_yrs" "P_1750_1999_85ov"
## [61] "P_1750_1999_Tot" "P_2000_2999_15_19_yrs" "P_2000_2999_20_24_yrs"
## [64] "P_2000_2999_25_34_yrs" "P_2000_2999_35_44_yrs" "P_2000_2999_45_54_yrs"
## [67] "P_2000_2999_55_64_yrs" "P_2000_2999_65_74_yrs" "P_2000_2999_75_84_yrs"
## [70] "P_2000_2999_85ov" "P_2000_2999_Tot" "P_3000_3499_15_19_yrs"
## [73] "P_3000_3499_20_24_yrs" "P_3000_3499_25_34_yrs" "P_3000_3499_35_44_yrs"
## [76] "P_3000_3499_45_54_yrs" "P_3000_3499_55_64_yrs" "P_3000_3499_65_74_yrs"
## [79] "P_3000_3499_75_84_yrs" "P_3000_3499_85ov" "P_3000_3499_Tot"
## [82] "P_3500_more_15_19_yrs" "P_3500_more_20_24_yrs" "P_3500_more_25_34_yrs"
## [85] "P_3500_more_35_44_yrs" "P_3500_more_45_54_yrs" "P_3500_more_55_64_yrs"
## [88] "P_3500_more_65_74_yrs" "P_3500_more_75_84_yrs" "P_3500_more_85ov"
## [91] "P_3500_more_Tot" "P_PI_NS_15_19_yrs" "P_PI_NS_ns_20_24_yrs"
## [94] "P_PI_NS_ns_25_34_yrs" "P_PI_NS_ns_35_44_yrs" "P_PI_NS_ns_45_54_yrs"
## [97] "P_PI_NS_ns_55_64_yrs" "P_PI_NS_ns_65_74_yrs" "P_PI_NS_ns_75_84_yrs"
## [100] "P_PI_NS_ns_85_yrs_ovr" "P_PI_NS_ns_Tot" "P_Tot_15_19_yrs"
## [103] "P_Tot_20_24_yrs" "P_Tot_25_34_yrs" "P_Tot_35_44_yrs"
## [106] "P_Tot_45_54_yrs" "P_Tot_55_64_yrs" "P_Tot_65_74_yrs"
## [109] "P_Tot_75_84_yrs" "P_Tot_85ov" "P_Tot_Tot"
# Combine all the data together
tbl_G17 <- bind_rows(tbl_G17A, tbl_G17B, tbl_G17C)
dim(tbl_G17) # gives dimensions of the new dataset
## [1] 3 511
# Change the format of the table to make it longer instead of wider
# This is a step closer to a tidy format
tbl_G17_long <- tbl_G17 |>
pivot_longer(cols = -1, names_to = "category",
values_to = "count")
View(tbl_G17_long)
# We want to split the strings using the "_"
# But there are multiple different cases to consider
# There are at least 5 cases we'll need to code for
underscore_count_per_category = str_count(
string = tbl_G17_long$category, pattern = "_")
table(underscore_count_per_category)
## underscore_count_per_category
## 2 3 4 5 6
## 18 252 90 1017 153
# What are the weird cases?
# Can look in the meta data / names / to help identify
# 1) Neg_Nil_income --> change to -Inf_0
# 1*) Negtve_Nil_income --> change to -Inf_0
# 2) more --> Inf
# 3) PI_NS --> NA_NA
# 4) 85ov --> 85_110
# 4*) 85_yrs_ov --> 85_110
# Use this code to explore the different sub cases we are going to need to code for
pattern_val = "ov" #Neg, Negtve, more, PI, ov, 85ov, 85_yrs
View(tbl_G17_long |> filter(str_detect(category, pattern_val)))
# Lots can go wrong in string matching
# You need to be very very precise in what you ask for
tbl_G17_long_formatted <- tbl_G17_long |>
filter(!str_detect(string = category, pattern = "Tot"),
!str_detect(category, "PI_NS")) |>
mutate(
category = str_replace(category, "Neg_Nil_income", "-Inf_0"),
category = str_replace(category, "Neg_Nil_incme", "-Inf_0"),
category = str_replace(category, "Negtve_Nil_incme", "-Inf_0"),
category = str_replace(category, "more", "Inf"),
category = str_replace(category, "85ov", "85_110_yrs"),
category = str_replace(category, "85_yrs_ovr", "85_110_yrs"))
# seems like they are all have the same number of underscores now
underscore_count_per_category = str_count(tbl_G17_long_formatted$category,
pattern = "_")
table(underscore_count_per_category)
## underscore_count_per_category
## 5
## 1215
# The data can be converted to the tidy format
tbl_G17_tidy <- tbl_G17_long_formatted |>
mutate(category = str_remove(category, "_yrs")) |>
separate_wider_delim(cols = category, delim = "_",
names = c("sex", "income_min", "income_max", "age_min", "age_max")) |>
unite("income", c(income_min, income_max), remove = FALSE) |>
unite("age", c(age_min, age_max), remove = FALSE)
View(tbl_G17_tidy)
tbl_G17_tidy_STE = tbl_G17_tidy
str_remove
call to get rid of
_yrs
otherwise we would end up with an extra column we
don’t need.To repeat this for the SA1 regions, you just need to change the following line of code.
data_paths = SA1_paths ## This line here to set the right path
tbl_G17A <- read_csv(data_paths[1])
tbl_G17B <- read_csv(data_paths[2])
tbl_G17C <- read_csv(data_paths[3])
tbl_G17 <- bind_rows(tbl_G17A, tbl_G17B, tbl_G17C)
tbl_G17_long <- tbl_G17 |>
pivot_longer(cols = -1, names_to = "category",
values_to = "count")
### WARNING: This takes a long time - there is a lot of data!
tbl_G17_long_formatted <- tbl_G17_long |>
filter(!str_detect(string = category, pattern = "Tot"),
!str_detect(category, "PI_NS")) |>
mutate(
category = str_replace(category, "Neg_Nil_income", "-Inf_0"),
category = str_replace(category, "Neg_Nil_incme", "-Inf_0"),
category = str_replace(category, "Negtve_Nil_incme", "-Inf_0"),
category = str_replace(category, "more", "Inf"),
category = str_replace(category, "85ov", "85_110_yrs"),
category = str_replace(category, "85_yrs_ovr", "85_110_yrs"))
tbl_G17_tidy <- tbl_G17_long_formatted |>
mutate(category = str_remove(category, "_yrs")) |>
separate_wider_delim(cols = category, delim = "_",
names = c("sex", "income_min", "income_max", "age_min", "age_max")) |>
unite("income", c(income_min, income_max), remove = FALSE) |>
unite("age", c(age_min, age_max), remove = FALSE)
tbl_G17_tidy_SA1 = tbl_G17_tidy
As you get more advanced in coding, you can learn to wrap all this code in a function so you don’t need to copy and paste the same code from above.
vic_pop_sizes_STE <- tbl_G17_tidy_STE |>
filter(sex == "P") |>
pull(count)
vic_pop_sizes_SA1 <- tbl_G17_tidy_STE |>
filter(sex == "P") |>
pull(count)
total_vic_pop_sizes = data.frame(
STE = vic_pop_sizes_STE |> sum(na.rm = TRUE) ,
SA1 = vic_pop_sizes_SA1 |> sum(na.rm = TRUE))
If we use the STE
data, we have 4.973795^{6} people
over 15 years old but in SA1
data, we have
4.973795^{6}. The difference of 0 is 0, but you will find differences if
you repeat this analysis for 2016. It is actual quite common to find
small differences between totals for different regions. This can likely
attributed to the small random adjustments to the counts (for
confidentiality). In particular, SA1 represents a smaller region and
thus small numbers are likely to identify particular individuals. It is
not surprising then that there will be more adjustments made to SA1
data. The STE data is aggregated at state level so it would more
accurately reflect the true number of people over 15 years old. This
does not reflect the total population in Victoria, however, as it does
not account for those under 15 years old. The population size by age in
Victoria from 2021 census can be found here.
The minimum and maximum values of count
is 66 and
1.63348^{5} (for STE, or for SA1 66 and 1.63348^{5}).
Before drawing the boxplots, we’ll just wrangle the data to remove the redundant rows and make labels that are more pretty for the graph. You could also consider merging the 15-19 and 20-24 years old together so that the range is the same as other categories (except the one over 85 years old). The code and output are all shown below. There are a number of things you may notice from the graphs, such as, there are more females than males in almost all age groups in Victoria; higher income earners are still male dominant (even in younger age groups); females do appear to live longer.
tbl_G17_tidy = tbl_G17_tidy_STE
# For plotting (so the labels appear in the right order on the axis)
# Try with and without this line to spot the difference
tbl_G17_tidy$income <- fct_reorder(tbl_G17_tidy$income,
as.numeric(tbl_G17_tidy$income_min))
tbl_G17_tidy |>
filter(sex != "P") |>
group_by(sex) |>
summarise(count = sum(count, na.rm = TRUE)) |>
ggplot() +
geom_col(aes(x = sex, y = count)) +
ggtitle("1. Sex distribution in Victoria from 2021 Census") +
theme_bw(base_size = 12)
tbl_G17_tidy |>
group_by(age) |>
summarise(count = sum(count, na.rm = TRUE)) |>
ggplot() +
geom_col(aes(x = age, y = count)) +
ggtitle("1. Age distribution in Victoria from 2021 Census") +
theme_bw(base_size = 12)
tbl_G17_tidy %>%
group_by(income) %>%
summarise(count = sum(count, na.rm = TRUE)) %>%
ggplot() +
geom_col(aes(x = income, y = count)) +
ggtitle("3. Income distribution in Victoria from 2021 Census") +
theme_bw(base_size = 12) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.3))
tbl_G17_tidy %>%
filter(sex != "P") |>
group_by(sex, age) %>%
summarise(count = sum(count, na.rm = TRUE)) %>%
ggplot() +
geom_col(aes(x = age, y = count, fill = sex), position = "dodge") +
ggtitle("4. Sex & age distribution in Victoria from 2021 Census") +
theme_bw(base_size = 12)
tbl_G17_tidy %>%
filter(sex != "P") |>
group_by(sex, income) %>%
summarise(count = sum(count, na.rm = TRUE)) %>%
ggplot() +
geom_col(aes(x = income, y = count, fill = sex), position = "dodge") +
ggtitle("5. Sex & income distribution in Victoria from 2016 Census") +
theme_bw(base_size = 12) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.3))
tbl_G17_tidy %>%
group_by(age, income) %>%
summarise(count = sum(count, na.rm = TRUE)) %>%
ggplot() +
geom_col(aes(x = age, y = count), position = "dodge") +
facet_wrap(~income) +
ggtitle("6. Age & income distribution in Victoria from 2021 Census") +
theme_bw(base_size = 12) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.3))
tbl_G17_tidy %>%
filter(sex != "P") |>
ggplot(aes(x = age, y = count, fill = sex)) +
geom_col(position = "dodge") +
facet_wrap(~income) +
theme_bw(base_size = 12) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.3)) +
ggtitle("7. Income, sex & age distribution in Victoria from 2021 Census")
We will use the STE data to extract the relevant statistics.
n_women_15_54 <- tbl_G17_tidy_STE %>%
filter(age_min >=15 & age_max <= 54 & sex == "F") %>%
pull(count) %>%
sum(na.rm = TRUE)
n_people_25_34_earn_1750_or_more <- tbl_G17_tidy_STE %>%
filter(age_min >=25 & age_max <= 34 & sex == "P" & income_min >= 1750) %>%
pull(count) %>%
sum(na.rm = TRUE)
n_man_25_44 <- tbl_G17_tidy_STE %>%
filter(age_min >=25 & age_max <= 44 & sex == "M") %>%
pull(count) %>%
sum(na.rm = TRUE)
n_man_25_44_earn_1500_or_less <- tbl_G17_tidy_STE %>%
filter(age_min >=25 & age_max <= 44 & sex == "M" & income_max <= 1500) %>%
pull(count) %>%
sum(na.rm = TRUE)
n_vic = total_vic_pop_sizes$STE
According to the 2021 Census data: