library(tidyverse)
# sometimes stats::filter is used instead of dplyr::filter
# so we explictly define which `filter` here
filter <- dplyr::filter

🏋 Exercise 4A

A <- c("F_300_399_55_64_yrs", 
       "F_1_149_35_44_yrs", 
       "F_150_299_25_34_yrs", 
       "M_800_999_75_84_yrs", 
       "M_400_499_65_74_yrs")

B <- c(A, 
       "F_1_149_15_19_yrs", 
       "F_150_299_85ov", 
       "M_1500_1749_85ov", 
       "M_Neg_Nil_income_20_24_yrs", 
       "M_150_299_25_34_yrs")

str_split(A, pattern = "_")
## [[1]]
## [1] "F"   "300" "399" "55"  "64"  "yrs"
## 
## [[2]]
## [1] "F"   "1"   "149" "35"  "44"  "yrs"
## 
## [[3]]
## [1] "F"   "150" "299" "25"  "34"  "yrs"
## 
## [[4]]
## [1] "M"   "800" "999" "75"  "84"  "yrs"
## 
## [[5]]
## [1] "M"   "400" "499" "65"  "74"  "yrs"
# Not exactly what we want
str_split(B, pattern = "_")
## [[1]]
## [1] "F"   "300" "399" "55"  "64"  "yrs"
## 
## [[2]]
## [1] "F"   "1"   "149" "35"  "44"  "yrs"
## 
## [[3]]
## [1] "F"   "150" "299" "25"  "34"  "yrs"
## 
## [[4]]
## [1] "M"   "800" "999" "75"  "84"  "yrs"
## 
## [[5]]
## [1] "M"   "400" "499" "65"  "74"  "yrs"
## 
## [[6]]
## [1] "F"   "1"   "149" "15"  "19"  "yrs"
## 
## [[7]]
## [1] "F"    "150"  "299"  "85ov"
## 
## [[8]]
## [1] "M"    "1500" "1749" "85ov"
## 
## [[9]]
## [1] "M"      "Neg"    "Nil"    "income" "20"     "24"     "yrs"   
## 
## [[10]]
## [1] "M"   "150" "299" "25"  "34"  "yrs"
# Massage the string into the right format first
B <- str_replace(B, "Neg_Nil", "-Inf_0")
B <- str_replace(B, "ov", "_110_yrs")
B <- str_remove(B, "_income")

# Now it's ready to split
str_split(B, pattern = "_")
## [[1]]
## [1] "F"   "300" "399" "55"  "64"  "yrs"
## 
## [[2]]
## [1] "F"   "1"   "149" "35"  "44"  "yrs"
## 
## [[3]]
## [1] "F"   "150" "299" "25"  "34"  "yrs"
## 
## [[4]]
## [1] "M"   "800" "999" "75"  "84"  "yrs"
## 
## [[5]]
## [1] "M"   "400" "499" "65"  "74"  "yrs"
## 
## [[6]]
## [1] "F"   "1"   "149" "15"  "19"  "yrs"
## 
## [[7]]
## [1] "F"   "150" "299" "85"  "110" "yrs"
## 
## [[8]]
## [1] "M"    "1500" "1749" "85"   "110"  "yrs" 
## 
## [[9]]
## [1] "M"    "-Inf" "0"    "20"   "24"   "yrs" 
## 
## [[10]]
## [1] "M"   "150" "299" "25"  "34"  "yrs"
# ADVANCED SOLUTION: 
# There are other ways to do this with regular expressions

data.frame(A = A) %>% 
   extract(A, c("sex", "income_min", "income_max", "age_min", "age_max"), "([FM])_(\\d+)_(\\d+)_(\\d+)_(\\d+)_yrs")
##   sex income_min income_max age_min age_max
## 1   F        300        399      55      64
## 2   F          1        149      35      44
## 3   F        150        299      25      34
## 4   M        800        999      75      84
## 5   M        400        499      65      74
data.frame(B = B) %>% 
   mutate(B = str_replace(B, "Neg_Nil_income", "0_0"),
          B = str_replace(B, "85ov", "85_110_yrs")) %>% 
   extract(B, c("sex", "income_min", "income_max", "age_min", "age_max"), "([FM])_(\\d+)_(\\d+)_(\\d+)_(\\d+)_yrs")
##     sex income_min income_max age_min age_max
## 1     F        300        399      55      64
## 2     F          1        149      35      44
## 3     F        150        299      25      34
## 4     M        800        999      75      84
## 5     M        400        499      65      74
## 6     F          1        149      15      19
## 7     F        150        299      85     110
## 8     M       1500       1749      85     110
## 9  <NA>       <NA>       <NA>    <NA>    <NA>
## 10    M        150        299      25      34

🏋 Exercise 4B

# Set the paths to the data 
# Remeber to use here::here you must define a project first! 
census_path <- here::here("data/2021_datapackage_VIC/")

SA1_paths <- glue::glue(census_path, "2021Census_G17{alpha}_VIC_SA1.csv",
                        geo = "SA1", alpha = c("A", "B", "C"))

STE_paths <- glue::glue(census_path, "2021Census_G17{alpha}_VIC_STE.csv",
                        geo = "SA1", alpha = c("A", "B", "C"))
data_paths = STE_paths

# Read in each of the three tables
tbl_G17A <- read_csv(data_paths[1])
tbl_G17B <- read_csv(data_paths[2])
tbl_G17C <- read_csv(data_paths[3])

# Have a look at what variables are stored in each table
names(tbl_G17A)
##   [1] "STE_CODE_2021"                 "M_Neg_Nil_income_15_19_yrs"   
##   [3] "M_Neg_Nil_income_20_24_yrs"    "M_Neg_Nil_income_25_34_yrs"   
##   [5] "M_Neg_Nil_income_35_44_yrs"    "M_Neg_Nil_income_45_54_yrs"   
##   [7] "M_Neg_Nil_income_55_64_yrs"    "M_Neg_Nil_income_65_74_yrs"   
##   [9] "M_Neg_Nil_income_75_84_yrs"    "M_Negtve_Nil_incme_85_yrs_ovr"
##  [11] "M_Neg_Nil_income_Tot"          "M_1_149_15_19_yrs"            
##  [13] "M_1_149_20_24_yrs"             "M_1_149_25_34_yrs"            
##  [15] "M_1_149_35_44_yrs"             "M_1_149_45_54_yrs"            
##  [17] "M_1_149_55_64_yrs"             "M_1_149_65_74_yrs"            
##  [19] "M_1_149_75_84_yrs"             "M_1_149_85ov"                 
##  [21] "M_1_149_Tot"                   "M_150_299_15_19_yrs"          
##  [23] "M_150_299_20_24_yrs"           "M_150_299_25_34_yrs"          
##  [25] "M_150_299_35_44_yrs"           "M_150_299_45_54_yrs"          
##  [27] "M_150_299_55_64_yrs"           "M_150_299_65_74_yrs"          
##  [29] "M_150_299_75_84_yrs"           "M_150_299_85ov"               
##  [31] "M_150_299_Tot"                 "M_300_399_15_19_yrs"          
##  [33] "M_300_399_20_24_yrs"           "M_300_399_25_34_yrs"          
##  [35] "M_300_399_35_44_yrs"           "M_300_399_45_54_yrs"          
##  [37] "M_300_399_55_64_yrs"           "M_300_399_65_74_yrs"          
##  [39] "M_300_399_75_84_yrs"           "M_300_399_85ov"               
##  [41] "M_300_399_Tot"                 "M_400_499_15_19_yrs"          
##  [43] "M_400_499_20_24_yrs"           "M_400_499_25_34_yrs"          
##  [45] "M_400_499_35_44_yrs"           "M_400_499_45_54_yrs"          
##  [47] "M_400_499_55_64_yrs"           "M_400_499_65_74_yrs"          
##  [49] "M_400_499_75_84_yrs"           "M_400_499_85ov"               
##  [51] "M_400_499_Tot"                 "M_500_649_15_19_yrs"          
##  [53] "M_500_649_20_24_yrs"           "M_500_649_25_34_yrs"          
##  [55] "M_500_649_35_44_yrs"           "M_500_649_45_54_yrs"          
##  [57] "M_500_649_55_64_yrs"           "M_500_649_65_74_yrs"          
##  [59] "M_500_649_75_84_yrs"           "M_500_649_85ov"               
##  [61] "M_500_649_Tot"                 "M_650_799_15_19_yrs"          
##  [63] "M_650_799_20_24_yrs"           "M_650_799_25_34_yrs"          
##  [65] "M_650_799_35_44_yrs"           "M_650_799_45_54_yrs"          
##  [67] "M_650_799_55_64_yrs"           "M_650_799_65_74_yrs"          
##  [69] "M_650_799_75_84_yrs"           "M_650_799_85ov"               
##  [71] "M_650_799_Tot"                 "M_800_999_15_19_yrs"          
##  [73] "M_800_999_20_24_yrs"           "M_800_999_25_34_yrs"          
##  [75] "M_800_999_35_44_yrs"           "M_800_999_45_54_yrs"          
##  [77] "M_800_999_55_64_yrs"           "M_800_999_65_74_yrs"          
##  [79] "M_800_999_75_84_yrs"           "M_800_999_85ov"               
##  [81] "M_800_999_Tot"                 "M_1000_1249_15_19_yrs"        
##  [83] "M_1000_1249_20_24_yrs"         "M_1000_1249_25_34_yrs"        
##  [85] "M_1000_1249_35_44_yrs"         "M_1000_1249_45_54_yrs"        
##  [87] "M_1000_1249_55_64_yrs"         "M_1000_1249_65_74_yrs"        
##  [89] "M_1000_1249_75_84_yrs"         "M_1000_1249_85ov"             
##  [91] "M_1000_1249_Tot"               "M_1250_1499_15_19_yrs"        
##  [93] "M_1250_1499_20_24_yrs"         "M_1250_1499_25_34_yrs"        
##  [95] "M_1250_1499_35_44_yrs"         "M_1250_1499_45_54_yrs"        
##  [97] "M_1250_1499_55_64_yrs"         "M_1250_1499_65_74_yrs"        
##  [99] "M_1250_1499_75_84_yrs"         "M_1250_1499_85ov"             
## [101] "M_1250_1499_Tot"               "M_1500_1749_15_19_yrs"        
## [103] "M_1500_1749_20_24_yrs"         "M_1500_1749_25_34_yrs"        
## [105] "M_1500_1749_35_44_yrs"         "M_1500_1749_45_54_yrs"        
## [107] "M_1500_1749_55_64_yrs"         "M_1500_1749_65_74_yrs"        
## [109] "M_1500_1749_75_84_yrs"         "M_1500_1749_85ov"             
## [111] "M_1500_1749_Tot"               "M_1750_1999_15_19_yrs"        
## [113] "M_1750_1999_20_24_yrs"         "M_1750_1999_25_34_yrs"        
## [115] "M_1750_1999_35_44_yrs"         "M_1750_1999_45_54_yrs"        
## [117] "M_1750_1999_55_64_yrs"         "M_1750_1999_65_74_yrs"        
## [119] "M_1750_1999_75_84_yrs"         "M_1750_1999_85ov"             
## [121] "M_1750_1999_Tot"               "M_2000_2999_15_19_yrs"        
## [123] "M_2000_2999_20_24_yrs"         "M_2000_2999_25_34_yrs"        
## [125] "M_2000_2999_35_44_yrs"         "M_2000_2999_45_54_yrs"        
## [127] "M_2000_2999_55_64_yrs"         "M_2000_2999_65_74_yrs"        
## [129] "M_2000_2999_75_84_yrs"         "M_2000_2999_85ov"             
## [131] "M_2000_2999_Tot"               "M_3000_3499_15_19_yrs"        
## [133] "M_3000_3499_20_24_yrs"         "M_3000_3499_25_34_yrs"        
## [135] "M_3000_3499_35_44_yrs"         "M_3000_3499_45_54_yrs"        
## [137] "M_3000_3499_55_64_yrs"         "M_3000_3499_65_74_yrs"        
## [139] "M_3000_3499_75_84_yrs"         "M_3000_3499_85ov"             
## [141] "M_3000_3499_Tot"               "M_3500_more_15_19_yrs"        
## [143] "M_3500_more_20_24_yrs"         "M_3500_more_25_34_yrs"        
## [145] "M_3500_more_35_44_yrs"         "M_3500_more_45_54_yrs"        
## [147] "M_3500_more_55_64_yrs"         "M_3500_more_65_74_yrs"        
## [149] "M_3500_more_75_84_yrs"         "M_3500_more_85ov"             
## [151] "M_3500_more_Tot"               "M_PI_NS_15_19_yrs"            
## [153] "M_PI_NS_ns_20_24_yrs"          "M_PI_NS_ns_25_34_yrs"         
## [155] "M_PI_NS_ns_35_44_yrs"          "M_PI_NS_ns_45_54_yrs"         
## [157] "M_PI_NS_ns_55_64_yrs"          "M_PI_NS_ns_65_74_yrs"         
## [159] "M_PI_NS_ns_75_84_yrs"          "M_PI_NS_ns_85_yrs_ovr"        
## [161] "M_PI_NS_ns_Tot"                "M_Tot_15_19_yrs"              
## [163] "M_Tot_20_24_yrs"               "M_Tot_25_34_yrs"              
## [165] "M_Tot_35_44_yrs"               "M_Tot_45_54_yrs"              
## [167] "M_Tot_55_64_yrs"               "M_Tot_65_74_yrs"              
## [169] "M_Tot_75_84_yrs"               "M_Tot_85ov"                   
## [171] "M_Tot_Tot"                     "F_Neg_Nil_income_15_19_yrs"   
## [173] "F_Neg_Nil_income_20_24_yrs"    "F_Neg_Nil_income_25_34_yrs"   
## [175] "F_Neg_Nil_income_35_44_yrs"    "F_Neg_Nil_income_45_54_yrs"   
## [177] "F_Neg_Nil_income_55_64_yrs"    "F_Neg_Nil_income_65_74_yrs"   
## [179] "F_Neg_Nil_income_75_84_yrs"    "F_Neg_Nil_incme_85_yrs_ovr"   
## [181] "F_Neg_Nil_income_Tot"          "F_1_149_15_19_yrs"            
## [183] "F_1_149_20_24_yrs"             "F_1_149_25_34_yrs"            
## [185] "F_1_149_35_44_yrs"             "F_1_149_45_54_yrs"            
## [187] "F_1_149_55_64_yrs"             "F_1_149_65_74_yrs"            
## [189] "F_1_149_75_84_yrs"             "F_1_149_85ov"                 
## [191] "F_1_149_Tot"                   "F_150_299_15_19_yrs"          
## [193] "F_150_299_20_24_yrs"           "F_150_299_25_34_yrs"          
## [195] "F_150_299_35_44_yrs"           "F_150_299_45_54_yrs"          
## [197] "F_150_299_55_64_yrs"           "F_150_299_65_74_yrs"          
## [199] "F_150_299_75_84_yrs"           "F_150_299_85ov"               
## [201] "F_150_299_Tot"
names(tbl_G17B)
##   [1] "STE_CODE_2021"                 "F_300_399_15_19_yrs"          
##   [3] "F_300_399_20_24_yrs"           "F_300_399_25_34_yrs"          
##   [5] "F_300_399_35_44_yrs"           "F_300_399_45_54_yrs"          
##   [7] "F_300_399_55_64_yrs"           "F_300_399_65_74_yrs"          
##   [9] "F_300_399_75_84_yrs"           "F_300_399_85ov"               
##  [11] "F_300_399_Tot"                 "F_400_499_15_19_yrs"          
##  [13] "F_400_499_20_24_yrs"           "F_400_499_25_34_yrs"          
##  [15] "F_400_499_35_44_yrs"           "F_400_499_45_54_yrs"          
##  [17] "F_400_499_55_64_yrs"           "F_400_499_65_74_yrs"          
##  [19] "F_400_499_75_84_yrs"           "F_400_499_85ov"               
##  [21] "F_400_499_Tot"                 "F_500_649_15_19_yrs"          
##  [23] "F_500_649_20_24_yrs"           "F_500_649_25_34_yrs"          
##  [25] "F_500_649_35_44_yrs"           "F_500_649_45_54_yrs"          
##  [27] "F_500_649_55_64_yrs"           "F_500_649_65_74_yrs"          
##  [29] "F_500_649_75_84_yrs"           "F_500_649_85ov"               
##  [31] "F_500_649_Tot"                 "F_650_799_15_19_yrs"          
##  [33] "F_650_799_20_24_yrs"           "F_650_799_25_34_yrs"          
##  [35] "F_650_799_35_44_yrs"           "F_650_799_45_54_yrs"          
##  [37] "F_650_799_55_64_yrs"           "F_650_799_65_74_yrs"          
##  [39] "F_650_799_75_84_yrs"           "F_650_799_85ov"               
##  [41] "F_650_799_Tot"                 "F_800_999_15_19_yrs"          
##  [43] "F_800_999_20_24_yrs"           "F_800_999_25_34_yrs"          
##  [45] "F_800_999_35_44_yrs"           "F_800_999_45_54_yrs"          
##  [47] "F_800_999_55_64_yrs"           "F_800_999_65_74_yrs"          
##  [49] "F_800_999_75_84_yrs"           "F_800_999_85ov"               
##  [51] "F_800_999_Tot"                 "F_1000_1249_15_19_yrs"        
##  [53] "F_1000_1249_20_24_yrs"         "F_1000_1249_25_34_yrs"        
##  [55] "F_1000_1249_35_44_yrs"         "F_1000_1249_45_54_yrs"        
##  [57] "F_1000_1249_55_64_yrs"         "F_1000_1249_65_74_yrs"        
##  [59] "F_1000_1249_75_84_yrs"         "F_1000_1249_85ov"             
##  [61] "F_1000_1249_Tot"               "F_1250_1499_15_19_yrs"        
##  [63] "F_1250_1499_20_24_yrs"         "F_1250_1499_25_34_yrs"        
##  [65] "F_1250_1499_35_44_yrs"         "F_1250_1499_45_54_yrs"        
##  [67] "F_1250_1499_55_64_yrs"         "F_1250_1499_65_74_yrs"        
##  [69] "F_1250_1499_75_84_yrs"         "F_1250_1499_85ov"             
##  [71] "F_1250_1499_Tot"               "F_1500_1749_15_19_yrs"        
##  [73] "F_1500_1749_20_24_yrs"         "F_1500_1749_25_34_yrs"        
##  [75] "F_1500_1749_35_44_yrs"         "F_1500_1749_45_54_yrs"        
##  [77] "F_1500_1749_55_64_yrs"         "F_1500_1749_65_74_yrs"        
##  [79] "F_1500_1749_75_84_yrs"         "F_1500_1749_85ov"             
##  [81] "F_1500_1749_Tot"               "F_1750_1999_15_19_yrs"        
##  [83] "F_1750_1999_20_24_yrs"         "F_1750_1999_25_34_yrs"        
##  [85] "F_1750_1999_35_44_yrs"         "F_1750_1999_45_54_yrs"        
##  [87] "F_1750_1999_55_64_yrs"         "F_1750_1999_65_74_yrs"        
##  [89] "F_1750_1999_75_84_yrs"         "F_1750_1999_85ov"             
##  [91] "F_1750_1999_Tot"               "F_2000_2999_15_19_yrs"        
##  [93] "F_2000_2999_20_24_yrs"         "F_2000_2999_25_34_yrs"        
##  [95] "F_2000_2999_35_44_yrs"         "F_2000_2999_45_54_yrs"        
##  [97] "F_2000_2999_55_64_yrs"         "F_2000_2999_65_74_yrs"        
##  [99] "F_2000_2999_75_84_yrs"         "F_2000_2999_85ov"             
## [101] "F_2000_2999_Tot"               "F_3000_3499_15_19_yrs"        
## [103] "F_3000_3499_20_24_yrs"         "F_3000_3499_25_34_yrs"        
## [105] "F_3000_3499_35_44_yrs"         "F_3000_3499_45_54_yrs"        
## [107] "F_3000_3499_55_64_yrs"         "F_3000_3499_65_74_yrs"        
## [109] "F_3000_3499_75_84_yrs"         "F_3000_3499_85ov"             
## [111] "F_3000_3499_Tot"               "F_3500_more_15_19_yrs"        
## [113] "F_3500_more_20_24_yrs"         "F_3500_more_25_34_yrs"        
## [115] "F_3500_more_35_44_yrs"         "F_3500_more_45_54_yrs"        
## [117] "F_3500_more_55_64_yrs"         "F_3500_more_65_74_yrs"        
## [119] "F_3500_more_75_84_yrs"         "F_3500_more_85ov"             
## [121] "F_3500_more_Tot"               "F_PI_NS_15_19_yrs"            
## [123] "F_PI_NS_ns_20_24_yrs"          "F_PI_NS_ns_25_34_yrs"         
## [125] "F_PI_NS_ns_35_44_yrs"          "F_PI_NS_ns_45_54_yrs"         
## [127] "F_PI_NS_ns_55_64_yrs"          "F_PI_NS_ns_65_74_yrs"         
## [129] "F_PI_NS_ns_75_84_yrs"          "F_PI_NS_ns_85_yrs_ovr"        
## [131] "F_PI_NS_ns_Tot"                "F_Tot_15_19_yrs"              
## [133] "F_Tot_20_24_yrs"               "F_Tot_25_34_yrs"              
## [135] "F_Tot_35_44_yrs"               "F_Tot_45_54_yrs"              
## [137] "F_Tot_55_64_yrs"               "F_Tot_65_74_yrs"              
## [139] "F_Tot_75_84_yrs"               "F_Tot_85ov"                   
## [141] "F_Tot_Tot"                     "P_Neg_Nil_income_15_19_yrs"   
## [143] "P_Neg_Nil_income_20_24_yrs"    "P_Neg_Nil_income_25_34_yrs"   
## [145] "P_Neg_Nil_income_35_44_yrs"    "P_Neg_Nil_income_45_54_yrs"   
## [147] "P_Neg_Nil_income_55_64_yrs"    "P_Neg_Nil_income_65_74_yrs"   
## [149] "P_Neg_Nil_income_75_84_yrs"    "P_Negtve_Nil_incme_85_yrs_ovr"
## [151] "P_Neg_Nil_income_Tot"          "P_1_149_15_19_yrs"            
## [153] "P_1_149_20_24_yrs"             "P_1_149_25_34_yrs"            
## [155] "P_1_149_35_44_yrs"             "P_1_149_45_54_yrs"            
## [157] "P_1_149_55_64_yrs"             "P_1_149_65_74_yrs"            
## [159] "P_1_149_75_84_yrs"             "P_1_149_85ov"                 
## [161] "P_1_149_Tot"                   "P_150_299_15_19_yrs"          
## [163] "P_150_299_20_24_yrs"           "P_150_299_25_34_yrs"          
## [165] "P_150_299_35_44_yrs"           "P_150_299_45_54_yrs"          
## [167] "P_150_299_55_64_yrs"           "P_150_299_65_74_yrs"          
## [169] "P_150_299_75_84_yrs"           "P_150_299_85ov"               
## [171] "P_150_299_Tot"                 "P_300_399_15_19_yrs"          
## [173] "P_300_399_20_24_yrs"           "P_300_399_25_34_yrs"          
## [175] "P_300_399_35_44_yrs"           "P_300_399_45_54_yrs"          
## [177] "P_300_399_55_64_yrs"           "P_300_399_65_74_yrs"          
## [179] "P_300_399_75_84_yrs"           "P_300_399_85ov"               
## [181] "P_300_399_Tot"                 "P_400_499_15_19_yrs"          
## [183] "P_400_499_20_24_yrs"           "P_400_499_25_34_yrs"          
## [185] "P_400_499_35_44_yrs"           "P_400_499_45_54_yrs"          
## [187] "P_400_499_55_64_yrs"           "P_400_499_65_74_yrs"          
## [189] "P_400_499_75_84_yrs"           "P_400_499_85ov"               
## [191] "P_400_499_Tot"                 "P_500_649_15_19_yrs"          
## [193] "P_500_649_20_24_yrs"           "P_500_649_25_34_yrs"          
## [195] "P_500_649_35_44_yrs"           "P_500_649_45_54_yrs"          
## [197] "P_500_649_55_64_yrs"           "P_500_649_65_74_yrs"          
## [199] "P_500_649_75_84_yrs"           "P_500_649_85ov"               
## [201] "P_500_649_Tot"
names(tbl_G17C)
##   [1] "STE_CODE_2021"         "P_650_799_15_19_yrs"   "P_650_799_20_24_yrs"  
##   [4] "P_650_799_25_34_yrs"   "P_650_799_35_44_yrs"   "P_650_799_45_54_yrs"  
##   [7] "P_650_799_55_64_yrs"   "P_650_799_65_74_yrs"   "P_650_799_75_84_yrs"  
##  [10] "P_650_799_85ov"        "P_650_799_Tot"         "P_800_999_15_19_yrs"  
##  [13] "P_800_999_20_24_yrs"   "P_800_999_25_34_yrs"   "P_800_999_35_44_yrs"  
##  [16] "P_800_999_45_54_yrs"   "P_800_999_55_64_yrs"   "P_800_999_65_74_yrs"  
##  [19] "P_800_999_75_84_yrs"   "P_800_999_85ov"        "P_800_999_Tot"        
##  [22] "P_1000_1249_15_19_yrs" "P_1000_1249_20_24_yrs" "P_1000_1249_25_34_yrs"
##  [25] "P_1000_1249_35_44_yrs" "P_1000_1249_45_54_yrs" "P_1000_1249_55_64_yrs"
##  [28] "P_1000_1249_65_74_yrs" "P_1000_1249_75_84_yrs" "P_1000_1249_85ov"     
##  [31] "P_1000_1249_Tot"       "P_1250_1499_15_19_yrs" "P_1250_1499_20_24_yrs"
##  [34] "P_1250_1499_25_34_yrs" "P_1250_1499_35_44_yrs" "P_1250_1499_45_54_yrs"
##  [37] "P_1250_1499_55_64_yrs" "P_1250_1499_65_74_yrs" "P_1250_1499_75_84_yrs"
##  [40] "P_1250_1499_85ov"      "P_1250_1499_Tot"       "P_1500_1749_15_19_yrs"
##  [43] "P_1500_1749_20_24_yrs" "P_1500_1749_25_34_yrs" "P_1500_1749_35_44_yrs"
##  [46] "P_1500_1749_45_54_yrs" "P_1500_1749_55_64_yrs" "P_1500_1749_65_74_yrs"
##  [49] "P_1500_1749_75_84_yrs" "P_1500_1749_85ov"      "P_1500_1749_Tot"      
##  [52] "P_1750_1999_15_19_yrs" "P_1750_1999_20_24_yrs" "P_1750_1999_25_34_yrs"
##  [55] "P_1750_1999_35_44_yrs" "P_1750_1999_45_54_yrs" "P_1750_1999_55_64_yrs"
##  [58] "P_1750_1999_65_74_yrs" "P_1750_1999_75_84_yrs" "P_1750_1999_85ov"     
##  [61] "P_1750_1999_Tot"       "P_2000_2999_15_19_yrs" "P_2000_2999_20_24_yrs"
##  [64] "P_2000_2999_25_34_yrs" "P_2000_2999_35_44_yrs" "P_2000_2999_45_54_yrs"
##  [67] "P_2000_2999_55_64_yrs" "P_2000_2999_65_74_yrs" "P_2000_2999_75_84_yrs"
##  [70] "P_2000_2999_85ov"      "P_2000_2999_Tot"       "P_3000_3499_15_19_yrs"
##  [73] "P_3000_3499_20_24_yrs" "P_3000_3499_25_34_yrs" "P_3000_3499_35_44_yrs"
##  [76] "P_3000_3499_45_54_yrs" "P_3000_3499_55_64_yrs" "P_3000_3499_65_74_yrs"
##  [79] "P_3000_3499_75_84_yrs" "P_3000_3499_85ov"      "P_3000_3499_Tot"      
##  [82] "P_3500_more_15_19_yrs" "P_3500_more_20_24_yrs" "P_3500_more_25_34_yrs"
##  [85] "P_3500_more_35_44_yrs" "P_3500_more_45_54_yrs" "P_3500_more_55_64_yrs"
##  [88] "P_3500_more_65_74_yrs" "P_3500_more_75_84_yrs" "P_3500_more_85ov"     
##  [91] "P_3500_more_Tot"       "P_PI_NS_15_19_yrs"     "P_PI_NS_ns_20_24_yrs" 
##  [94] "P_PI_NS_ns_25_34_yrs"  "P_PI_NS_ns_35_44_yrs"  "P_PI_NS_ns_45_54_yrs" 
##  [97] "P_PI_NS_ns_55_64_yrs"  "P_PI_NS_ns_65_74_yrs"  "P_PI_NS_ns_75_84_yrs" 
## [100] "P_PI_NS_ns_85_yrs_ovr" "P_PI_NS_ns_Tot"        "P_Tot_15_19_yrs"      
## [103] "P_Tot_20_24_yrs"       "P_Tot_25_34_yrs"       "P_Tot_35_44_yrs"      
## [106] "P_Tot_45_54_yrs"       "P_Tot_55_64_yrs"       "P_Tot_65_74_yrs"      
## [109] "P_Tot_75_84_yrs"       "P_Tot_85ov"            "P_Tot_Tot"
# Combine all the data together
tbl_G17 <- bind_rows(tbl_G17A, tbl_G17B, tbl_G17C)
dim(tbl_G17) # gives dimensions of the new dataset
## [1]   3 511
# Change the format of the table to make it longer instead of wider
# This is a step closer to a tidy format
tbl_G17_long <- tbl_G17 |>
  pivot_longer(cols = -1, names_to = "category",
               values_to = "count")
View(tbl_G17_long)

# We want to split the strings using the "_"
# But there are multiple different cases to consider
# There are at least 5 cases we'll need to code for
underscore_count_per_category = str_count(
  string = tbl_G17_long$category, pattern = "_")
table(underscore_count_per_category)
## underscore_count_per_category
##    2    3    4    5    6 
##   18  252   90 1017  153
# What are the weird cases?
# Can look in the meta data / names / to help identify
# 1) Neg_Nil_income --> change to -Inf_0
# 1*) Negtve_Nil_income --> change to -Inf_0
# 2) more --> Inf
# 3) PI_NS --> NA_NA
# 4) 85ov --> 85_110
# 4*) 85_yrs_ov --> 85_110

# Use this code to explore the different sub cases we are going to need to code for
pattern_val = "ov" #Neg, Negtve, more, PI, ov, 85ov, 85_yrs
View(tbl_G17_long |> filter(str_detect(category, pattern_val)))

# Lots can go wrong in string matching
# You need to be very very precise in what you ask for

tbl_G17_long_formatted <- tbl_G17_long |>
  filter(!str_detect(string = category, pattern = "Tot"),
         !str_detect(category, "PI_NS")) |>
  mutate(
    category = str_replace(category, "Neg_Nil_income", "-Inf_0"),
    category = str_replace(category, "Neg_Nil_incme", "-Inf_0"),
    category = str_replace(category, "Negtve_Nil_incme", "-Inf_0"),
    category = str_replace(category, "more", "Inf"),
    category = str_replace(category, "85ov", "85_110_yrs"),
    category = str_replace(category, "85_yrs_ovr", "85_110_yrs"))

# seems like they are all have the same number of underscores now
underscore_count_per_category = str_count(tbl_G17_long_formatted$category,
                                          pattern = "_")
table(underscore_count_per_category)
## underscore_count_per_category
##    5 
## 1215
# The data can be converted to the tidy format
tbl_G17_tidy <- tbl_G17_long_formatted |>
  mutate(category = str_remove(category, "_yrs")) |>
  separate_wider_delim(cols = category, delim = "_",
                       names = c("sex", "income_min", "income_max", "age_min", "age_max")) |>
  unite("income", c(income_min, income_max), remove = FALSE) |>
  unite("age", c(age_min, age_max), remove = FALSE)
View(tbl_G17_tidy)

tbl_G17_tidy_STE = tbl_G17_tidy

To repeat this for the SA1 regions, you just need to change the following line of code.

data_paths = SA1_paths ## This line here to set the right path

tbl_G17A <- read_csv(data_paths[1])
tbl_G17B <- read_csv(data_paths[2])
tbl_G17C <- read_csv(data_paths[3])

tbl_G17 <- bind_rows(tbl_G17A, tbl_G17B, tbl_G17C)

tbl_G17_long <- tbl_G17 |>
  pivot_longer(cols = -1, names_to = "category",
               values_to = "count")

### WARNING: This takes a long time - there is a lot of data!
tbl_G17_long_formatted <- tbl_G17_long |>
  filter(!str_detect(string = category, pattern = "Tot"),
         !str_detect(category, "PI_NS")) |>
  mutate(
    category = str_replace(category, "Neg_Nil_income", "-Inf_0"),
    category = str_replace(category, "Neg_Nil_incme", "-Inf_0"),
    category = str_replace(category, "Negtve_Nil_incme", "-Inf_0"),
    category = str_replace(category, "more", "Inf"),
    category = str_replace(category, "85ov", "85_110_yrs"),
    category = str_replace(category, "85_yrs_ovr", "85_110_yrs"))

tbl_G17_tidy <- tbl_G17_long_formatted |>
  mutate(category = str_remove(category, "_yrs")) |>
  separate_wider_delim(cols = category, delim = "_",
                       names = c("sex", "income_min", "income_max", "age_min", "age_max")) |>
  unite("income", c(income_min, income_max), remove = FALSE) |>
  unite("age", c(age_min, age_max), remove = FALSE)

tbl_G17_tidy_SA1 = tbl_G17_tidy

As you get more advanced in coding, you can learn to wrap all this code in a function so you don’t need to copy and paste the same code from above.

🏃 Exercise 4C

vic_pop_sizes_STE <- tbl_G17_tidy_STE |>
  filter(sex == "P") |>
  pull(count)  

vic_pop_sizes_SA1 <- tbl_G17_tidy_STE |>
  filter(sex == "P") |>
  pull(count) 

total_vic_pop_sizes = data.frame(
  STE = vic_pop_sizes_STE |> sum(na.rm = TRUE) , 
  SA1 = vic_pop_sizes_SA1 |> sum(na.rm = TRUE))
tbl_G17_tidy = tbl_G17_tidy_STE 

# For plotting (so the labels appear in the right order on the axis)
# Try with and without this line to spot the difference

tbl_G17_tidy$income <- fct_reorder(tbl_G17_tidy$income,
                                   as.numeric(tbl_G17_tidy$income_min))

tbl_G17_tidy |>
  filter(sex != "P") |>
  group_by(sex) |>
  summarise(count = sum(count, na.rm = TRUE)) |>
  ggplot() +
  geom_col(aes(x = sex, y = count)) +
  ggtitle("1. Sex distribution in Victoria from 2021 Census") +
  theme_bw(base_size = 12)

tbl_G17_tidy |>
  group_by(age) |>
  summarise(count = sum(count, na.rm = TRUE)) |>
  ggplot() +
  geom_col(aes(x = age, y = count)) +
  ggtitle("1. Age distribution in Victoria from 2021 Census") +
  theme_bw(base_size = 12)

tbl_G17_tidy %>%
  group_by(income) %>%
  summarise(count = sum(count, na.rm = TRUE)) %>%
  ggplot() +
  geom_col(aes(x = income, y = count)) +
  ggtitle("3. Income distribution in Victoria from 2021 Census") +
  theme_bw(base_size = 12) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.3))

tbl_G17_tidy %>%
  filter(sex != "P") |>
  group_by(sex, age) %>%
  summarise(count = sum(count, na.rm = TRUE)) %>%
  ggplot() +
  geom_col(aes(x = age, y = count, fill = sex), position = "dodge") +
  ggtitle("4. Sex & age distribution in Victoria from 2021 Census") +
  theme_bw(base_size = 12)

tbl_G17_tidy %>%
  filter(sex != "P") |>
  group_by(sex, income) %>%
  summarise(count = sum(count, na.rm = TRUE)) %>%
  ggplot() +
  geom_col(aes(x = income, y = count, fill = sex), position = "dodge") +
  ggtitle("5. Sex & income distribution in Victoria from 2016 Census") +
  theme_bw(base_size = 12) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.3))

tbl_G17_tidy %>%
  group_by(age, income) %>%
  summarise(count = sum(count, na.rm = TRUE)) %>%
  ggplot() +
  geom_col(aes(x = age, y = count), position = "dodge") +
  facet_wrap(~income) +
  ggtitle("6. Age & income distribution in Victoria from 2021 Census") +
  theme_bw(base_size = 12) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.3))

tbl_G17_tidy %>%
  filter(sex != "P") |>
  ggplot(aes(x = age, y = count, fill = sex)) +
  geom_col(position = "dodge") +
  facet_wrap(~income) +
  theme_bw(base_size = 12) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.3)) +
  ggtitle("7. Income, sex & age distribution in Victoria from 2021 Census")

💃 Exercise 4D

We will use the STE data to extract the relevant statistics.

n_women_15_54 <- tbl_G17_tidy_STE %>% 
   filter(age_min >=15 & age_max <= 54 & sex == "F") %>% 
   pull(count) %>% 
   sum(na.rm = TRUE)

n_people_25_34_earn_1750_or_more <-  tbl_G17_tidy_STE %>% 
   filter(age_min >=25 & age_max <= 34 & sex == "P" & income_min >= 1750) %>% 
   pull(count) %>% 
   sum(na.rm = TRUE)

n_man_25_44 <-  tbl_G17_tidy_STE %>% 
   filter(age_min >=25 & age_max <= 44 & sex == "M") %>% 
   pull(count) %>% 
   sum(na.rm = TRUE)

n_man_25_44_earn_1500_or_less <-  tbl_G17_tidy_STE %>% 
   filter(age_min >=25 & age_max <= 44 & sex == "M" & income_max <= 1500) %>% 
   pull(count) %>% 
   sum(na.rm = TRUE)

n_vic = total_vic_pop_sizes$STE

According to the 2021 Census data:

Material originally developed by Dr. Emi Tanaka and maintained by Dr. Kate Saunders