🎯 Objectives

In this tutorial, you will learn to work with the Fannie Mae Single-Family Loan Performance data: compute default rates by origination vintage, map them across U.S. states, explore FICO score distributions, and pre-process the raw loan files yourself.

🔧 Preparation

Note: you should complete the preparation steps before the tutorial.

  1. Install the relevant R packages.
### Required packages
list.of.packages <- c("MASS", "data.table", "tidyverse", "here", "stringr",
                      "lubridate", "ggplot2", "usmap", "gganimate", "ltm")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
invisible(lapply(list.of.packages, require, character.only = TRUE))
  2. Unzip the ETC5512-week07.zip file provided, and notice that the file structure for this week looks like:
etc5512-week07
├── tutorial-07.Rmd
├── final data
│   ├──  2016_22_data.rds
├── raw data
├── prep
│   ├──  00_read_data.R
│   ├──  01_import_data.R
│   ├──  02_combine.R
└── etc5512-week07.Rproj
  3. Import the data by loading the R data file (.rds).

Note: This file has been pre-processed for you to work on the exercises in this tutorial.

This file contains the Fannie Mae Single-Family Loan Performance data from 2016Q1 to 2022Q3.

alldata <- readRDS(here::here("final data", "2016_22_data.rds"))

🕰️ Exercise 7A

Analyzing Data

  1. Create a table of default rates (based on 60, 90, 120 and 180 days of delinquency) by origination year and quarter. You might want to convert ORIG_DTE to a date format first. Do you observe anything peculiar in the table? Explain why.
dim(alldata)
## [1] 20059771       75
# Familiarize yourself with the variables
colnames(alldata)
##  [1] "LOAN_ID"            "ORIG_CHN"           "SELLER"            
##  [4] "loan_age"           "orig_rt"            "orig_amt"          
##  [7] "orig_trm"           "oltv"               "ocltv"             
## [10] "num_bo"             "dti"                "CSCORE_B"          
## [13] "FTHB_FLG"           "purpose"            "PROP_TYP"          
## [16] "NUM_UNIT"           "occ_stat"           "state"             
## [19] "zip_3"              "mi_pct"             "CSCORE_C"          
## [22] "relo_flg"           "MI_TYPE"            "AQSN_DTE"          
## [25] "ORIG_DTE"           "FRST_DTE"           "LAST_RT"           
## [28] "LAST_UPB"           "msa"                "FCC_COST"          
## [31] "PP_COST"            "AR_COST"            "IE_COST"           
## [34] "TAX_COST"           "NS_PROCS"           "CE_PROCS"          
## [37] "RMW_PROCS"          "O_PROCS"            "repch_flag"        
## [40] "LAST_ACTIVITY_DATE" "LPI_DTE"            "FCC_DTE"           
## [43] "DISP_DTE"           "SERVICER"           "F30_DTE"           
## [46] "F60_DTE"            "F90_DTE"            "F120_DTE"          
## [49] "F180_DTE"           "FCE_DTE"            "F180_UPB"          
## [52] "FCE_UPB"            "F30_UPB"            "F60_UPB"           
## [55] "F90_UPB"            "MOD_FLAG"           "FMOD_DTE"          
## [58] "FMOD_UPB"           "MODIR_COST"         "MODFB_COST"        
## [61] "MODFG_COST"         "MODTRM_CHNG"        "MODUPB_CHNG"       
## [64] "z_num_periods_120"  "F120_UPB"           "CSCORE_MN"         
## [67] "ORIG_VAL"           "LAST_DTE"           "LAST_STAT"         
## [70] "COMPLT_FLG"         "INT_COST"           "PFG_COST"          
## [73] "NET_LOSS"           "NET_SEV"            "MODTOT_COST"
# Convert ORIG_DTE to date format
alldata <- alldata |> 
  mutate(orig_yq = quarter(ymd(ORIG_DTE), type = "year.quarter"),
         orig_yr = year(ymd(ORIG_DTE)))

# default rate by origination year quarter
alldata |> 
  group_by(orig_yq) |> 
  summarise(D180 = sum(!is.na(F180_DTE))/length(F180_DTE),
            D120 = sum(!is.na(F120_DTE))/length(F120_DTE),
            D90 = sum(!is.na(F90_DTE))/length(F90_DTE),
            D60 = sum(!is.na(F60_DTE))/length(F60_DTE)) -> delin.table

delin.table
## # A tibble: 42 × 5
##    orig_yq   D180   D120    D90    D60
##      <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##  1   2010. 0      0      0      0     
##  2   2011. 0      0      0      0     
##  3   2011. 0      0      0      0     
##  4   2012. 0      0      0      0     
##  5   2012. 0      0      0      0     
##  6   2013. 0      0      0      0     
##  7   2013. 0      0      0      0     
##  8   2014. 0      0      0      0     
##  9   2014. 0.0345 0.0345 0.0345 0.0345
## 10   2014. 0      0.0244 0.0488 0.0488
## # … with 32 more rows
delin.table |> 
  pivot_longer(cols = starts_with("D"),
               names_to = "Delinquency Days",
               values_to = "Rate") |> 
  filter(orig_yq >= 2016.1) |> 
  ggplot(aes(x = orig_yq,
             y = Rate,
             colour = `Delinquency Days` )) +
  geom_line() +
  labs(
    title = "Fannie Mae default rates by origination year",
    subtitle = "based on 30-year fixed-rate mortgage",
    caption = "Data source: Fannie Mae's Single Home Loan",
    y = "Default rate",
    x = "Loan Origination Quarter")

  2. Draw a U.S. map showing the default rate based on 90 days of delinquency, just for loans originated in 2016. Which three states have the highest default rates?
datamap <- alldata |> 
  filter(orig_yr == 2016) |> 
  group_by(state, orig_yr) |> 
  summarize(D90 = sum(!is.na(F90_DTE))/length(F90_DTE))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
plot_usmap(
  data = datamap,
  labels = TRUE,
  values = "D90",
  color = "red"
) +
  scale_fill_continuous(name = "Delinquency by 90 days", low = "white", high = "red") +
  theme(legend.position = "right") 
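
The map gives the visual answer; to read off the three states with the highest 90-day default rate directly, one option using the datamap object built above is:

# Top 3 states by 90-day default rate for 2016 originations
datamap |> 
  ungroup() |> 
  slice_max(D90, n = 3)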

  3. Draw a histogram to show the distribution of FICO scores in different years. What do you notice?
#Delinquency by FICO Scores
ggplot(alldata |> filter(orig_yr >= 2016),
       aes(x = CSCORE_B,
           group = orig_yr)) +
  geom_histogram(fill = "red") +
  ggtitle("FICO scores distribution") +
  labs(subtitle = ("Year: 2016-2022"),
       y = "Frequency",
       x = "FICO Score") +
  facet_wrap(~orig_yr)

  4. The way a FICO score is interpreted is tricky: a small increase at a high score does not mean the same thing as a small increase at a low score, so the variable needs to be regrouped. Regroup the FICO scores as below.
Credit Score   Rating
300-579        Poor
580-669        Fair
670-739        Good
740-799        Very Good
800-850        Exceptional
alldata |> 
  mutate(FICO_status = case_when(
           # keep missing scores as NA instead of letting them fall
           # through to "Exceptional"
           is.na(CSCORE_B) ~ NA_character_,
           CSCORE_B < 580 ~ "Poor",
           CSCORE_B < 670 ~ "Fair",
           CSCORE_B < 740 ~ "Good",
           CSCORE_B < 800 ~ "Very Good",
           TRUE ~ "Exceptional"
         )) -> newdata
#head(newdata)
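
With the scores regrouped, a natural check is to compare 90-day default rates across the rating bands. A minimal sketch using the newdata object and the F90_DTE flag from earlier:

# 90-day default rate within each FICO rating band
newdata |> 
  group_by(FICO_status) |> 
  summarise(n_loans = n(),
            D90 = sum(!is.na(F90_DTE)) / n())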

🏗️ Exercise 7B

The data that you have used so far was pre-processed from the raw data that you can get from the Fannie Mae Single-Family Loan Performance Data. Try the steps below to pre-process the data yourself, using just the data from 2016Q1 to 2022Q3 as an exercise.

  1. Download the data
    • Go to this page. On the right panel, click Access the Data. Register for an account if you have not already (it is free). Click HP (historical loan credit performance data) / Download Data / choose the quarters from 2016Q1 to 2022Q3. The downloaded data should be placed in the raw data folder.
  2. Convert each file to a smaller size
    • Open 01_import_data.R (under the prep folder) and run the script.
    • Each output file should end in _stat.csv (located in the raw data folder). The conversion code has been pre-written for you in the 00_read_data.R script.
    • Read through 00_read_data.R to see how the data is converted from transaction-level data to single account-level data; a rough sketch of the idea is given after this list.
  3. Combining data
    • Run 02_combine.R (under the prep folder; it may take a while to run). You should end up with about 20 million records. Yes, 20 million rows!
  4. Analyze data
    • Now the data is ready!
    • Use the data to replicate the plots in the lecture.
    • Try to explore the questions below:
      • Which state has the highest default rate?
      • Which type of customer is riskier: those with higher LMI or lower LMI?
      • Are customers with a higher interest rate at origination more likely to default?
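
The prep scripts themselves are provided in the zip, but as a rough sketch of the idea (not the actual 00_read_data.R / 02_combine.R code): read each raw quarterly performance file, collapse it to one row per loan, then bind the quarterly summaries together and save a single .rds. The pipe delimiter, file patterns and column names (LOAN_ID, ACT_PERIOD, DLQ_STATUS) below are assumptions about the raw files, so check them against the real scripts.

library(data.table)
library(here)

# Step 2 idea: collapse a raw performance file to one row per loan,
# keeping the first period at which the loan is 90+ days (3+ payments) behind.
convert_one_quarter <- function(raw_path) {
  perf <- fread(raw_path, sep = "|")
  loan_level <- perf[, .(
    F90_DTE = ACT_PERIOD[match(TRUE, suppressWarnings(as.integer(DLQ_STATUS)) >= 3)]
  ), by = LOAN_ID]
  fwrite(loan_level, sub("\\.csv$", "_stat.csv", raw_path))
}

# Step 3 idea: bind the per-quarter loan-level summaries and save one file
stat_files <- list.files(here("raw data"), pattern = "_stat\\.csv$", full.names = TRUE)
combined <- rbindlist(lapply(stat_files, fread))
saveRDS(combined, here("final data", "2016_22_data.rds"))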
# some solutions are provided here

data <- readRDS(here::here("final data", "2016_22_data.rds"))

# some data are duplicated
alldata <- dplyr::distinct(data)

# Convert orig_dte to date format
alldata <- alldata |>
  mutate(orig_yq = quarter(ymd(ORIG_DTE), type = "year.quarter"),
         orig_yr = year(ymd(ORIG_DTE)))

# Question: Default rate by vintage
alldata |>
  group_by(orig_yq) |>
  summarise(D180 = sum(!is.na(F180_DTE))/length(F180_DTE),
            D120 = sum(!is.na(F120_DTE))/length(F120_DTE),
            D90 = sum(!is.na(F90_DTE))/length(F90_DTE),
            D60 = sum(!is.na(F60_DTE))/length(F60_DTE)) -> delin.table

delin.table |>
  pivot_longer(cols = starts_with("D"),
               names_to = "Delinquency Days",
               values_to = "Rate") |>
  filter(orig_yq >= 2016.1) |>
  ggplot(aes(x = orig_yq,
             y = Rate,
             colour = `Delinquency Days` )) +
  geom_line() +
  labs(
    title = "Fannie Mae default rates by origination year",
    subtitle = "based on 30-year fixed-rate mortgage",
    caption = "Data source: Fannie Mae's Single Home Loan",
    y = "Default rate",
    x = "Loan Origination Quarter")

ggsave("delinquent.png")
## Saving 7 x 5 in image
# Delinquency by FICO Scores
ggplot(alldata |> filter(orig_yr >= 2016),
       aes(x = CSCORE_B,
           group = orig_yr)) +
  geom_histogram(fill = "red") +
  ggtitle("FICO scores distribution from year 2016 to 2022") +
  labs(subtitle = ("Year: {closest_state}"),
       y = "Frequency",
       x = "FICO Score") +
  transition_states(orig_yr,
                    transition_length = 6,
                    state_length = 1) -> plot1
his_anim <- animate(plot1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 11254 rows containing non-finite values (`stat_bin()`).
anim_save("Fico histogram.gif", his_anim)

# US Map
datamap <- alldata |>
  filter(orig_yq >= 2016.1) |>
  group_by(state, orig_yr) |>
  summarize(D90 = sum(!is.na(F90_DTE))/length(F90_DTE))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
plot_usmap(
  data = datamap,
  labels = TRUE,
  values = "D90",
  color = "red"
) +
  scale_fill_continuous(name = "Delinquency by 90 days", low = "white", high = "red") +
  theme(legend.position = "right") -> mapPlot

transitionMap <- mapPlot +
  labs(title = "Delinquency 90 Days {as.integer(frame_time)}") +
  transition_time(orig_yr)

anim <- animate(transitionMap, fps = 10)
anim_save("map.gif")


# Question: The default rate of FM loan
alldata |>
  filter(!(is.na(F180_DTE))) |>
  group_by(time1 = quarter(ymd(F180_DTE), type = "year.quarter")) |>
  summarise(def = n()) -> tot_def

# For each quarter, count how many loans had been originated on or before
# that quarter (the population at risk of defaulting in that quarter)
def_table <- as.data.frame(tot_def)
i <- 1
for (time2 in tot_def$time1) {
  def_table$no_cust[i] <- sum(alldata$orig_yq <= time2)
  i <- i + 1
}

# Quarterly defaults over loans at risk; x4 annualizes, x100 gives a percentage
def_table |>
  mutate(rate = def / no_cust * 100 * 4) -> def_table

def_table |>
  ggplot(aes(x = time1,
             y = rate)) +
  geom_line() +
  scale_y_continuous(
    labels = scales::number_format(accuracy = 0.01,
                                   decimal.mark = '.')) +
  labs(
    title = "Fannie Mae default rates over time",
    subtitle = "based on 30-year fixed-rate mortgage",
    caption = "Data source: Fannie Mae's Single Home Loan",
    y = "Annualized default rate (%)",
    x = "Time")

ggsave("def_rate_overtime.png")
## Saving 7 x 5 in image
# Question: The default rate by FICO
alldata |>
  filter(orig_yr >= 2016) |>
  group_by(CSCORE_B) |>
  summarise(rate = sum(!is.na(F180_DTE))/length(LOAN_ID)) |>
  ggplot(aes(x= CSCORE_B, y = rate)) +
  geom_col() +
  labs(x = 'FICO Score',
       y = 'Default rate')
## Warning: Removed 1 rows containing missing values (`position_stack()`).

# Question: The default rate by LTV
alldata |>
  filter(orig_yr >= 2016) |>
  group_by(oltv) |>
  summarise(rate = sum(!is.na(F180_DTE))/length(LOAN_ID)) |>
  ggplot(aes(x= oltv, y = rate)) +
  geom_col() +
  labs(x = 'Original Loan To Value Ratio',
       y = 'Default rate')

# Question: The default rate by DTI
alldata |>
  filter(orig_yr >= 2016) |>
  group_by(dti) |>
  summarise(rate = sum(!is.na(F180_DTE))/length(LOAN_ID)) |>
  ggplot(aes(x= dti, y = rate)) +
  geom_col() +
  labs(x = 'Debt to Income Ratio',
       y = 'Default rate')
## Warning: Removed 1 rows containing missing values (`position_stack()`).

# Question: Average credit score by vintage
alldata |>
  filter(orig_yr >= 2016) |>
  group_by(orig_yq) |>
  summarise(avg_score = mean(CSCORE_B, na.rm=T),
            sd_score_up = avg_score + 1 * sd(CSCORE_B, na.rm=T),
            sd_score_lo = avg_score - 1 * sd(CSCORE_B, na.rm=T),
            sd_score_up2 = avg_score + 2 * sd(CSCORE_B, na.rm=T),
            sd_score_lo2 = avg_score - 2 * sd(CSCORE_B, na.rm=T)) |>
  ggplot(aes(x=orig_yq, y = avg_score)) +
  geom_line() +
  geom_ribbon(aes(fill = "1 std dev", ymin = sd_score_lo, ymax = sd_score_up), alpha = 0.15) +
  geom_ribbon(aes(fill = "2 std dev", ymin = sd_score_lo2, ymax = sd_score_up2), alpha = 0.15) +
  scale_fill_manual("", values = c("red", "#fc03d3")) +
  labs(title = 'Average Credit Score by Vintage')

# Which state has the highest default rate?
# Note: counting loans only tells us which state has the most loans,
# not which has the highest default rate.
alldata |> 
  count(state) |> 
  filter(n == max(n))
##   state       n
## 1    CA 2936557
## California has by far the most loans.
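
# To answer the question as asked, compute the default rate per state.
# A minimal sketch, reusing the 90-day delinquency flag from earlier
# (output not reproduced here):
alldata |> 
  group_by(state) |> 
  summarise(D90 = sum(!is.na(F90_DTE)) / n()) |> 
  arrange(desc(D90))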

# Which type of customer is riskier: those with higher LMI or lower LMI?
alldata |> 
  mutate(default = as.factor(ifelse(!is.na(F90_DTE), 1, 0))) |> 
  ggplot(aes(x=mi_pct, fill = default)) +
  geom_bar(position = "fill")
## Warning: Removed 14573270 rows containing non-finite values (`stat_count()`).

## No evidence from the visualization.

# Are customers with a higher interest rate at origination more likely to default?
alldata |> 
  mutate(default = as.factor(ifelse(!is.na(F90_DTE), 1, 0))) |> 
  ggplot(aes(x=orig_rt, fill = default)) +
  geom_bar(position = "fill")

## Yes, it looks like loans with a higher interest rate at origination are more likely to default.