In this tutorial, you will learn to explore the Fannie Mae Single-Family Loan Performance data: computing mortgage delinquency rates by vintage, FICO score, and state, and visualising them with ggplot2, usmap, and gganimate.
Note: you should complete the preparation work before the tutorial.
### Required packages
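# Install any packages that are still missing, then load them all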
list.of.packages <- c("MASS", "data.table", "tidyverse", "here", "stringr",
"lubridate", "ggplot2", "usmap", "gganimate", "ltm")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
invisible(lapply(list.of.packages, require, character.only = TRUE))
etc5512-week07
├── tutorial-07.Rmd
├── final data
│ ├── 2016_22_data.rds
├── raw data
├── prep
│ ├── 00_read_data.R
│ ├── 01_import_data.R
│ ├── 02_combine.R
└── etc5512-week07.Rproj
Note: This file has been pre-processed for you to work on the exercises in this tutorial.
This file contains the Fannie Mae Single-Family Loan Performance data from 2016Q1 to 2022Q3.
alldata <- readRDS(here::here("final data", "2016_22_data.rds"))
### Analyzing Data

Convert ORIG_DTE to a date format first. Do you observe something peculiar from the table? Explain why.

dim(alldata)
## [1] 20059771 75
# Familiarize yourself with the variables
colnames(alldata)
## [1] "LOAN_ID" "ORIG_CHN" "SELLER"
## [4] "loan_age" "orig_rt" "orig_amt"
## [7] "orig_trm" "oltv" "ocltv"
## [10] "num_bo" "dti" "CSCORE_B"
## [13] "FTHB_FLG" "purpose" "PROP_TYP"
## [16] "NUM_UNIT" "occ_stat" "state"
## [19] "zip_3" "mi_pct" "CSCORE_C"
## [22] "relo_flg" "MI_TYPE" "AQSN_DTE"
## [25] "ORIG_DTE" "FRST_DTE" "LAST_RT"
## [28] "LAST_UPB" "msa" "FCC_COST"
## [31] "PP_COST" "AR_COST" "IE_COST"
## [34] "TAX_COST" "NS_PROCS" "CE_PROCS"
## [37] "RMW_PROCS" "O_PROCS" "repch_flag"
## [40] "LAST_ACTIVITY_DATE" "LPI_DTE" "FCC_DTE"
## [43] "DISP_DTE" "SERVICER" "F30_DTE"
## [46] "F60_DTE" "F90_DTE" "F120_DTE"
## [49] "F180_DTE" "FCE_DTE" "F180_UPB"
## [52] "FCE_UPB" "F30_UPB" "F60_UPB"
## [55] "F90_UPB" "MOD_FLAG" "FMOD_DTE"
## [58] "FMOD_UPB" "MODIR_COST" "MODFB_COST"
## [61] "MODFG_COST" "MODTRM_CHNG" "MODUPB_CHNG"
## [64] "z_num_periods_120" "F120_UPB" "CSCORE_MN"
## [67] "ORIG_VAL" "LAST_DTE" "LAST_STAT"
## [70] "COMPLT_FLG" "INT_COST" "PFG_COST"
## [73] "NET_LOSS" "NET_SEV" "MODTOT_COST"
# Convert ORIG_DTE to a date format
alldata <- alldata |>
mutate(orig_yq = quarter(ymd(ORIG_DTE), type = "year.quarter"),
orig_yr = year(ymd(ORIG_DTE)))
# default rate by origination year quarter
alldata |>
group_by(orig_yq) |>
summarise(D180 = sum(!is.na(F180_DTE))/length(F180_DTE),
D120 = sum(!is.na(F120_DTE))/length(F120_DTE),
D90 = sum(!is.na(F90_DTE))/length(F90_DTE),
D60 = sum(!is.na(F60_DTE))/length(F60_DTE)) -> delin.table
delin.table
## # A tibble: 42 × 5
## orig_yq D180 D120 D90 D60
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2010. 0 0 0 0
## 2 2011. 0 0 0 0
## 3 2011. 0 0 0 0
## 4 2012. 0 0 0 0
## 5 2012. 0 0 0 0
## 6 2013. 0 0 0 0
## 7 2013. 0 0 0 0
## 8 2014. 0 0 0 0
## 9 2014. 0.0345 0.0345 0.0345 0.0345
## 10 2014. 0 0.0244 0.0488 0.0488
## # … with 32 more rows

The peculiar thing: although the file covers loans acquired from 2016Q1 to 2022Q3, origination quarters stretch back to 2010. Fannie Mae also acquires seasoned loans, so a loan acquired in 2016 can have an ORIG_DTE several years earlier. Those early vintages contain very few loans, which is why their default rates are mostly zero or jump around (0.0345, for example, is consistent with 1 default out of 29 loans).
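A quick way to verify this (a sketch) is to count loans per origination year; the pre-2016 vintages should be tiny:

# Sketch: number of loans per origination year
alldata |>
  count(orig_yr) |>
  arrange(orig_yr)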
delin.table |>
pivot_longer(cols = starts_with("D"),
names_to = "Delinquency Days",
values_to = "Rate") |>
filter(orig_yq >= 2016.1) |>
ggplot(aes(x = orig_yq,
y = Rate,
colour = `Delinquency Days` )) +
geom_line() +
labs(
title = "Fannie Mae default rates by origination year",
subtitle = "based on 30-year fixed-rate mortgage",
caption = "Data source: Fannie Mae's Single Home Loan",
y = "Default rate",
x = "Loan Origination Quarter")
datamap <- alldata |>
filter(orig_yr == 2016) |>
group_by(state, orig_yr) |>
summarize(D90 = sum(!is.na(F90_DTE))/length(F90_DTE))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
plot_usmap(
data = datamap,
labels = TRUE,
values = "D90",
color = "red"
) +
scale_fill_continuous(name = "Delinquency by 90 days", low = "white", high = "red") +
theme(legend.position = "right")
# Delinquency by FICO scores
ggplot(alldata |> filter(orig_yr >= 2016),
       aes(x = CSCORE_B,
           group = orig_yr)) +
  geom_histogram(fill = "red") +
  ggtitle("FICO scores distribution") +
  labs(subtitle = "Year: 2016-2022",
       y = "Frequency",
       x = "FICO Score") +
  facet_wrap(~orig_yr)
Credit Score | Rating
------------ | -----------
300-579 | Poor
580-669 | Fair
670-739 | Good
740-799 | Very Good
800-850 | Exceptional
alldata |>
  mutate(FICO_status = case_when(
    is.na(CSCORE_B) ~ NA_character_, # keep missing scores missing rather than "Exceptional"
    CSCORE_B < 580 ~ "Poor",
    CSCORE_B < 670 ~ "Fair",
    CSCORE_B < 740 ~ "Good",
    CSCORE_B < 800 ~ "Very Good",
    TRUE ~ "Exceptional"
  )) -> newdata
#head(newdata)
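With the rating bands attached, a natural sanity check (a sketch, not one of the original exercises) is whether default rates fall as the band improves:

# Hypothetical follow-up: 180-day default rate within each FICO band
newdata |>
  group_by(FICO_status) |>
  summarise(n = n(),
            D180 = sum(!is.na(F180_DTE)) / n())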
The data that you have used so far was pre-processed from the raw data, which you can get from the Fannie Mae Single-Family Loan Performance Data site. As an exercise, try the steps below to pre-process the data yourself, using just the data from 2016Q1 to 2022Q3.
### Access the Data
1. Register for an account if you have not (it is free). Click *Hp (historical loan credit performance data)*, then *Download data*, and choose the quarters 2021Q1 to 2021Q3. The downloaded data should be placed under the `raw data` folder.
2. Open `01_import_data.R` (under the `prep` folder) and run the script. This saves each quarter with `_stat.csv` at the end of the file name (located in the `raw data` folder). The code has been pre-written for you in the `00_read_data.R` script.
3. Read through `00_read_data.R` to see how the data is being converted from transactions data to single-account-level data.
4. Run `02_combine.R` (under the `prep` folder; a simplified sketch of what it does follows this list). It may take a while to load. You should get about 20 million records. Yes, 20 million rows!
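To make the pipeline concrete, here is a minimal sketch of the combining step, assuming the imported quarterly files end in `_stat.csv` inside the `raw data` folder (assumed layout for illustration; the real logic lives in the prep scripts):

# Sketch of the combine step (assumed file layout; see 02_combine.R for the real version)
library(data.table)
library(here)
files <- list.files(here("raw data"), pattern = "_stat\\.csv$", full.names = TRUE)
combined <- rbindlist(lapply(files, fread), fill = TRUE) # stack all quarters
saveRDS(combined, here("final data", "2016_22_data.rds"))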
# Some solutions are provided here
data <- readRDS(here::here("final data", "2016_22_data.rds"))
# Some records are duplicated, so keep only distinct rows
alldata <- dplyr::distinct(data)
# Convert orig_dte to date format
alldata <- alldata |>
mutate(orig_yq = quarter(ymd(ORIG_DTE), type = "year.quarter"),
orig_yr = year(ymd(ORIG_DTE)))
# Question: Default rate by vintage
alldata |>
group_by(orig_yq) |>
summarise(D180 = sum(!is.na(F180_DTE))/length(F180_DTE),
D120 = sum(!is.na(F120_DTE))/length(F120_DTE),
D90 = sum(!is.na(F90_DTE))/length(F90_DTE),
D60 = sum(!is.na(F60_DTE))/length(F60_DTE)) -> delin.table
delin.table |>
pivot_longer(cols = starts_with("D"),
names_to = "Delinquency Days",
values_to = "Rate") |>
filter(orig_yq >= 2016.1) |>
ggplot(aes(x = orig_yq,
y = Rate,
colour = `Delinquency Days` )) +
geom_line() +
labs(
title = "Fannie Mae default rates by origination year",
subtitle = "based on 30-year fixed-rate mortgage",
caption = "Data source: Fannie Mae's Single Home Loan",
y = "Default rate",
x = "Loan Origination Quarter")
ggsave("delinquent.png")
## Saving 7 x 5 in image
# Delinquency by FICO scores, animated by origination year
ggplot(alldata |> filter(orig_yr >= 2016),
       aes(x = CSCORE_B,
           group = orig_yr)) +
  geom_histogram(fill = "red") +
  ggtitle("FICO scores distribution from 2016 to 2022") +
  labs(subtitle = "Year: {closest_state}",
       y = "Frequency",
       x = "FICO Score") +
  transition_states(orig_yr,
                    transition_length = 6,
                    state_length = 1) -> plot1
his_anim <- animate(plot1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 11254 rows containing non-finite values (`stat_bin()`).
anim_save("Fico histogram.gif", his_anim)
# US Map
datamap <- alldata |>
filter(orig_yq >= 2016.1) |>
group_by(state, orig_yr) |>
summarize(D90 = sum(!is.na(F90_DTE))/length(F90_DTE))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
plot_usmap(
data = datamap,
labels = TRUE,
values = "D90",
color = "red"
) +
scale_fill_continuous(name = "Delinquency by 90 days", low = "white", high = "red") +
theme(legend.position = "right") -> mapPlot
transitionMap <- mapPlot +
labs(title = "Delinquency 90 Days {as.integer(frame_time)}") +
transition_time(orig_yr)
anim <- animate(transitionMap, fps = 10)
anim_save("map.gif")
# Question: The default rate of FM loans over time
alldata |>
  filter(!is.na(F180_DTE)) |>
  group_by(time1 = quarter(ymd(F180_DTE), type = "year.quarter")) |>
  summarise(def = n()) -> tot_def
def_table <- tot_def
# For each default quarter, count the loans originated on or before it
def_table$no_cust <- sapply(def_table$time1,
                            function(time2) sum(alldata$orig_yq <= time2))
def_table |>
  # defaults per loan at risk, times 100 for %, times 4 to annualize the quarterly rate
  mutate(rate = def/no_cust*100*4) -> def_table
def_table |>
ggplot(aes(x = time1,
y = rate)) +
geom_line() +
scale_y_continuous(
labels = scales::number_format(accuracy = 0.01,
decimal.mark = '.')) +
labs(
title = "Fannie Mae default rates over time",
subtitle = "based on 30-year fixed-rate mortgage",
caption = "Data source: Fannie Mae's Single Home Loan",
y = "Annualized default rate (%)",
x = "Time")
ggsave("def_rate_overtime.png")
## Saving 7 x 5 in image
# Question: The default rate by FICO
alldata |>
filter(orig_yr >= 2016) |>
group_by(CSCORE_B) |>
summarise(rate = sum(!is.na(F180_DTE))/length(LOAN_ID)) |>
ggplot(aes(x= CSCORE_B, y = rate)) +
geom_col() +
labs(x = 'FICO Score',
y = 'Default rate')
## Warning: Removed 1 rows containing missing values (`position_stack()`).
# Question: The default rate by LTV
alldata |>
filter(orig_yr >= 2016) |>
group_by(oltv) |>
summarise(rate = sum(!is.na(F180_DTE))/length(LOAN_ID)) |>
ggplot(aes(x= oltv, y = rate)) +
geom_col() +
labs(x = 'Original Loan To Value Ratio',
y = 'Default rate')
# Question: The default rate by DTI
alldata |>
filter(orig_yr >= 2016) |>
group_by(dti) |>
summarise(rate = sum(!is.na(F180_DTE))/length(LOAN_ID)) |>
ggplot(aes(x= dti, y = rate)) +
geom_col() +
labs(x = 'Debt to Income Ratio',
y = 'Default rate')
## Warning: Removed 1 rows containing missing values (`position_stack()`).
# Question: Average credit score by vintage
alldata |>
filter(orig_yr >= 2016) |>
group_by(orig_yq) |>
summarise(avg_score = mean(CSCORE_B, na.rm=T),
sd_score_up = avg_score + 1 * sd(CSCORE_B, na.rm=T),
sd_score_lo = avg_score - 1 * sd(CSCORE_B, na.rm=T),
sd_score_up2 = avg_score + 2 * sd(CSCORE_B, na.rm=T),
sd_score_lo2 = avg_score - 2 * sd(CSCORE_B, na.rm=T)) |>
ggplot(aes(x=orig_yq, y = avg_score)) +
geom_line() +
geom_ribbon(aes(fill = "1 std dev", ymin = sd_score_lo, ymax = sd_score_up), alpha = 0.15) +
geom_ribbon(aes(fill = "2 std dev", ymin = sd_score_lo2, ymax = sd_score_up2), alpha = 0.15) +
scale_fill_manual("", values = c("red", "#fc03d3")) +
labs(title = 'Average Credit Score by Vintage')
# Which state has the highest default rate?
# Note: the count below only finds the state with the most loans;
# a sketch computing the rate itself follows this chunk.
alldata |>
  count(state) |>
  filter(n == max(n))
## state n
## 1 CA 2936557
## California has by far the most loans.
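To answer the question as stated, compare delinquency rates rather than raw counts. A minimal sketch (not run against this data, so no output is shown):

# Sketch: 90-day delinquency rate by state, highest first
alldata |>
  group_by(state) |>
  summarise(D90 = sum(!is.na(F90_DTE)) / n()) |>
  arrange(desc(D90)) |>
  slice_head(n = 5)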
# Which customers are more risky: those with higher LMI or lower LMI (mi_pct)?
alldata |>
  mutate(default = as.factor(ifelse(!is.na(F90_DTE), 1, 0))) |>
  ggplot(aes(x = mi_pct, fill = default)) +
  geom_bar(position = "fill")
## Warning: Removed 14573270 rows containing non-finite values (`stat_count()`).
## No clear evidence from the visualization.
# Do customers with a higher interest rate at origination have a higher chance of default?
alldata |>
  mutate(default = as.factor(ifelse(!is.na(F90_DTE), 1, 0))) |>
  ggplot(aes(x = orig_rt, fill = default)) +
  geom_bar(position = "fill")
## Yes, it looks like loans charged a higher interest rate are more likely to default.
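To move beyond the visual impression, one option (a sketch; the univariate logistic regression is my assumption, not part of the tutorial) is to regress a delinquency indicator on the origination rate:

# Sketch: logistic regression of 90-day delinquency on origination rate;
# a positive, significant orig_rt coefficient would support the conclusion above
fit <- glm(default ~ orig_rt,
           family = binomial(),
           data = alldata |> mutate(default = as.numeric(!is.na(F90_DTE))))
summary(fit)$coefficients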