In this tutorial, you will learn to explore the Fannie Mae Single-Family Loan Performance data: computing mortgage delinquency rates by vintage, FICO score, and state, and visualising them with ggplot2, usmap, and gganimate.
Note: you should complete the preparation work before the tutorial.
### Required packages
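# Install any packages that are still missing, then load them all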
list.of.packages <- c("MASS", "data.table", "tidyverse", "here", "stringr",
"lubridate", "ggplot2", "usmap", "gganimate", "ltm")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
invisible(lapply(list.of.packages, require, character.only = TRUE))
etc5512-week07
├── tutorial-07.Rmd
├── final data
│ ├── 2016_22_data.rds
├── raw data
├── prep
│ ├── 00_read_data.R
│ ├── 01_import_data.R
│ ├── 02_combine.R
└── etc5512-week07.Rproj
Note: This file has been pre-processed for you to work on the exercises in this tutorial.
This file contains the Fannie Mae Single-Family Loan Performance data from 2016Q1 to 2022Q3.
alldata <- readRDS(here::here("final data", "2016_22_data.rds"))
### Analyzing Data

Convert ORIG_DTE to a date format first. Do you observe something peculiar from the table? Explain why.

dim(alldata)
## [1] 20059771 75
# Familiarize yourself with the variables
colnames(alldata)
## [1] "LOAN_ID" "ORIG_CHN" "SELLER"
## [4] "loan_age" "orig_rt" "orig_amt"
## [7] "orig_trm" "oltv" "ocltv"
## [10] "num_bo" "dti" "CSCORE_B"
## [13] "FTHB_FLG" "purpose" "PROP_TYP"
## [16] "NUM_UNIT" "occ_stat" "state"
## [19] "zip_3" "mi_pct" "CSCORE_C"
## [22] "relo_flg" "MI_TYPE" "AQSN_DTE"
## [25] "ORIG_DTE" "FRST_DTE" "LAST_RT"
## [28] "LAST_UPB" "msa" "FCC_COST"
## [31] "PP_COST" "AR_COST" "IE_COST"
## [34] "TAX_COST" "NS_PROCS" "CE_PROCS"
## [37] "RMW_PROCS" "O_PROCS" "repch_flag"
## [40] "LAST_ACTIVITY_DATE" "LPI_DTE" "FCC_DTE"
## [43] "DISP_DTE" "SERVICER" "F30_DTE"
## [46] "F60_DTE" "F90_DTE" "F120_DTE"
## [49] "F180_DTE" "FCE_DTE" "F180_UPB"
## [52] "FCE_UPB" "F30_UPB" "F60_UPB"
## [55] "F90_UPB" "MOD_FLAG" "FMOD_DTE"
## [58] "FMOD_UPB" "MODIR_COST" "MODFB_COST"
## [61] "MODFG_COST" "MODTRM_CHNG" "MODUPB_CHNG"
## [64] "z_num_periods_120" "F120_UPB" "CSCORE_MN"
## [67] "ORIG_VAL" "LAST_DTE" "LAST_STAT"
## [70] "COMPLT_FLG" "INT_COST" "PFG_COST"
## [73] "NET_LOSS" "NET_SEV" "MODTOT_COST"
# Convert ORIG_DTE to a date format
alldata <- alldata |>
mutate(orig_yq = quarter(ymd(ORIG_DTE), type = "year.quarter"),
orig_yr = year(ymd(ORIG_DTE)))
# default rate by origination year quarter
alldata |>
group_by(orig_yq) |>
summarise(D180 = sum(!is.na(F180_DTE))/length(F180_DTE),
D120 = sum(!is.na(F120_DTE))/length(F120_DTE),
D90 = sum(!is.na(F90_DTE))/length(F90_DTE),
D60 = sum(!is.na(F60_DTE))/length(F60_DTE)) -> delin.table
delin.table
## # A tibble: 42 × 5
## orig_yq D180 D120 D90 D60
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2010. 0 0 0 0
## 2 2011. 0 0 0 0
## 3 2011. 0 0 0 0
## 4 2012. 0 0 0 0
## 5 2012. 0 0 0 0
## 6 2013. 0 0 0 0
## 7 2013. 0 0 0 0
## 8 2014. 0 0 0 0
## 9 2014. 0.0345 0.0345 0.0345 0.0345
## 10 2014. 0 0.0244 0.0488 0.0488
## # … with 32 more rows

The peculiar thing: although the file covers loans acquired from 2016Q1 to 2022Q3, origination quarters stretch back to 2010. Fannie Mae also acquires seasoned loans, so a loan acquired in 2016 can have an ORIG_DTE several years earlier. Those early vintages contain very few loans, which is why their default rates are mostly zero or jump around (0.0345, for example, is consistent with 1 default out of 29 loans).
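A quick way to verify this (a sketch) is to count loans per origination year; the pre-2016 vintages should be tiny:

# Sketch: number of loans per origination year
alldata |>
  count(orig_yr) |>
  arrange(orig_yr)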
delin.table |>
pivot_longer(cols = starts_with("D"),
names_to = "Delinquency Days",
values_to = "Rate") |>
filter(orig_yq >= 2016.1) |>
ggplot(aes(x = orig_yq,
y = Rate,
colour = `Delinquency Days` )) +
geom_line() +
labs(
title = "Fannie Mae default rates by origination year",
subtitle = "based on 30-year fixed-rate mortgage",
caption = "Data source: Fannie Mae's Single Home Loan",
y = "Default rate",
x = "Loan Origination Quarter")
datamap <- alldata |>
filter(orig_yr == 2016) |>
group_by(state, orig_yr) |>
summarize(D90 = sum(!is.na(F90_DTE))/length(F90_DTE))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
plot_usmap(
data = datamap,
labels = TRUE,
values = "D90",
color = "red"
) +
scale_fill_continuous(name = "Delinquency by 90 days", low = "white", high = "red") +
theme(legend.position = "right")
# Delinquency by FICO scores
ggplot(alldata |> filter(orig_yr >= 2016),
       aes(x = CSCORE_B,
           group = orig_yr)) +
  geom_histogram(fill = "red") +
  ggtitle("FICO scores distribution") +
  labs(subtitle = "Year: 2016-2022",
       y = "Frequency",
       x = "FICO Score") +
  facet_wrap(~orig_yr)
Credit Score | Rating
------------ | -----------
300-579 | Poor
580-669 | Fair
670-739 | Good
740-799 | Very Good
800-850 | Exceptional
alldata |>
  mutate(FICO_status = case_when(
    is.na(CSCORE_B) ~ NA_character_, # keep missing scores missing rather than "Exceptional"
    CSCORE_B < 580 ~ "Poor",
    CSCORE_B < 670 ~ "Fair",
    CSCORE_B < 740 ~ "Good",
    CSCORE_B < 800 ~ "Very Good",
    TRUE ~ "Exceptional"
  )) -> newdata
#head(newdata)
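With the rating bands attached, a natural sanity check (a sketch, not one of the original exercises) is whether default rates fall as the band improves:

# Hypothetical follow-up: 180-day default rate within each FICO band
newdata |>
  group_by(FICO_status) |>
  summarise(n = n(),
            D180 = sum(!is.na(F180_DTE)) / n())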
The data that you have used so far was pre-processed from the raw data, which you can get from the Fannie Mae Single-Family Loan Performance Data site. As an exercise, try the steps below to pre-process the data yourself, using just the data from 2016Q1 to 2022Q3.
### Access the Data
1. Register for an account if you have not (it is free). Click *Hp (historical loan credit performance data)*, then *Download data*, and choose the quarters 2021Q1 to 2021Q3. The downloaded data should be placed under the `raw data` folder.
2. Open `01_import_data.R` (under the `prep` folder) and run the script. This saves each quarter with `_stat.csv` at the end of the file name (located in the `raw data` folder). The code has been pre-written for you in the `00_read_data.R` script.
3. Read through `00_read_data.R` to see how the data is being converted from transactions data to single-account-level data.
4. Run `02_combine.R` (under the `prep` folder; a simplified sketch of what it does follows this list). It may take a while to load. You should get about 20 million records. Yes, 20 million rows!
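To make the pipeline concrete, here is a minimal sketch of the combining step, assuming the imported quarterly files end in `_stat.csv` inside the `raw data` folder (assumed layout for illustration; the real logic lives in the prep scripts):

# Sketch of the combine step (assumed file layout; see 02_combine.R for the real version)
library(data.table)
library(here)
files <- list.files(here("raw data"), pattern = "_stat\\.csv$", full.names = TRUE)
combined <- rbindlist(lapply(files, fread), fill = TRUE) # stack all quarters
saveRDS(combined, here("final data", "2016_22_data.rds"))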
# Some solutions are provided here
data <- readRDS(here::here("final data", "2016_22_data.rds"))
# Some records are duplicated, so keep only distinct rows
alldata <- dplyr::distinct(data)
# Convert orig_dte to date format
alldata <- alldata |>
mutate(orig_yq = quarter(ymd(ORIG_DTE), type = "year.quarter"),
orig_yr = year(ymd(ORIG_DTE)))
# Question: Default rate by vintage
alldata |>
group_by(orig_yq) |>
summarise(D180 = sum(!is.na(F180_DTE))/length(F180_DTE),
D120 = sum(!is.na(F120_DTE))/length(F120_DTE),
D90 = sum(!is.na(F90_DTE))/length(F90_DTE),
D60 = sum(!is.na(F60_DTE))/length(F60_DTE)) -> delin.table
delin.table |>
pivot_longer(cols = starts_with("D"),
names_to = "Delinquency Days",
values_to = "Rate") |>
filter(orig_yq >= 2016.1) |>
ggplot(aes(x = orig_yq,
y = Rate,
colour = `Delinquency Days` )) +
geom_line() +
labs(
title = "Fannie Mae default rates by origination year",
subtitle = "based on 30-year fixed-rate mortgage",
caption = "Data source: Fannie Mae's Single Home Loan",
y = "Default rate",
x = "Loan Origination Quarter")
ggsave("delinquent.png")
## Saving 7 x 5 in image
# Delinquency by FICO scores, animated by origination year
ggplot(alldata |> filter(orig_yr >= 2016),
       aes(x = CSCORE_B,
           group = orig_yr)) +
  geom_histogram(fill = "red") +
  ggtitle("FICO scores distribution from 2016 to 2022") +
  labs(subtitle = "Year: {closest_state}",
       y = "Frequency",
       x = "FICO Score") +
  transition_states(orig_yr,
                    transition_length = 6,
                    state_length = 1) -> plot1
his_anim <- animate(plot1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 11254 rows containing non-finite values (`stat_bin()`).
anim_save("Fico histogram.gif", his_anim)
# US Map
datamap <- alldata |>
filter(orig_yq >= 2016.1) |>
group_by(state, orig_yr) |>
summarize(D90 = sum(!is.na(F90_DTE))/length(F90_DTE))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
plot_usmap(
data = datamap,
labels = TRUE,
values = "D90",
color = "red"
) +
scale_fill_continuous(name = "Delinquency by 90 days", low = "white", high = "red") +
theme(legend.position = "right") -> mapPlot
transitionMap <- mapPlot +
labs(title = "Delinquency 90 Days {as.integer(frame_time)}") +
transition_time(orig_yr)
anim <- animate(transitionMap, fps = 10)
anim_save("map.gif")
# Question: The default rate of FM loans over time
alldata |>
  filter(!is.na(F180_DTE)) |>
  group_by(time1 = quarter(ymd(F180_DTE), type = "year.quarter")) |>
  summarise(def = n()) -> tot_def
def_table <- tot_def
# For each default quarter, count the loans originated on or before it
def_table$no_cust <- sapply(def_table$time1,
                            function(time2) sum(alldata$orig_yq <= time2))
def_table |>
  # defaults per loan at risk, times 100 for %, times 4 to annualize the quarterly rate
  mutate(rate = def/no_cust*100*4) -> def_table
def_table |>
ggplot(aes(x = time1,
y = rate)) +
geom_line() +
scale_y_continuous(
labels = scales::number_format(accuracy = 0.01,
decimal.mark = '.')) +
labs(
title = "Fannie Mae default rates over time",
subtitle = "based on 30-year fixed-rate mortgage",
caption = "Data source: Fannie Mae's Single Home Loan",
y = "Annualized default rate (%)",
x = "Time")
ggsave("def_rate_overtime.png")
## Saving 7 x 5 in image
# Question: The default rate by FICO
alldata |>
filter(orig_yr >= 2016) |>
group_by(CSCORE_B) |>
summarise(rate = sum(!is.na(F180_DTE))/length(LOAN_ID)) |>
ggplot(aes(x= CSCORE_B, y = rate)) +
geom_col() +
labs(x = 'FICO Score',
y = 'Default rate')
## Warning: Removed 1 rows containing missing values (`position_stack()`).
# Question: The default rate by LTV
alldata |>
filter(orig_yr >= 2016) |>
group_by(oltv) |>
summarise(rate = sum(!is.na(F180_DTE))/length(LOAN_ID)) |>
ggplot(aes(x= oltv, y = rate)) +
geom_col() +
labs(x = 'Original Loan To Value Ratio',
y = 'Default rate')
# Question: The default rate by DTI
alldata |>
filter(orig_yr >= 2016) |>
group_by(dti) |>
summarise(rate = sum(!is.na(F180_DTE))/length(LOAN_ID)) |>
ggplot(aes(x= dti, y = rate)) +
geom_col() +
labs(x = 'Debt to Income Ratio',
y = 'Default rate')
## Warning: Removed 1 rows containing missing values (`position_stack()`).
# Question: Average credit score by vintage
alldata |>
filter(orig_yr >= 2016) |>
group_by(orig_yq) |>
summarise(avg_score = mean(CSCORE_B, na.rm=T),
sd_score_up = avg_score + 1 * sd(CSCORE_B, na.rm=T),
sd_score_lo = avg_score - 1 * sd(CSCORE_B, na.rm=T),
sd_score_up2 = avg_score + 2 * sd(CSCORE_B, na.rm=T),
sd_score_lo2 = avg_score - 2 * sd(CSCORE_B, na.rm=T)) |>
ggplot(aes(x=orig_yq, y = avg_score)) +
geom_line() +
geom_ribbon(aes(fill = "1 std dev", ymin = sd_score_lo, ymax = sd_score_up), alpha = 0.15) +
geom_ribbon(aes(fill = "2 std dev", ymin = sd_score_lo2, ymax = sd_score_up2), alpha = 0.15) +
scale_fill_manual("", values = c("red", "#fc03d3")) +
labs(title = 'Average Credit Score by Vintage')
# Which state has the highest default rate?
# Note: the count below only finds the state with the most loans;
# a sketch computing the rate itself follows this chunk.
alldata |>
  count(state) |>
  filter(n == max(n))
## state n
## 1 CA 2936557
## California has by far the most loans.
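To answer the question as stated, compare delinquency rates rather than raw counts. A minimal sketch (not run against this data, so no output is shown):

# Sketch: 90-day delinquency rate by state, highest first
alldata |>
  group_by(state) |>
  summarise(D90 = sum(!is.na(F90_DTE)) / n()) |>
  arrange(desc(D90)) |>
  slice_head(n = 5)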
# Which customers are more risky: those with higher LMI or lower LMI (mi_pct)?
alldata |>
  mutate(default = as.factor(ifelse(!is.na(F90_DTE), 1, 0))) |>
  ggplot(aes(x = mi_pct, fill = default)) +
  geom_bar(position = "fill")
## Warning: Removed 14573270 rows containing non-finite values (`stat_count()`).
## No clear evidence from the visualization.
# Do customers with a higher interest rate at origination have a higher chance of default?
alldata |>
  mutate(default = as.factor(ifelse(!is.na(F90_DTE), 1, 0))) |>
  ggplot(aes(x = orig_rt, fill = default)) +
  geom_bar(position = "fill")
## Yes, it looks like loans charged a higher interest rate are more likely to default.
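To move beyond the visual impression, one option (a sketch; the univariate logistic regression is my assumption, not part of the tutorial) is to regress a delinquency indicator on the origination rate:

# Sketch: logistic regression of 90-day delinquency on origination rate;
# a positive, significant orig_rt coefficient would support the conclusion above
fit <- glm(default ~ orig_rt,
           family = binomial(),
           data = alldata |> mutate(default = as.numeric(!is.na(F90_DTE))))
summary(fit)$coefficients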