import pandas as pd
from statsmodels.formula.api import ols as sm_ols
import numpy as np
import seaborn as sns
from statsmodels.iolib.summary2 import summary_col # nicer tables
# Pull the Fannie Mae sample straight from the course repo (a gzip-compressed CSV)
fannie_mae_url = ('https://github.com/LeDataSciFi/ledatascifi-2022/blob/main/'
                  'data/Fannie_Mae_Plus_Data.gzip?raw=true')
fannie_mae = pd.read_csv(fannie_mae_url, compression='gzip')
# peek at the raw origination dates (loaded as strings; parsed to datetime below)
fannie_mae["Origination_Date"]
0 2007-02-01
1 2007-02-01
2 2007-02-01
3 2007-02-01
4 2007-02-01
...
135033 2018-06-01
135034 2018-06-01
135035 2018-06-01
135036 2018-06-01
135037 2018-06-01
Name: Origination_Date, Length: 135038, dtype: object
# Build the regression variables in one chained pipeline.
# CONSISTENCY FIX: use lambdas for every .assign step (the original mixed
# eager fannie_mae[...] references with lambdas). Lambdas are evaluated on
# the chain's current frame, so each step is order-safe and the chain no
# longer depends on the module-level name it is rebinding.
fannie_mae = (fannie_mae
              # create variables
              .assign(l_credscore = lambda x: np.log(x['Borrower_Credit_Score_at_Origination']),
                      l_LTV = lambda x: np.log(x['Original_LTV_(OLTV)']),
                      l_int = lambda x: np.log(x['Original_Interest_Rate']),
                      Origination_Date = lambda x: pd.to_datetime(x['Origination_Date']),
                      Origination_Year = lambda x: x['Origination_Date'].dt.year,
                      const = 1  # explicit intercept column
                     )
              .rename(columns={'Original_Interest_Rate':'int'}) # shorter name will help the table formatting
             )
# create a categorical credit bin var with "pd.cut()"
# Bin edges follow the standard FICO bands: (0,579] Very Poor, (579,669] Fair,
# (669,739] Good, (739,799] Very Good, (799,850] Exceptional.
# NOTE(review): this bins the CO-borrower's score, while every other credit
# variable above uses Borrower_Credit_Score_at_Origination — loans without a
# co-borrower get NaN here (visible in the printed frame below, and in the
# smaller N for models 7/8). Confirm the co-borrower column is intentional.
fannie_mae['creditbins']= pd.cut(fannie_mae['Co-borrower_credit_score_at_origination'],
[0,579,669,739,799,850],
labels=['Very Poor','Fair','Good','Very Good','Exceptional'])
# display the frame to inspect the new columns
fannie_mae
Loan_Identifier | Origination_Channel | Seller_Name | int | Original_UPB | Original_Loan_Term | Original_LTV_(OLTV) | Original_Combined_LTV_(CLTV) | Number_of_Borrowers | Original_Debt_to_Income_Ratio | ... | BOPGSTB | GOLDAMGBD228NLBM | CSUSHPISA | MSPUS | l_credscore | l_LTV | l_int | Origination_Year | const | creditbins | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9.733730e+11 | B | OTHER | 6.875 | 32000.0 | 360.0 | 90.0 | 90.0 | 1.0 | 22.0 | ... | -58478.0 | 665.1025 | 184.601 | 257400.0 | 6.505784 | 4.499810 | 1.927892 | 2007 | 1 | NaN |
1 | 9.276200e+11 | B | PNC BANK, N.A. | 5.875 | 200000.0 | 360.0 | 80.0 | 80.0 | 2.0 | 26.0 | ... | -58478.0 | 665.1025 | 184.601 | 257400.0 | 6.541030 | 4.382027 | 1.770706 | 2007 | 1 | Good |
2 | 7.176670e+11 | B | OTHER | 6.250 | 122000.0 | 180.0 | 80.0 | 80.0 | 2.0 | 31.0 | ... | -58478.0 | 665.1025 | 184.601 | 257400.0 | 6.608001 | 4.382027 | 1.832581 | 2007 | 1 | Very Good |
3 | 9.889510e+11 | C | AMTRUST BANK | 6.000 | 67000.0 | 180.0 | 77.0 | 77.0 | 2.0 | 17.0 | ... | -58478.0 | 665.1025 | 184.601 | 257400.0 | 6.689599 | 4.343805 | 1.791759 | 2007 | 1 | Exceptional |
4 | 1.908850e+11 | R | OTHER | 5.875 | 50000.0 | 180.0 | 41.0 | 41.0 | 2.0 | 10.0 | ... | -58478.0 | 665.1025 | 184.601 | 257400.0 | 6.489205 | 3.713572 | 1.770706 | 2007 | 1 | Good |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
135033 | 9.204900e+11 | R | OTHER | 4.625 | 200000.0 | 240.0 | 50.0 | 50.0 | 1.0 | 45.0 | ... | -47431.0 | 1299.1500 | 202.411 | 315600.0 | 6.575076 | 3.912023 | 1.531476 | 2018 | 1 | NaN |
135034 | 9.666890e+11 | R | OTHER | 4.625 | 94000.0 | 360.0 | 47.0 | 47.0 | 1.0 | 39.0 | ... | -47431.0 | 1299.1500 | 202.411 | 315600.0 | 6.517671 | 3.850148 | 1.531476 | 2018 | 1 | NaN |
135035 | 6.616280e+11 | R | OTHER | 4.625 | 239000.0 | 360.0 | 74.0 | 74.0 | 2.0 | 20.0 | ... | -47431.0 | 1299.1500 | 202.411 | 315600.0 | 6.669498 | 4.304065 | 1.531476 | 2018 | 1 | Good |
135036 | 5.102850e+11 | R | OTHER | 5.000 | 93000.0 | 360.0 | 44.0 | 44.0 | 1.0 | 19.0 | ... | -47431.0 | 1299.1500 | 202.411 | 315600.0 | 6.431331 | 3.784190 | 1.609438 | 2018 | 1 | NaN |
135037 | 4.330130e+11 | R | OTHER | 5.125 | 140000.0 | 360.0 | 94.0 | 94.0 | 1.0 | 36.0 | ... | -47431.0 | 1299.1500 | 202.411 | 315600.0 | 6.492240 | 4.543295 | 1.634131 | 2018 | 1 | NaN |
135038 rows × 42 columns
As before, the pseudocode:
model = sm_ols(<formula>, data=<dataframe>)
result=model.fit()
# you use result to print summary, get predicted values (.predict) or residuals (.resid)
Now, let’s save each regression’s result with a different name, and below this, output them all in one nice table:
# Estimate every specification; each sm_ols(...).fit() returns a results
# object kept under its own name for the combined table below.

# simple regressions ('y ~ x' fits y = a + b*x), with y and/or x logged
reg1  = sm_ols('int ~ Borrower_Credit_Score_at_Origination ', data=fannie_mae).fit()
reg1b = sm_ols('int ~ l_credscore ', data=fannie_mae).fit()
reg1c = sm_ols('l_int ~ Borrower_Credit_Score_at_Origination ', data=fannie_mae).fit()
reg1d = sm_ols('l_int ~ l_credscore ', data=fannie_mae).fit()

# multiple regressors: 'y ~ x1 + x2' fits y = a + b*x1 + c*x2
reg2 = sm_ols('int ~ l_credscore + l_LTV ', data=fannie_mae).fit()

# interaction term via '*' — keep each variable on its own in the formula
# as well (x1 + x2 + x1*x2, not just x1*x2)
reg3 = sm_ols('int ~ l_credscore + l_LTV + l_credscore*l_LTV', data=fannie_mae).fit()

# categorical variable -> dummies via C(); the '-1' in reg5 drops the
# intercept so every category gets its own dummy
reg4 = sm_ols('int ~ C(creditbins) ', data=fannie_mae).fit()
reg5 = sm_ols('int ~ C(creditbins) -1', data=fannie_mae).fit()
Ok, time to output them:
# now I'll format an output table
# I'd like to include extra info in the table (not just coefficients)
info_dict={'R-squared' : lambda x: f"{x.rsquared:.2f}",
           'Adj R-squared' : lambda x: f"{x.rsquared_adj:.2f}",
           'No. observations' : lambda x: f"{int(x.nobs):d}"}
# reg4 and reg5 name the creditbin dummies differently in the table
# (with vs. without the intercept), so relabel reg4's to match reg5's
# and keep each dummy on a single row
reg4.model.exog_names[1:] = reg5.model.exog_names[1:]
# This summary col function combines a bunch of regressions into one nice table
print('='*108)
print(' y = interest rate if not specified, log(interest rate else)')
print(summary_col(results=[reg1,reg1b,reg1c,reg1d,reg2,reg3,reg4,reg5], # list the result obj here
                  float_format='%0.2f',
                  stars = True, # stars are easy way to see if anything is statistically significant
                  model_names=['1','2',' 3 (log)','4 (log)','5','6','7','8'], # these are bad names, lol. Usually, just use the y variable name
                  info_dict=info_dict,
                  # BUG FIX: was 'C(creditbins)[Vrey Good]' — the typo left
                  # the Very Good dummy unmatched, so it sorted after
                  # Exceptional in the printed table instead of between
                  # Good and Exceptional
                  regressor_order=[ 'Intercept','Borrower_Credit_Score_at_Origination','l_credscore','l_LTV','l_credscore:l_LTV',
                                    'C(creditbins)[Very Poor]','C(creditbins)[Fair]','C(creditbins)[Good]','C(creditbins)[Very Good]','C(creditbins)[Exceptional]']
                  )
      )
============================================================================================================
y = interest rate if not specified, log(interest rate else)
============================================================================================================
1 2 3 (log) 4 (log) 5 6 7 8
------------------------------------------------------------------------------------------------------------
Intercept 11.58*** 45.37*** 2.87*** 9.50*** 44.13*** -16.81*** 6.65***
(0.05) (0.29) (0.01) (0.06) (0.30) (4.11) (0.08)
Borrower_Credit_Score_at_Origination -0.01*** -0.00***
(0.00) (0.00)
l_credscore -6.07*** -1.19*** -5.99*** 3.22***
(0.04) (0.01) (0.04) (0.62)
l_LTV 0.15*** 14.61***
(0.01) (0.97)
l_credscore:l_LTV -2.18***
(0.15)
C(creditbins)[Very Poor] 6.65***
(0.08)
C(creditbins)[Fair] -0.63*** 6.02***
(0.08) (0.02)
C(creditbins)[Good] -1.17*** 5.48***
(0.08) (0.01)
C(creditbins)[Exceptional] -2.25*** 4.40***
(0.08) (0.01)
C(creditbins)[Very Good] -1.65*** 5.00***
(0.08) (0.01)
R-squared 0.13 0.12 0.13 0.12 0.13 0.13 0.11 0.11
R-squared Adj. 0.13 0.12 0.13 0.12 0.13 0.13 0.11 0.11
R-squared 0.13 0.12 0.13 0.12 0.13 0.13 0.11 0.11
Adj R-squared 0.13 0.12 0.13 0.12 0.13 0.13 0.11 0.11
No. observations 134481 134481 134481 134481 134481 134481 67366 67366
============================================================================================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
You might need to print out a few individual regressions with more decimals.
Interpret coefs in model 6 (and visually?)