HW5

pdf

School

University of California, Los Angeles *

*We aren’t endorsed by this school

Course

101C

Subject

Computer Science

Date

Jan 9, 2024

Type

pdf

Pages

18

Uploaded by ShashvatPatel1234

Report
Homework 4 Neftali Lemus (805105856) 11/3/2021 setwd ( "C:/Users/domin/Desktop/Stats 101C" ) college <- read.csv ( "College Fall 2021.csv" ) dim (college) ## [1] 2000 20 names (college) ## [1] "X" "School.Name" "Private" "Apps" "Accept" ## [6] "Enroll" "Top10perc" "Top25perc" "F.Undergrad" "P.Undergrad" ## [11] "Outstate" "Room.Board" "Books" "Personal" "PhD" ## [16] "Terminal" "S.F.Ratio" "perc.alumni" "Grad.Rate" "Expend" table ( is.na (college)) # Data has no missing values, we can proceed with no issues ## ## FALSE ## 40000 college <-college[, - c ( 1 , 2 , 3 )] #Remove troublesome columns dim (college) ## [1] 2000 17 Question 1 #Split Data into 70% training, 30% testing set.seed ( 1128 ) c.training.i <- sample ( 1 : 2000 , 1400 , replace = F) college.train <- college[c.training.i,] college.test <- college[ - c.training.i,] dim (college.train) ## [1] 1400 17 1
dim (college.test) ## [1] 600 17 a) m1 <- lm (Expend ~ ., data = college.train) pred.train <- predict (m1, newdata= college.train) #Calculating MSE Training MSE.train <- mean ((college.train $ Expend - pred.train) ^ 2 ) MSE.train ## [1] 8423397 #Calculating MSE Test pred.test <- predict (m1, newdata= college.test) MSE.test <- mean ((college.test $ Expend - pred.test) ^ 2 ) MSE.test ## [1] 9658396 b) #Ridge Regression str (college) ## ’data.frame’: 2000 obs. of 17 variables: ## $ Apps : int 1758 14463 838 1127 735 504 280 1373 1455 2379 ... ## $ Accept : int 1485 6166 651 884 423 482 143 1373 1064 2133 ... ## $ Enroll : int 419 1757 159 308 366 185 79 724 452 1292 ... ## $ Top10perc : int 27 60 11 30 20 10 5 6 1 8 ... ## $ Top25perc : int 58 94 25 64 48 36 27 21 16 25 ... ## $ F.Undergrad: int 2041 8544 654 1310 2448 550 327 2754 2632 4283 ... ## $ P.Undergrad: int 174 671 162 766 707 84 110 474 617 2973 ... ## $ Outstate : int 12040 6550 8640 11718 9210 9130 5590 2700 6806 4973 ... ## $ Room.Board : int 4100 4598 3700 7398 3782 3322 2900 2660 2550 3192 ... ## $ Books : int 600 700 400 450 700 450 650 540 350 500 ... ## $ Personal : int 1100 1000 1915 1800 1000 1450 1952 1660 766 1425 ... ## $ PhD : int 92 83 62 73 49 46 53 60 75 56 ... ## $ Terminal : int 96 100 62 87 51 51 63 68 75 65 ... ## $ S.F.Ratio : num 13.2 18 12.2 16.4 39.8 12.6 15.1 20.3 15.1 22 ... ## $ perc.alumni: int 17 15 13 33 15 25 4 29 10 21 ... ## $ Grad.Rate : int 72 80 48 76 34 54 90 52 24 38 ... ## $ Expend : int 9060 8055 7634 8871 6562 8686 4839 4550 6972 4078 ... i= seq ( 10 , - 2 , length= 100 ) lambda.v= 10 ^ i length (lambda.v) 2
## [1] 100 #No missing Values, No factors, Predictors in Matrix, Response in Vector (Data Prep) X <- model.matrix (Expend ~ .,college) Y <- college $ Expend #Fit Ridge Regression Model library (glmnet) ## Loading required package: Matrix ## Loaded glmnet 4.1-2 c.model.ridge <- glmnet (X,Y, alpha = 0 , lambda = lambda.v) # Alpha = 0 gives ridge plot (c.model.ridge) # 16 coefficients survived 0 100 200 300 400 500 -300 -200 -100 0 100 L1 Norm Coefficients 16 16 16 16 16 16 #Find the optimal lambda value via cross validation set.seed ( 1128 ) cv.out.ridge <- cv.glmnet (X,Y, alpha= 0 ) bestlam.cv.ridge <- cv.out.ridge $ lambda.min bestlam.cv.ridge ## [1] 355.5073 3
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help
#Compute the test error with bestlam.cv for Test and Train ridge.pred.test <- predict (c.model.ridge, s= bestlam.cv.ridge, newx= X[ - c.training.i,]) ridge.pred.train <- predict (c.model.ridge, s= bestlam.cv.ridge, newx= X[c.training.i,]) ridge.mse.test <- round ( mean ((ridge.pred.test - Y[ - c.training.i]) ^ 2 )) ridge.mse.train<- round ( mean ((ridge.pred.train - Y[c.training.i]) ^ 2 )) ridge.mse.test ## [1] 9899790 ridge.mse.train ## [1] 8729050 c) # Lasso Regression set.seed ( 1128 ) i= seq ( 10 , - 2 , length= 100 ) lambda.v= 10 ^ i length (lambda.v) ## [1] 100 #No missing Values, No factors, Predictors in Matrix, Response in Vector (Data Prep) X <- model.matrix (Expend ~ .,college) Y <- college $ Expend #Fit Lasso Regression Model library (glmnet) c.model.lasso <- glmnet (X,Y, alpha = 1 , lambda = lambda.v) # Alpha = 1 gives lasso plot (c.model.lasso) 4
0 100 200 300 400 500 -300 -200 -100 0 100 L1 Norm Coefficients 0 3 3 5 8 12 #Finding Optimal Lambda set.seed ( 1128 ) cv.out.lasso <- cv.glmnet (X,Y, alpha= 1 ) bestlam.cv.lasso <- cv.out.lasso $ lambda.min bestlam.cv.lasso ## [1] 6.35878 #Compute MSE of training lasso.pred <- predict (c.model.lasso, s= bestlam.cv.lasso, newx= X[c.training.i,]) lasso.mse.train <- round ( mean ((lasso.pred - Y[c.training.i]) ^ 2 )) lasso.mse.train ## [1] 8468312 #Compute MSE of test lasso.pred <- predict (c.model.lasso, s= bestlam.cv.lasso, newx= X[ - c.training.i,]) lasso.mse.test <- round ( mean ((lasso.pred - Y[ - c.training.i]) ^ 2 )) lasso.mse.test ## [1] 9448019 5
#Non-Zero lasso coefficients lasscoef <- predict (c.model.lasso, type= "coefficients" , s= bestlam.cv.lasso) lasscoef[lasscoef != 0 ] #Only one was killed ## <sparse>[ <logic> ] : .M.sub.i.logical() maybe inefficient ## [1] 6.097158e+03 7.023811e-01 -8.942004e-01 4.689058e-01 1.233637e+02 ## [6] -6.181335e+01 -2.247071e-02 5.172989e-03 5.228978e-01 9.943960e-01 ## [11] 2.108246e-01 3.287954e+00 2.960737e+01 -3.130042e+02 1.020996e+01 ## [16] -2.563767e+01 Question 2 a) #Using same college data set as above #PCR Model library (pls) ## ## Attaching package: ’pls’ ## The following object is masked from ’package:stats’: ## ## loadings library (ggplot2) library (ISLR) library (readr) str (college) ## ’data.frame’: 2000 obs. of 17 variables: ## $ Apps : int 1758 14463 838 1127 735 504 280 1373 1455 2379 ... ## $ Accept : int 1485 6166 651 884 423 482 143 1373 1064 2133 ... ## $ Enroll : int 419 1757 159 308 366 185 79 724 452 1292 ... ## $ Top10perc : int 27 60 11 30 20 10 5 6 1 8 ... ## $ Top25perc : int 58 94 25 64 48 36 27 21 16 25 ... ## $ F.Undergrad: int 2041 8544 654 1310 2448 550 327 2754 2632 4283 ... ## $ P.Undergrad: int 174 671 162 766 707 84 110 474 617 2973 ... ## $ Outstate : int 12040 6550 8640 11718 9210 9130 5590 2700 6806 4973 ... ## $ Room.Board : int 4100 4598 3700 7398 3782 3322 2900 2660 2550 3192 ... ## $ Books : int 600 700 400 450 700 450 650 540 350 500 ... ## $ Personal : int 1100 1000 1915 1800 1000 1450 1952 1660 766 1425 ... ## $ PhD : int 92 83 62 73 49 46 53 60 75 56 ... ## $ Terminal : int 96 100 62 87 51 51 63 68 75 65 ... ## $ S.F.Ratio : num 13.2 18 12.2 16.4 39.8 12.6 15.1 20.3 15.1 22 ... ## $ perc.alumni: int 17 15 13 33 15 25 4 29 10 21 ... ## $ Grad.Rate : int 72 80 48 76 34 54 90 52 24 38 ... ## $ Expend : int 9060 8055 7634 8871 6562 8686 4839 4550 6972 4078 ... 6
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help
#Recall that PCR and PLS only take numerical information set.seed ( 1128 ) #PCR fit on training set pcr.fit <- pcr (Expend ~ ., data= college.train, scale= TRUE , validation= "CV" ) #Does: PC, Standardizing, Validation #Keeping the 85% threshold in the X matrix summary (pcr.fit) # 7 components explain 86% variance of X matrix and explain 59% of Expend ## Data: X dimension: 1400 16 ## Y dimension: 1400 1 ## Fit method: svdpc ## Number of components considered: 16 ## ## VALIDATION: RMSEP ## Cross-validated using 10 random segments. ## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps ## CV 5155 4289 3543 3458 3460 3347 3307 ## adjCV 5155 4288 3542 3457 3461 3346 3305 ## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps ## CV 3307 3219 3214 3209 3145 3128 3173 ## adjCV 3306 3217 3213 3207 3144 3126 3174 ## 14 comps 15 comps 16 comps ## CV 2993 2980 2962 ## adjCV 2990 2977 2958 ## ## TRAINING: % variance explained ## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps ## X 32.10 59.47 66.61 72.39 77.80 82.81 86.64 90.05 ## Expend 31.35 53.14 55.51 55.54 58.28 59.50 59.55 61.70 ## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps ## X 93.06 95.65 97.16 98.16 99.03 99.66 99.88 ## Expend 61.79 62.02 63.57 63.99 64.07 67.43 67.91 ## 16 comps ## X 100.00 ## Expend 68.26 validationplot (pcr.fit, val.type= "MSEP" ) 7
0 5 10 15 1.0e+07 2.0e+07 Expend number of components MSEP #Based on the plot, MSE is lowest when the number of components = 16 (the full model). # However, M = 7 is the optimal number of Principal Components using CV, keeping 85% of the information in the data #MSE for Training pcr.pred.train <- predict (pcr.fit,college.train, ncomp= 7 ) mean ((pcr.pred.train - college.train $ Expend) ^ 2 ) ## [1] 10734336 #MSE for Testing pcr.pred.test <- predict (pcr.fit,college.test, ncomp= 7 ) mean ((pcr.pred.test - college.test $ Expend) ^ 2 ) ## [1] 12435645 b) #PLS Model library (pls) library (ggplot2) library (ISLR) library (readr) set.seed ( 1128 ) #Fitting PLS model on training set 8
pls.fit <- plsr (Expend ~ ., data = college.train, scale= TRUE , validation = "CV" ) #Keeping 85% threshold of X matrix summary (pls.fit) # 9 Components explain 85% var of X and account for 68% of Expend ## Data: X dimension: 1400 16 ## Y dimension: 1400 1 ## Fit method: kernelpls ## Number of components considered: 16 ## ## VALIDATION: RMSEP ## Cross-validated using 10 random segments. ## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps ## CV 5155 3438 3261 3127 3053 3002 2983 ## adjCV 5155 3437 3260 3126 3050 2999 2980 ## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps ## CV 2973 2973 2970 2964 2964 2963 2963 ## adjCV 2970 2970 2966 2961 2961 2960 2960 ## 14 comps 15 comps 16 comps ## CV 2962 2963 2962 ## adjCV 2959 2959 2958 ## ## TRAINING: % variance explained ## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps ## X 30.38 50.25 64.89 69.15 72.77 75.99 78.11 82.18 ## Expend 55.97 60.55 63.97 65.97 67.22 67.73 67.97 68.03 ## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps ## X 84.23 86.64 90.96 92.93 95.78 97.52 99.13 ## Expend 68.13 68.21 68.23 68.24 68.25 68.25 68.26 ## 16 comps ## X 100.00 ## Expend 68.26 validationplot (pls.fit, val.type= "MSEP" ) 9
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help
0 5 10 15 1.0e+07 2.0e+07 Expend number of components MSEP # This model shows a sharper drop in MSE # If we wish to stick to the 85% mark, 9 Comp is ok, but 6 or 7 can get the job done. #MSE for Training pls.pred.train <- predict (pls.fit,college.train, ncomp= 9 ) mean ((pls.pred.train - college.train $ Expend) ^ 2 ) ## [1] 8455719 #MSE for Testing pls.pred.test <- predict (pls.fit,college.test, ncomp= 9 ) mean ((pls.pred.test - college.test $ Expend) ^ 2 ) ## [1] 9633312 Question 3 a) #BIC Model m3full <- lm (Expend ~ ., data= college.train) summary (m3full) 10
## ## Call: ## lm(formula = Expend ~ ., data = college.train) ## ## Residuals: ## Min 1Q Median 3Q Max ## -8941 -1422 -263 819 32325 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 6160.27522 814.72305 7.561 7.24e-14 *** ## Apps 0.72830 0.07592 9.593 < 2e-16 *** ## Accept -0.95921 0.15307 -6.267 4.92e-10 *** ## Enroll 0.92317 0.43755 2.110 0.03505 * ## Top10perc 125.79354 11.61572 10.830 < 2e-16 *** ## Top25perc -59.04195 9.37418 -6.298 4.03e-10 *** ## F.Undergrad -0.12285 0.07404 -1.659 0.09730 . ## P.Undergrad 0.04972 0.05948 0.836 0.40333 ## Outstate 0.47114 0.03638 12.951 < 2e-16 *** ## Room.Board 0.05946 0.10301 0.577 0.56388 ## Books 0.87815 0.54049 1.625 0.10445 ## Personal 0.25173 0.13414 1.877 0.06078 . ## PhD 11.89817 9.39622 1.266 0.20563 ## Terminal 25.54205 10.29774 2.480 0.01324 * ## S.F.Ratio -334.15330 25.53823 -13.084 < 2e-16 *** ## perc.alumni 3.71509 8.50341 0.437 0.66226 ## Grad.Rate -23.34631 6.39241 -3.652 0.00027 *** ## --- ## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1 ## ## Residual standard error: 2920 on 1383 degrees of freedom ## Multiple R-squared: 0.6826, Adjusted R-squared: 0.6789 ## F-statistic: 185.9 on 16 and 1383 DF, p-value: < 2.2e-16 backwardBIC <- step (m3full, direction = "backward" , data = college.train, k = log ( nrow (college.train))) ## Start: AIC=22448.29 ## Expend ~ Apps + Accept + Enroll + Top10perc + Top25perc + F.Undergrad + ## P.Undergrad + Outstate + Room.Board + Books + Personal + ## PhD + Terminal + S.F.Ratio + perc.alumni + Grad.Rate ## ## Df Sum of Sq RSS AIC ## - perc.alumni 1 1627593 1.1794e+10 22441 ## - Room.Board 1 2841112 1.1796e+10 22441 ## - P.Undergrad 1 5958766 1.1799e+10 22442 ## - PhD 1 13672457 1.1806e+10 22443 ## - Books 1 22509277 1.1815e+10 22444 ## - F.Undergrad 1 23474376 1.1816e+10 22444 ## - Personal 1 30029069 1.1823e+10 22445 ## - Enroll 1 37957696 1.1831e+10 22446 ## - Terminal 1 52459134 1.1845e+10 22447 ## <none> 1.1793e+10 22448 ## - 
Grad.Rate 1 113736655 1.1906e+10 22455 ## - Accept 1 334851190 1.2128e+10 22480 11
## - Top25perc 1 338258037 1.2131e+10 22481 ## - Apps 1 784746187 1.2578e+10 22531 ## - Top10perc 1 1000040366 1.2793e+10 22555 ## - Outstate 1 1430171436 1.3223e+10 22601 ## - S.F.Ratio 1 1459832373 1.3253e+10 22604 ## ## Step: AIC=22441.23 ## Expend ~ Apps + Accept + Enroll + Top10perc + Top25perc + F.Undergrad + ## P.Undergrad + Outstate + Room.Board + Books + Personal + ## PhD + Terminal + S.F.Ratio + Grad.Rate ## ## Df Sum of Sq RSS AIC ## - Room.Board 1 2523617 1.1797e+10 22434 ## - P.Undergrad 1 6116451 1.1800e+10 22435 ## - PhD 1 13611529 1.1808e+10 22436 ## - Books 1 22366146 1.1817e+10 22437 ## - F.Undergrad 1 24634351 1.1819e+10 22437 ## - Personal 1 28993481 1.1823e+10 22437 ## - Enroll 1 40186508 1.1835e+10 22439 ## - Terminal 1 53644334 1.1848e+10 22440 ## <none> 1.1794e+10 22441 ## - Grad.Rate 1 112520334 1.1907e+10 22447 ## - Top25perc 1 337554910 1.2132e+10 22474 ## - Accept 1 339899172 1.2134e+10 22474 ## - Apps 1 783128499 1.2578e+10 22524 ## - Top10perc 1 1014680805 1.2809e+10 22550 ## - S.F.Ratio 1 1474494785 1.3269e+10 22599 ## - Outstate 1 1541042978 1.3335e+10 22606 ## ## Step: AIC=22434.29 ## Expend ~ Apps + Accept + Enroll + Top10perc + Top25perc + F.Undergrad + ## P.Undergrad + Outstate + Books + Personal + PhD + Terminal + ## S.F.Ratio + Grad.Rate ## ## Df Sum of Sq RSS AIC ## - P.Undergrad 1 6901836 1.1804e+10 22428 ## - PhD 1 13164767 1.1810e+10 22429 ## - Books 1 23024051 1.1820e+10 22430 ## - F.Undergrad 1 24360553 1.1821e+10 22430 ## - Personal 1 28334659 1.1825e+10 22430 ## - Enroll 1 38607077 1.1836e+10 22432 ## - Terminal 1 57885442 1.1855e+10 22434 ## <none> 1.1797e+10 22434 ## - Grad.Rate 1 110313614 1.1907e+10 22440 ## - Accept 1 338529839 1.2135e+10 22467 ## - Top25perc 1 341379288 1.2138e+10 22467 ## - Apps 1 800038854 1.2597e+10 22519 ## - Top10perc 1 1016519256 1.2813e+10 22543 ## - S.F.Ratio 1 1476960637 1.3274e+10 22592 ## - Outstate 1 1893007841 1.3690e+10 22635 ## ## Step: AIC=22427.86 ## Expend ~ Apps + Accept + 
Enroll + Top10perc + Top25perc + F.Undergrad + ## Outstate + Books + Personal + PhD + Terminal + S.F.Ratio + 12
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help
## Grad.Rate ## ## Df Sum of Sq RSS AIC ## - PhD 1 15037633 1.1819e+10 22422 ## - F.Undergrad 1 19983387 1.1824e+10 22423 ## - Books 1 24844677 1.1829e+10 22424 ## - Personal 1 32429170 1.1836e+10 22425 ## - Enroll 1 38462741 1.1842e+10 22425 ## - Terminal 1 57529394 1.1861e+10 22427 ## <none> 1.1804e+10 22428 ## - Grad.Rate 1 121372989 1.1925e+10 22435 ## - Top25perc 1 341480664 1.2145e+10 22461 ## - Accept 1 357413950 1.2161e+10 22462 ## - Apps 1 823788120 1.2628e+10 22515 ## - Top10perc 1 1012778629 1.2817e+10 22536 ## - S.F.Ratio 1 1485696987 1.3290e+10 22587 ## - Outstate 1 1931208015 1.3735e+10 22633 ## ## Step: AIC=22422.4 ## Expend ~ Apps + Accept + Enroll + Top10perc + Top25perc + F.Undergrad + ## Outstate + Books + Personal + Terminal + S.F.Ratio + Grad.Rate ## ## Df Sum of Sq RSS AIC ## - Books 1 19452131 1.1838e+10 22418 ## - F.Undergrad 1 19817037 1.1839e+10 22418 ## - Personal 1 32492881 1.1851e+10 22419 ## - Enroll 1 38016971 1.1857e+10 22420 ## <none> 1.1819e+10 22422 ## - Grad.Rate 1 118957244 1.1938e+10 22429 ## - Terminal 1 241369802 1.2060e+10 22444 ## - Top25perc 1 342446852 1.2161e+10 22455 ## - Accept 1 349420303 1.2168e+10 22456 ## - Apps 1 817922760 1.2637e+10 22509 ## - Top10perc 1 1077122885 1.2896e+10 22537 ## - S.F.Ratio 1 1471350640 1.3290e+10 22579 ## - Outstate 1 1943689663 1.3763e+10 22628 ## ## Step: AIC=22417.46 ## Expend ~ Apps + Accept + Enroll + Top10perc + Top25perc + F.Undergrad + ## Outstate + Personal + Terminal + S.F.Ratio + Grad.Rate ## ## Df Sum of Sq RSS AIC ## - F.Undergrad 1 19601780 1.1858e+10 22413 ## - Enroll 1 37675603 1.1876e+10 22415 ## - Personal 1 42999327 1.1881e+10 22415 ## <none> 1.1838e+10 22418 ## - Grad.Rate 1 123307527 1.1962e+10 22425 ## - Terminal 1 248427716 1.2087e+10 22439 ## - Top25perc 1 336166585 1.2174e+10 22449 ## - Accept 1 353799856 1.2192e+10 22451 ## - Apps 1 829700495 1.2668e+10 22505 ## - Top10perc 1 1089888226 1.2928e+10 22534 ## - S.F.Ratio 1 1460859872 1.3299e+10 22573 ## - Outstate 1 
1954974285 1.3793e+10 22624 13
## ## Step: AIC=22412.53 ## Expend ~ Apps + Accept + Enroll + Top10perc + Top25perc + Outstate + ## Personal + Terminal + S.F.Ratio + Grad.Rate ## ## Df Sum of Sq RSS AIC ## - Enroll 1 19013534 1.1877e+10 22408 ## - Personal 1 36489170 1.1894e+10 22410 ## <none> 1.1858e+10 22413 ## - Grad.Rate 1 114076513 1.1972e+10 22419 ## - Terminal 1 236097589 1.2094e+10 22433 ## - Accept 1 343184224 1.2201e+10 22445 ## - Top25perc 1 363640759 1.2222e+10 22448 ## - Apps 1 810806804 1.2669e+10 22498 ## - Top10perc 1 1150513489 1.3008e+10 22535 ## - S.F.Ratio 1 1486356412 1.3344e+10 22571 ## - Outstate 1 2015806516 1.3874e+10 22625 ## ## Step: AIC=22407.53 ## Expend ~ Apps + Accept + Top10perc + Top25perc + Outstate + Personal + ## Terminal + S.F.Ratio + Grad.Rate ## ## Df Sum of Sq RSS AIC ## - Personal 1 44140991 1.1921e+10 22406 ## <none> 1.1877e+10 22408 ## - Grad.Rate 1 119437764 1.1996e+10 22414 ## - Terminal 1 244851195 1.2122e+10 22429 ## - Top25perc 1 380438087 1.2257e+10 22444 ## - Accept 1 437983009 1.2315e+10 22451 ## - Apps 1 792219767 1.2669e+10 22491 ## - Top10perc 1 1308098232 1.3185e+10 22547 ## - S.F.Ratio 1 1471314303 1.3348e+10 22564 ## - Outstate 1 2096806680 1.3974e+10 22628 ## ## Step: AIC=22405.48 ## Expend ~ Apps + Accept + Top10perc + Top25perc + Outstate + Terminal + ## S.F.Ratio + Grad.Rate ## ## Df Sum of Sq RSS AIC ## <none> 1.1921e+10 22406 ## - Grad.Rate 1 147988977 1.2069e+10 22416 ## - Terminal 1 248745576 1.2170e+10 22427 ## - Top25perc 1 390929485 1.2312e+10 22443 ## - Accept 1 424613672 1.2346e+10 22447 ## - Apps 1 800453200 1.2722e+10 22489 ## - Top10perc 1 1347346841 1.3268e+10 22548 ## - S.F.Ratio 1 1557236483 1.3478e+10 22570 ## - Outstate 1 2062690562 1.3984e+10 22622 BICmodel <- lm (Expend ~ Apps + Accept + Top10perc + Top25perc + Outstate + Terminal + S.F.Ratio + Grad.Rate, college.train) #MSE for Training pred.bic.train <- predict (BICmodel,college.train) mse.bic.train <- mean ((college.train $ Expend - pred.bic.train) ^ 2 ) 14
#MSE for Testing pred.bic.test <- predict (BICmodel,college.test) mse.bic.test <- mean ((college.test $ Expend - pred.bic.test) ^ 2 ) mse.bic.test ## [1] 9701061 b) library (gam) ## Loading required package: splines ## Loading required package: foreach ## Loaded gam 1.20 #Fit GAM on Training using BIC predictors from (a) set.seed ( 1128 ) gam1 <- gam (Expend ~ Apps + Accept + Enroll + Top10perc + Top25perc + Outstate + Terminal + S.F.Ratio + Grad.Rate, family = gaussian (), data = college.train) #MSE of Training predict.gam.train <- predict (gam1, newdata = college.train) mse.gam.train <- mean ((college.train $ Expend - predict.gam.train) ^ 2 ) mse.gam.train ## [1] 8495993 #MSE of Testing predict.gam.test <- predict (gam1, newdata = college.test) mse.gam.test <- mean ((college.test $ Expend - predict.gam.test) ^ 2 ) mse.gam.test ## [1] 9636946 plot (gam1) #Plot par ( mfrow= c ( 3 , 3 )) plot.Gam (gam1, se= TRUE , col= "red" ) 15
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help
0 20000 40000 0 Apps partial for Apps 0 10000 20000 -30000 Accept partial for Accept 0 2000 4000 6000 0 Enroll partial for Enroll 0 20 40 60 80 0 Top10perc partial for Top10perc 20 40 60 80 100 -2000 Top25perc partial for Top25perc 5000 15000 -4000 Outstate partial for Outstate 40 60 80 100 -2000 Terminal partial for Terminal 10 20 30 40 -10000 S.F.Ratio partial for S.F.Ratio 20 40 60 80 100 -2000 Grad.Rate partial for Grad.Rate Based on the outcome of the GAM model, out of the 9 BIC predictors, Accept, Enroll, Terminal, and Grad.Rate are the most troublesome predictors when plotted against the response variable. c) gam1t <- gam (Expend ~ Apps + Accept + Enroll + Top10perc + Top25perc + Outstate + Terminal + S.F.Ratio + Grad.Rate, family = gaussian (), data = college.test) par ( mfrow= c ( 3 , 3 )) plot.Gam (gam1t, se= TRUE , col= "red" ) #Same as above 16
0 5000 15000 0 Apps partial for Apps 0 5000 10000 15000 -20000 Accept partial for Accept 0 2000 4000 6000 0 Enroll partial for Enroll 0 20 40 60 80 -4000 Top10perc partial for Top10perc 20 40 60 80 100 -4000 Top25perc partial for Top25perc 5000 15000 -4000 Outstate partial for Outstate 40 60 80 100 -3000 Terminal partial for Terminal 10 20 30 40 -8000 S.F.Ratio partial for S.F.Ratio 20 40 60 80 120 -2000 Grad.Rate partial for Grad.Rate d) The most notable non-linear relationships are with Enroll, Accept, Apps and Terminal, as their partial effects do not follow the response variable Expend Question 4 Based on these models, it seems like our models will output good predictions, but they can be further improved with transformations. Although the MSE is quite high, all models attained the lowest MSE possible. Their error rates were quite similar as they all fell in the (8,000,000 - 11,000,000) range except for my PCR model. #1. Least Squares #MSE Test Train # 9658396 8423397 #2. Ridge model with the best lambda #MSE Test Train # 9899790 8729050 #3. Lasso model with the best lambda #MSE Test Train # 9448019 8468312 17
#4. PCR model (Worst Model) #MSE Test Train # 12435645 10734336 #5. PLS model #MSE Test Train # 9633312 8455719 #6. BIC model #MSE Test Train # 9701061 8515039 #7. GAM #MSE Test Train # 9636946 8495993 18
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help