## On lecture 14 April 2015 # How does pH in Nowegian lakes depend on sulfate, nitrate, calcium, aluminium and organic content (x1, ... x5), area of lake (x6) and location (x7 = 0, Telemark, or x7 = 1, Trøndelag)? Data from Statens forurensningstilsyn (1986). Here 26 random lakes from Telemark and Trøndelag out of 1005 lakes have been drawn acidrain <- read.table("http://www.math.ntnu.no/~mettela/TMA4267/Data/acidrain.txt",header=TRUE) # Fitted full model previously: fitfull <- lm(y~.,data=acidrain) # lm: linear model summary(fitfull) # Ridge library(glmnet) x <- model.matrix(y~.-1,data=acidrain) # easy way to get the design matrix X y <- acidrain$y grid <- 10^seq(-4,2,length=100) # smaller lambda than default to match small coefficients in model ridge <- glmnet(x,y,alpha=0,lambda=grid) # alpha=0 gives ridge plot(ridge,xvar="lambda",label=TRUE) # how parameter estimates vary with lambda ridge$lambda # note that the grid is turned around fitfull$coeff # least squares estimates coef(ridge)[,100] # lambda approx 0 - same as ls estimates cvridge <- cv.glmnet(x,y,alpha=0,lambda=grid) # cross-validation cvridge$cvm # mean cross-validated error - one for each lambda # cvridge$lambda minlambda <- cvridge$lambda.min minindex <- which.min(cvridge$cvm) cvridge$lambda[minindex] selambda <- cvridge$lambda.1se # lambda.min: value of 'lambda' that gives minimum cvm # lambda.1se: largest value of 'lambda' such that error is within 1 standard error of the minimum. plot(cvridge) # coef(cvridge) coef(ridge,s=minlambda) coef(ridge,s=selambda) # Run previous lines several times. Different results. # Now we fit a lasso model; for this we use the default alpha=1 lasso <- glmnet(x,y) # default grid OK this time plot(lasso,xvar="lambda",label=TRUE) cvlasso=cv.glmnet(x,y) minlambda <- cvlasso$lambda.min selambda <- cvlasso$lambda.1se plot(cvlasso) coef(cvlasso) coef(lasso,s=minlambda) coef(lasso,s=selambda) # Run 7 previous lines several times. Different results.