## On lecture 14 April 2015
# How does pH in Nowegian lakes depend on sulfate, nitrate, calcium, aluminium and organic content (x1, ... x5), area of lake (x6) and location (x7 = 0, Telemark, or x7 = 1, Trøndelag)? Data from Statens forurensningstilsyn (1986). Here 26 random lakes from Telemark and Trøndelag out of 1005 lakes have been drawn 

acidrain <- read.table("http://www.math.ntnu.no/~mettela/TMA4267/Data/acidrain.txt",header=TRUE)

# Fitted full model previously:
fitfull <- lm(y~.,data=acidrain) # lm: linear model
summary(fitfull)

# Ridge

library(glmnet)
x <- model.matrix(y~.-1,data=acidrain) # easy way to get the design matrix X
y <- acidrain$y

grid <- 10^seq(-4,2,length=100) # smaller lambda than default to match small coefficients in model
ridge <- glmnet(x,y,alpha=0,lambda=grid) # alpha=0 gives ridge
plot(ridge,xvar="lambda",label=TRUE) # how parameter estimates vary with lambda
ridge$lambda # note that the grid is turned around
fitfull$coeff # least squares estimates
coef(ridge)[,100] # lambda approx 0 - same as ls estimates

cvridge <- cv.glmnet(x,y,alpha=0,lambda=grid) # cross-validation
cvridge$cvm # mean cross-validated error - one for each lambda
# cvridge$lambda
minlambda <- cvridge$lambda.min
minindex <- which.min(cvridge$cvm)
cvridge$lambda[minindex]
selambda <- cvridge$lambda.1se
# lambda.min: value of 'lambda' that gives minimum cvm
# lambda.1se: largest value of 'lambda' such that error is within 1 standard error of the minimum.
plot(cvridge)
# coef(cvridge)
coef(ridge,s=minlambda)
coef(ridge,s=selambda)
# Run previous lines several times. Different results.

# Now we fit a lasso model; for this we use the default alpha=1

lasso <- glmnet(x,y) # default grid OK this time
plot(lasso,xvar="lambda",label=TRUE)

cvlasso=cv.glmnet(x,y)
minlambda <- cvlasso$lambda.min
selambda <- cvlasso$lambda.1se
plot(cvlasso)
coef(cvlasso)
coef(lasso,s=minlambda)
coef(lasso,s=selambda)
# Run 7 previous lines several times. Different results.