# PCA with a drinking habits data set, sadly, source of the data is unknown - if you have information on the source please inform Mette.Langaas@math.ntnu.no
# p=6 variables: the (average pr inhabitant?) consumption of coffee, tea, cocoa, liqueur, wine, beer
# n=24 observations from 24 different countries

drink <- read.csv("http://www.math.ntnu.no/~bakke/TMA4267/2015V/drikke.TXT",sep=",",header=TRUE)
drink
# missing data on three countries- why?
drink <- na.omit(drink)
drink

# now for PCA
?prcomp
pca <- prcomp(drink,scale=TRUE) # scale: variables are scaled

# coefficients of PCs, called rotations or loadings
pca$rotation

# done "manually":
s <- cor(drink) # cor, not cov, since covariates are scaled
s
cov(scale(drink)) # the same
eigen(s) # same as pcs$rotations - opposite sign of some vectors

# sample variances of the PCs are the eigenvalues of s
pca$sdev
pca$sdev^2 # compare with eigenvalues above

# scores - values of the PCs
pca$x
cov(pca$x) # offdiag=0, ondiag=pca$sdev^2
pca$sdev^2

# plot the scores of two first PCs against each other
plot(pca$x[,1],pca$x[,2],type="n")
text(pca$x[,1],pca$x[,2],rownames(drink),cex=0.6)
# The same with also loading of two first PCs plotted
biplot(pca,scale=0,cex=0.6) # scale=0: arrows scaled to represent the loadings
# Great Britain has large PC1, for which tea is influental.
# Italy has large negative PC1, for which wine is influental.
# The Netherlands has large PC2, for which coffee and tea is influental.
# East European countries has large negative PC2, for which liqueur (should be liquor = spirits?)

# the same for other pairs of PCs:
biplot(pca,choices=c(1,3),scale=0,cex=0.6) # Ireland: Much tea, little liquor, Italy: Much wine, little liquor
biplot(pca,choices=c(2,3),scale=0,cex=0.6) # Poland: Much liqour, little cocoa or coffee
biplot(pca,choices=c(1,4),scale=0,cex=0.6)
biplot(pca,choices=c(1,5),scale=0,cex=0.6)

# How many PCs do we need to capture a large part of the variability in the data?
summary(pca) # 4 to get 84%
plot(pca) # screeplot - says 4?

# the effect of scaling
pca.noscale <- prcomp(drink,scale=FALSE)
summary(pca.noscale) # 1 or 2 PC enough
biplot(pca.noscale,scale=0)
# beer and wine dominate

# Now, turn to PCR - principal component regression
# Acid rain data set: pH in Nowegian lakes (y) vs. sulfate, nitrate, calcium, aluminium and organic content (x1, ... x5), area of lake (x6) and location (x7 = 0, Telemark, or x7 = 1, Trøndelag)? Data from Statens forurensningstilsyn (1986). Here 26 random lakes from Telemark and Trøndelag out of 1005 lakes 

acidrain <- read.table("http://www.math.ntnu.no/~mettela/TMA4267/Data/acidrain.txt",header=TRUE)

pcaacid <- prcomp(acidrain[,-1],scale=TRUE) # remove y column
biplot(pcaacid,scale=0)
plot(pcaacid) # 4 PCs?
summary(pcaacid) # 3 PCs?
pcaacid$rotation

z <- pcaacid$x[,1:3]
z
fit <- lm(y~z,data=acidrain) # only y used from acidrain
summary(fit)

# for interpretation, back to original data
betas <- pcaacid$rotation[,1:3]%*%matrix(fit$coeff[2:4],ncol=1) # first matrix transpose of Phi
betas

# compared to the findings for the full model, remember that the xs were scaled
#x <- apply(acidrain[,-1],2,scale)
x <- scale(acidrain[,-1])
#y <- ds[,1]
fullfit <- lm(y~x,data=acidrain)
summary(fullfit)
plot(fullfit$coeff[-1],betas)
abline(0,1) # line, intercept 0, slope 1
cbind(fullfit$coeff[-1],betas)