# PCA with a drinking habits data set, sadly, source of the data is unknown - if you have information on the source please inform Mette.Langaas@math.ntnu.no # p=6 variables: the (average pr inhabitant?) consumption of coffee, tea, cocoa, liqueur, wine, beer # n=24 observations from 24 different countries drink <- read.csv("http://www.math.ntnu.no/~bakke/TMA4267/2015V/drikke.TXT",sep=",",header=TRUE) drink # missing data on three countries- why? drink <- na.omit(drink) drink # now for PCA ?prcomp pca <- prcomp(drink,scale=TRUE) # scale: variables are scaled # coefficients of PCs, called rotations or loadings pca$rotation # done "manually": s <- cor(drink) # cor, not cov, since covariates are scaled s cov(scale(drink)) # the same eigen(s) # same as pcs$rotations - opposite sign of some vectors # sample variances of the PCs are the eigenvalues of s pca$sdev pca$sdev^2 # compare with eigenvalues above # scores - values of the PCs pca$x cov(pca$x) # offdiag=0, ondiag=pca$sdev^2 pca$sdev^2 # plot the scores of two first PCs against each other plot(pca$x[,1],pca$x[,2],type="n") text(pca$x[,1],pca$x[,2],rownames(drink),cex=0.6) # The same with also loading of two first PCs plotted biplot(pca,scale=0,cex=0.6) # scale=0: arrows scaled to represent the loadings # Great Britain has large PC1, for which tea is influental. # Italy has large negative PC1, for which wine is influental. # The Netherlands has large PC2, for which coffee and tea is influental. # East European countries has large negative PC2, for which liqueur (should be liquor = spirits?) # the same for other pairs of PCs: biplot(pca,choices=c(1,3),scale=0,cex=0.6) # Ireland: Much tea, little liquor, Italy: Much wine, little liquor biplot(pca,choices=c(2,3),scale=0,cex=0.6) # Poland: Much liqour, little cocoa or coffee biplot(pca,choices=c(1,4),scale=0,cex=0.6) biplot(pca,choices=c(1,5),scale=0,cex=0.6) # How many PCs do we need to capture a large part of the variability in the data? summary(pca) # 4 to get 84% plot(pca) # screeplot - says 4? # the effect of scaling pca.noscale <- prcomp(drink,scale=FALSE) summary(pca.noscale) # 1 or 2 PC enough biplot(pca.noscale,scale=0) # beer and wine dominate # Now, turn to PCR - principal component regression # Acid rain data set: pH in Nowegian lakes (y) vs. sulfate, nitrate, calcium, aluminium and organic content (x1, ... x5), area of lake (x6) and location (x7 = 0, Telemark, or x7 = 1, Trøndelag)? Data from Statens forurensningstilsyn (1986). Here 26 random lakes from Telemark and Trøndelag out of 1005 lakes acidrain <- read.table("http://www.math.ntnu.no/~mettela/TMA4267/Data/acidrain.txt",header=TRUE) pcaacid <- prcomp(acidrain[,-1],scale=TRUE) # remove y column biplot(pcaacid,scale=0) plot(pcaacid) # 4 PCs? summary(pcaacid) # 3 PCs? pcaacid$rotation z <- pcaacid$x[,1:3] z fit <- lm(y~z,data=acidrain) # only y used from acidrain summary(fit) # for interpretation, back to original data betas <- pcaacid$rotation[,1:3]%*%matrix(fit$coeff[2:4],ncol=1) # first matrix transpose of Phi betas # compared to the findings for the full model, remember that the xs were scaled #x <- apply(acidrain[,-1],2,scale) x <- scale(acidrain[,-1]) #y <- ds[,1] fullfit <- lm(y~x,data=acidrain) summary(fullfit) plot(fullfit$coeff[-1],betas) abline(0,1) # line, intercept 0, slope 1 cbind(fullfit$coeff[-1],betas)