This article contains the R code for a brief analysis of the "hills" data from the R library "MASS", as well as a (german) commentary. The data set is composed of the record times and related key data for 35 scottish hill races,
which took place in the year 1984.
While the first part focuses on analyzing the data itself, the second part is a comparison of linear regression models, using ANOVA and stepwise orthogonal decomposition.
Download(R code as .zip).
#Knowledgedump.org - Analyse der "hills"-Daten aus dem Paket "MASS" #1) Laden & allgemeine Betrachtung der Daten library(MASS) hills #Fuer ersten Blick auf die Daten.. help(hills) #Informationen zu den Attributen/Datenmenge str(hills) #Liefert kurzen Ueberblick summary(hills) #Liefert grobe Zusammenfassung der Attributwerte oldpar <- par(mfrow=c(1,2)) plot(hills$time,hills$dist,xlab="Bestzeit in Minuten", ylab="Distanz in Meilen",main="Zeit gg. Distanz") plot(hills$time,hills$climb,xlab="Bestzeit in Minuten", ylab="Anstieg in Fuss",main="Zeit gg. Anstieg") #2. Lineare Regression lmhills <- lm( time ~ dist + climb, data = hills) #definieren des Objekts der Klasse "lm" summary(lmhills) par(mfrow=c(1,1)) plot(lmhills, which =1) rownames(hills) hills[18,] hills2<-hills hills2[18,3] <- 18.65 lmhills2 <- lm( time ~ dist + climb, data = hills2) lmhills2 plot(lmhills2, which =1) hills[7,] par(mfrow=c(1,1)) plot(lmhills2, which =1) leverage<-hatvalues(lmhills2) leverage stdresiduals<- stdres(lmhills2) par(mfrow=c(1,2)) plot(leverage,stdresiduals,xlab="standardisierte Residuen", ylab="Hebelgewicht",main="Hebelgewicht gg. Residuen") plot(lmhills2, which=5) par(mfrow=c(1,1)) rstud<-rstudent(lmhills2) rstud plot(rstudent(lmhills2)) text(7,rstud[7],rownames(hills)[7],pos=1) par(mfrow=c(1,2)) dff<-dffits(lmhills2) dff plot(dff) text(7,dff[7],rownames(hills)[7],pos=1) cooks<-cooks.distance(lmhills2) cooks plot(cooks) text(7,cooks[7],rownames(hills)[7],pos=1) par(mfrow=c(2,2)) hills3<-hills2[-7,] lmhills3<-lm(time ~ dist + climb, data = hills3) summary(lmhills3) plot(lmhills2, which=c(1,5)) plot(lmhills3, which=c(1,5)) par(mfrow=c(1,1)) plot(hatvalues(lmhills3), main="Hebelgewichte nach Index", ylab="Hebelgewicht", xlab="Index") identify(hatvalues(lmhills3)) plot(residuals(lmhills3), main="Residuen nach Index", ylab="Residuum", xlab="Index") identify(residuals(lmhills3))
#Knowledgedump.org - Vergleich verschiedener linearer Regressionsmodelle fuer die "hills"-Daten: # time ~ 1 # time ~ 1 + dist # time ~ 1 + dist + climb #1 Visualisierung der Modelleigenschaften library(MASS) #source("C:\\..............\\plot.lm2.R") ##plot.lm2 laden! lm1<-lm(time~1, data=hills, y=TRUE) lm1d<-lm(time~1+dist, data=hills, y=TRUE) lm1dc<-lm(time~1+dist+climb, data=hills,y=TRUE) summary(lm1) summary(lm1d) summary(lm1dc) par(mfrow=c(1,2)) plot.lm2(lm1, which=1:2) par(mfrow=c(1,3)) plot.lm2(lm1,which=1) plot.lm2(lm1d,which=1) plot.lm2(lm1dc,which=1) par(mfrow=c(1,3)) plot.lm2(lm1,which=2) plot.lm2(lm1d,which=2) plot.lm2(lm1dc,which=2) par(mfrow=c(1,3)) plot.lm2(lm1,which=3) plot.lm2(lm1d,which=3) plot.lm2(lm1dc,which=3) par(mfrow=c(1,3)) plot.lm2(lm1,which=4) plot.lm2(lm1d,which=4) plot.lm2(lm1dc,which=4) par(mfrow=c(1,3)) plot.lm2(lm1,which=5) ##Alle Hutwerte identisch -> kein plot plot.lm2(lm1d,which=5) plot.lm2(lm1dc,which=5) par(mfrow=c(1,3)) plot.lm2(lm1,which=6) plot.lm2(lm1d,which=6) plot.lm2(lm1dc,which=6) par(mfrow=c(1,3)) plot.lm2(lm1,which=7) ##Ranked Fit - Vorsicht Rundungsfehler! plot.lm2(lm1d,which=7) plot.lm2(lm1dc,which=7) par(mfrow=c(1,3)) plot.lm2(lm1,which=7, rank.type="response") plot.lm2(lm1d,which=7, rank.type="response") plot.lm2(lm1dc,which=7, rank.type="response") par(mfrow=c(1,2)) plot(rank(hills[,2]),studres(lm1d), xlab="ranked climb", ylab="Stud. Res.", main="Stud. Res. gg. ranked climb", sub="Modell lm1d") panel.smooth(rank(hills[,2]),studres(lm1d)) plot(rank(hills[,2]),studres(lm1dc), xlab="ranked climb", ylab="Stud. Res.", main="Stud. Res. gg. ranked climb", sub="Modell lm1dc") panel.smooth(rank(hills[,2]),studres(lm1dc)) #2 Modellwahl mit Hilfe von Varianzanalyse (ANOVA) anova(lm1dc) #3 Modellwahl mit Schrittweiser Orthogonalzerlegung #lm1 offensichtlich dem Modell lm1d unterlegen -> was ist mit lm1d gg. 1m1dc? par(mfrow=c(1,1)) lmc.d<-lm(climb~dist, data=hills) plot(hills$dist, hills$climb, xlab="Distanz",ylab="Anstieg", main="Distanz vs. Anstieg") abline(lmc.d) lm1d.orth<-lm(lm1d$residuals ~ 0 + lmc.d$residuals, data=hills) #Orthokomplement zu lm1d summary(lm1d.orth)
# Knowledgedump.org - adjusted plot.lm() function # Adds some functionalities to the default plot.lm() function, # such as the use of studentized residuals instead of normal, two new plot types # and an option to highlight subsets of the data. # File src/library/stats/R/plot.lm.R # Part of the R package, http://www.R-project.org # # Copyright (C) 1995-2015 The R Core Team # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # A copy of the GNU General Public License is available at # http://www.r-project.org/Licenses/ ###ANMERKUNG: Mit # "eingeklammerte" oder rechts markierte Zeilen wurden modifiziert oder neu geschrieben plot.lm2 <- function (x, which = c(1L:3L,5L), ## was which = 1L:4L, caption = list("Std. Residuals vs Fitted", "Normal Q-Q", ################################# "Scale-Location", "Cook's distance", "Residuals vs Leverage", expression("Cook's dist vs Leverage " * h[ii] / (1 - h[ii]))), panel = if(add.smooth) panel.smooth else points, sub.caption = NULL, main = "", ask = prod(par("mfcol")) < length(which) && dev.interactive(), ..., id.n = 3, labels.id = names(residuals(x)), cex.id = 0.75, qqline = TRUE, cook.levels = c(0.5, 1.0), add.smooth = getOption("add.smooth"), label.pos = c(4,2), cex.caption = 1, cex.oma.main = 1.25, used.resp = NULL, rank.type="fit", scope = NULL, subset = NULL, ############################ subs.col = "blue", subs.type = "o")############################################# { dropInf <- function(x, h) { if(any(isInf <- h >= 1.0)) { warning(gettextf("not plotting observations with leverage one:\n %s", paste(which(isInf), collapse=", ")), call. = FALSE, domain = NA) x[isInf] <- NaN } x } if (!inherits(x, "lm")) stop("use only with \"lm\" objects") if(!is.numeric(which) || any(which < 1) || any(which > 8))########################### stop("'which' must be in 1:8")##################################################### isGlm <- inherits(x, "glm") show <- rep(FALSE, 8)################################################################ show[which] <- TRUE r <- residuals(x) yh <- predict(x) # != fitted() for glm w <- weights(x) if(!is.null(w)) { # drop obs with zero wt: PR#6640 wind <- w != 0 r <- r[wind] yh <- yh[wind] w <- w[wind] labels.id <- labels.id[wind] } n <- length(r) if (any(show[1L:6L])) { ################################################### s <- if (inherits(x, "rlm")) x$s else if(isGlm) sqrt(summary(x)$dispersion) else sqrt(deviance(x)/df.residual(x)) hii <- lm.influence(x, do.coef = FALSE)$hat if (any(show[4L:6L])) { cook <- if (isGlm) cooks.distance(x) else cooks.distance(x, sd = s, res = r) } } if (any(show[1L:3L])) { ################################################## ylab23 <- if(isGlm) "Std. deviance resid." else "Standardized residuals" r.w <- if (is.null(w)) r else sqrt(w) * r ## NB: rs is already NaN if r=0, hii=1 rs <- dropInf( r.w/(s * sqrt(1 - hii)), hii ) } if (any(show[5L:6L])) { # using 'leverages' r.hat <- range(hii, na.rm = TRUE) # though should never have NA isConst.hat <- all(r.hat == 0) || diff(r.hat) < 1e-10 * mean(hii, na.rm = TRUE) } if (any(show[c(1L, 3L)])) l.fit <- if (isGlm) "Predicted values" else "Fitted values" if (is.null(id.n)) id.n <- 0 else { id.n <- as.integer(id.n) if(id.n < 0L || id.n > n) stop(gettextf("'id.n' must be in {1,..,%d}", n), domain = NA) } ###################################################################################### # # if (show[7L]) { if (all(rank.type!=c("fit", "response"))) {stop("rank.type must be 'fit' or 'response'")} if (rank.type == "response") { if (is.null(used.resp)) { if (is.null(x$y)) {stop("Used response values can't be NULL.")} ranked.data <- rank(x$y) } else { if (!is.null(x$y)) {if (any(used.resp!=x$y)) {stop("used.resp not equal to response values saved in the linear model.")} } if (!is.numeric(used.resp)) {stop("used.resp must be of numeric type.")} if (!is.null(dim(used.resp))) {warning("used.resp is not a vector.")} ranked.data <- rank(used.resp) } } else {ranked.data <- rank(x$fit)} STUD <- rstudent(x) if (length(levels(factor(ranked.data)))!=n) {warning("Some values in ranked data are identical")} ##rounding can lead to mistakes in ranked data if (length(ranked.data)!=n) {stop("Wrong amount of used response values.")} } # # ##################################################################################### ##################################################################################### # # if (show[8L]) { if (is.null(scope)) {stop("The scope formula can't be NULL.")} if (!is.language(scope)) {stop("scope must be of language type")} ad1 <- add1(x,scope) } # # ##################################################################################### ##################################################################################### # # if (!is.null(subset)){ if(length(subset)>n) {stop("subset must be smaller than used data")} if(!is.numeric(subset)) {stop("subset must be of numeric type")} if(all(subs.col!=colors())) {stop("invalid subset color")} if(all(subs.type!=c("p","l","b","c","o","s","S","h","n"))) {stop("invalid subset type")} } # # ##################################################################################### if(id.n > 0L) { ## label the largest residuals if(is.null(labels.id)) labels.id <- paste(1L:n) iid <- 1L:id.n show.r <- sort.list(abs(r), decreasing = TRUE)[iid] if(show[1L]) {show.STDR <- sort.list(abs(rs), decreasing = TRUE)[iid]}############################################### if(show[7L]) {show.STUD <- sort.list(abs(STUD), decreasing = TRUE)[iid]}############################################## if(any(show[2L:3L])) show.rs <- sort.list(abs(rs), decreasing = TRUE)[iid] text.id <- function(x, y, ind, adj.x = TRUE) { labpos <- if(adj.x) label.pos[1+as.numeric(x > mean(range(x)))] else 3 text(x, y, labels.id[ind], cex = cex.id, xpd = TRUE, pos = labpos, offset = 0.25) } } getCaption <- function(k) # allow caption = "" , plotmath etc if(length(caption) < k) NA_character_ else as.graphicsAnnot(caption[[k]]) if(is.null(sub.caption)) { ## construct a default: cal <- x$call if (!is.na(m.f <- match("formula", names(cal)))) { cal <- cal[c(1, m.f)] names(cal)[2L] <- "" # drop " formula = " } cc <- deparse(cal, 80) # (80, 75) are ``parameters'' nc <- nchar(cc[1L], "c") abbr <- length(cc) > 1 || nc > 75 sub.caption <- if(abbr) paste(substr(cc[1L], 1L, min(75L, nc)), "...") else cc[1L] } one.fig <- prod(par("mfcol")) == 1 if (ask) { oask <- devAskNewPage(TRUE) on.exit(devAskNewPage(oask)) } ##---------- Do the individual plots : ---------- ############################################################################################ # # if (show[1L]) { ylim <- range(rs, na.rm=TRUE) if(id.n > 0) ylim <- extendrange(r = ylim, f = 0.08) dev.hold() plot(yh, rs, xlab = l.fit, ylab = "Std. Residuals", main = main, ylim = ylim, type = "n", ...) panel(yh, rs, ...) if(!is.null(subset)){ panel(yh[subset], rs[subset], col.smooth = subs.col, type=subs.type, pch=8) } if (one.fig) title(sub = sub.caption, ...) mtext(getCaption(1), 3, 0.25, cex = cex.caption) if(id.n > 0) { y.id <- rs[show.STDR] y.id[y.id < 0] <- y.id[y.id < 0] - strheight(" ")/3 text.id(yh[show.STDR], y.id, show.STDR) } abline(h = 0, lty = 3, col = "gray") dev.flush() } # # ##################################################################################### if (show[2L]) { ## Normal ylim <- range(rs, na.rm=TRUE) ylim[2L] <- ylim[2L] + diff(ylim) * 0.075 dev.hold() qq <- qqnorm(rs, main = main, ylab = ylab23, ylim = ylim, ...) if (qqline) qqline(rs, lty = 3, col = "gray50") if (one.fig) title(sub = sub.caption, ...) mtext(getCaption(2), 3, 0.25, cex = cex.caption) ##################################################################################### # # if(!is.null(subset)){ panel(qq$x[subset],rs[subset],type="n", pch=8) } # # ##################################################################################### if(id.n > 0) text.id(qq$x[show.rs], qq$y[show.rs], show.rs) dev.flush() } if (show[3L]) { sqrtabsr <- sqrt(abs(rs)) ylim <- c(0, max(sqrtabsr, na.rm=TRUE)) yl <- as.expression(substitute(sqrt(abs(YL)), list(YL=as.name(ylab23)))) yhn0 <- if(is.null(w)) yh else yh[w!=0] dev.hold() plot(yhn0, sqrtabsr, xlab = l.fit, ylab = yl, main = main, ylim = ylim, type = "n", ...) panel(yhn0, sqrtabsr, ...) ##################################################################################### # # if(!is.null(subset)){ panel(yhn0[subset], sqrtabsr[subset], col.smooth = subs.col, type=subs.type, pch=8) } # # ##################################################################################### if (one.fig) title(sub = sub.caption, ...) mtext(getCaption(3), 3, 0.25, cex = cex.caption) if(id.n > 0) text.id(yhn0[show.rs], sqrtabsr[show.rs], show.rs) dev.flush() } if (show[4L]) { ## Cook's Distances if(id.n > 0) { show.r <- order(-cook)[iid]# index of largest 'id.n' ones ymx <- cook[show.r[1L]] * 1.075 } else ymx <- max(cook, na.rm = TRUE) dev.hold() plot(cook, type = "h", ylim = c(0, ymx), main = main, xlab = "Obs. number", ylab = "Cook's distance", ...) ##################################################################################### # # if(!is.null(subset)){ panel(subset, cook[subset], col.smooth = subs.col, type="n", pch=8) } # # ##################################################################################### if (one.fig) title(sub = sub.caption, ...) mtext(getCaption(4), 3, 0.25, cex = cex.caption) if(id.n > 0) text.id(show.r, cook[show.r], show.r, adj.x=FALSE) dev.flush() } if (show[5L]) { ylab5 <- if (isGlm) "Std. Pearson resid." else "Standardized residuals" r.w <- residuals(x, "pearson") if(!is.null(w)) r.w <- r.w[wind] # drop 0-weight cases rsp <- dropInf( r.w/(s * sqrt(1 - hii)), hii ) ylim <- range(rsp, na.rm = TRUE) if (id.n > 0) { ylim <- extendrange(r = ylim, f = 0.08) show.rsp <- order(-cook)[iid] } do.plot <- TRUE if(isConst.hat) { ## leverages are all the same if(missing(caption)) # set different default caption[[5L]] <- "Constant Leverage:\n Residuals vs Factor Levels" ## plot against factor-level combinations instead aterms <- attributes(terms(x)) ## classes w/o response dcl <- aterms$dataClasses[ -aterms$response ] facvars <- names(dcl)[dcl %in% c("factor", "ordered")] mf <- model.frame(x)[facvars]# better than x$model if(ncol(mf) > 0) { dm <- data.matrix(mf) ## #{levels} for each of the factors: nf <- length(nlev <- unlist(unname(lapply(x$xlevels, length)))) ff <- if(nf == 1) 1 else rev(cumprod(c(1, nlev[nf:2]))) facval <- (dm-1) %*% ff xx <- facval # for use in do.plot section. dev.hold() plot(facval, rsp, xlim = c(-1/2, sum((nlev-1) * ff) + 1/2), ylim = ylim, xaxt = "n", main = main, xlab = "Factor Level Combinations", ylab = ylab5, type = "n", ...) ##################################################################################### # # if(!is.null(subset)){ panel(facval[subset],rsp[subset], col.smooth = subs.col, type=subs.type, pch=8) } # # ##################################################################################### axis(1, at = ff[1L]*(1L:nlev[1L] - 1/2) - 1/2, labels = x$xlevels[[1L]]) mtext(paste(facvars[1L],":"), side = 1, line = 0.25, adj=-.05) abline(v = ff[1L]*(0:nlev[1L]) - 1/2, col="gray", lty="F4") panel(facval, rsp, ...) abline(h = 0, lty = 3, col = "gray") dev.flush() } else { # no factors message(gettextf("hat values (leverages) are all = %s\n and there are no factor predictors; no plot no. 5", format(mean(r.hat))), domain = NA) frame() do.plot <- FALSE } } else { ## Residual vs Leverage xx <- hii ## omit hatvalues of 1. xx[xx >= 1] <- NA dev.hold() plot(xx, rsp, xlim = c(0, max(xx, na.rm = TRUE)), ylim = ylim, main = main, xlab = "Leverage", ylab = ylab5, type = "n", ...) panel(xx, rsp, ...) ##################################################################################### # # if(!is.null(subset)){ panel(hii[subset],rsp[subset], col.smooth = subs.col, type=subs.type, pch=8) } # # ##################################################################################### abline(h = 0, v = 0, lty = 3, col = "gray") if (one.fig) title(sub = sub.caption, ...) if(length(cook.levels)) { p <- length(coef(x)) usr <- par("usr") hh <- seq.int(min(r.hat[1L], r.hat[2L]/100), usr[2L], length.out = 101) for(crit in cook.levels) { cl.h <- sqrt(crit*p*(1-hh)/hh) lines(hh, cl.h, lty = 2, col = 2) lines(hh,-cl.h, lty = 2, col = 2) } legend("bottomleft", legend = "Cook's distance", lty = 2, col = 2, bty = "n") xmax <- min(0.99, usr[2L]) ymult <- sqrt(p*(1-xmax)/xmax) aty <- c(-sqrt(rev(cook.levels))*ymult, sqrt(cook.levels)*ymult) axis(4, at = aty, labels = paste(c(rev(cook.levels), cook.levels)), mgp = c(.25,.25,0), las = 2, tck = 0, cex.axis = cex.id, col.axis = 2) } dev.flush() } # if(const h_ii) .. else .. if (do.plot) { mtext(getCaption(5), 3, 0.25, cex = cex.caption) if (id.n > 0) { y.id <- rsp[show.rsp] y.id[y.id < 0] <- y.id[y.id < 0] - strheight(" ")/3 text.id(xx[show.rsp], y.id, show.rsp) } } } if (show[6L]) { g <- dropInf( hii/(1-hii), hii ) ymx <- max(cook, na.rm = TRUE)*1.025 dev.hold() plot(g, cook, xlim = c(0, max(g, na.rm=TRUE)), ylim = c(0, ymx), main = main, ylab = "Cook's distance", xlab = expression("Leverage " * h[ii]), xaxt = "n", type = "n", ...) panel(g, cook, ...) ##################################################################################### # # if(!is.null(subset)){ panel(g[subset],cook[subset], col.smooth = subs.col, type=subs.type, pch=8) } # # ##################################################################################### ## Label axis with h_ii values athat <- pretty(hii) axis(1, at = athat/(1-athat), labels = paste(athat)) if (one.fig) title(sub = sub.caption, ...) p <- length(coef(x)) bval <- pretty(sqrt(p*cook/g), 5) usr <- par("usr") xmax <- usr[2L] ymax <- usr[4L] for(i in seq_along(bval)) { bi2 <- bval[i]^2 if(ymax > bi2*xmax) { xi <- xmax + strwidth(" ")/3 yi <- bi2*xi abline(0, bi2, lty = 2) text(xi, yi, paste(bval[i]), adj = 0, xpd = TRUE) } else { yi <- ymax - 1.5*strheight(" ") xi <- yi/bi2 lines(c(0, xi), c(0, yi), lty = 2) text(xi, ymax-0.8*strheight(" "), paste(bval[i]), adj = 0.5, xpd = TRUE) } } ## axis(4, at=p*cook.levels, labels=paste(c(rev(cook.levels), cook.levels)), ## mgp=c(.25,.25,0), las=2, tck=0, cex.axis=cex.id) mtext(getCaption(6), 3, 0.25, cex = cex.caption) if (id.n > 0) { show.r <- order(-cook)[iid] text.id(g[show.r], cook[show.r], show.r) } dev.flush() } ###################################################################################### # # if (show[7L]) { xlabel<-if(rank.type=="fit") "Ranked Fit" else "Ranked Response" plot(ranked.data, STUD, xlab=xlabel, ylab="Studentized Residuals", type="n", ylim=range(STUD,na.rm=TRUE), main="Ranked gg. Stud. Residuals") panel(ranked.data, STUD) if(!is.null(subset)){ panel(ranked.data[subset],STUD[subset], col.smooth = subs.col, type=subs.type, pch=8) } if(id.n > 0) { y.id <- STUD[show.STUD] y.id[y.id < 0] <- y.id[y.id < 0] - strheight(" ")/3 text.id(ranked.data[show.STUD], y.id, show.STUD) } abline(h = 0, lty = 3, col = "gray") } # # ##################################################################################### ##################################################################################### # # if (show[8L]) { oldpar <- par(no.readonly=TRUE) par(mfrow=c(2,2), lab=c(nrow(ad1),5,7)) plot(ad1$Df, type="h", ylab="Df", xlab="Index", xlim=c(0.5,nrow(ad1)+0.5), ylim=c(0,max(ad1$Df[-1]))) title(main="added variable plots") plot(ad1$"Sum of Sq",type="h", ylab="Diff. of Res. squared", xlab="Index", xlim=c(0.5,nrow(ad1)+0.5), ylim=c(min(ad1$"Sum of Sq"[-1])*0.9,max(ad1$"Sum of Sq"[-1]))) par(lab=c(nrow(ad1),5,7)) plot(ad1$RSS, ylab="RSS", type="h", xlab="Index", xlim=c(0.5,nrow(ad1)+0.5), ylim=c(min(ad1$RSS)*0.9,max(ad1$RSS))) title(main="index 1 = no variables added") plot(ad1$AIC, ylab="AIC", type="h", xlab="Index", xlim=c(0.5,nrow(ad1)+0.5), ylim=c(min(ad1$AIC)*0.9,max(ad1$AIC))) par(oldpar) } # # ##################################################################################### if (!one.fig && par("oma")[3L] >= 1) mtext(sub.caption, outer = TRUE, cex = cex.oma.main) invisible() }