hsmeth2 <- function(X, y, alpha = 0.05) { # This S-Plus function computes estimates based on the procedure Method # 2 of Hadi and Simonoff (1993) and finds the corresponding set of # outliers. This code was written by Merrill Heddy. # # Input parameters # y is an nx1 response vector # X is an nxp matrix of explantatory variables (no column of 1's assumed in input) # alpha is the significance level for the test in step 3.a. of the Main Algorithm # # Output parameters # outliers: the set of outlier observations # regline: lm object of rejection-plus-least squares regression estimate # # Reference: A.S. Hadi and J.S. Simonoff, "Procedures for the Identification of # Multiple Outliers in Linear Models" (1993), Journal of the American # Statistical Association, 88, 1264-1272. # Z <- cbind(X, y) n <- nrow(Z) p <- ncol(Z) - 1 medio <- floor((n + p - 1)/2) W <- Z %*% solve(chol(var(Z))) Wclust <- hclust(dist(W), "connected") Wmerge <- Wclust$merge Wtot <- rep(0, (n - 1)) for(i in 1:(n - 1)) { if((Wmerge[i, 1] <= 0) & (Wmerge[i, 2] <= 0)) { Wtot[i] <- 2 } else if((Wmerge[i, 1] <= 0) & (Wmerge[i, 2] >= 0)) { Wtot[i] <- 1 + Wtot[Wmerge[i, 2]] } else if((Wmerge[i, 1] >= 0) & (Wmerge[i, 2] <= 0)) { Wtot[i] <- 1 + Wtot[Wmerge[i, 1]] } else { Wtot[i] <- (Wtot[Wmerge[i, 1]] + Wtot[Wmerge[i, 2]]) } } Wref <- rep(0, (n - 1)) for(i in 1:2) { for(j in 1:(n - 1)) { if(Wmerge[j, i] > 0) { Wref[Wmerge[j, i]] <- j } } } Wbase <- matrix(0, (n - 1), 4) Wbase <- cbind(Wmerge, Wref, Wtot) # Figure which data points are extreme enough <- rep(0, 1) lrtrack <- matrix(0, (n - 1), 2) curloc <- rep(0, 1) exdata <- rep(0, n) curloc <- (n - 1) while(enough < (n - medio)) { # check to see if branches are data points; make them extreme if((Wbase[curloc, 1] < 0) && (lrtrack[curloc, 1] == 0)) { exdata[-1 * Wbase[curloc, 1]] <- 1 lrtrack[curloc, 1] <- 1 enough <- (enough + 1) if(enough == (n - medio)) next } if((Wbase[curloc, 2] < 0) && (lrtrack[curloc, 2] == 0)) { exdata[-1 * Wbase[curloc, 2]] <- 1 lrtrack[curloc, 2] <- 1 enough <- (enough + 1) if(enough == (n - medio)) next } # check if both branches are done # if yes go up to mother node if((lrtrack[curloc, 1] == 1) && (lrtrack[curloc, 2] == 1)) { curloc <- Wbase[curloc, 3] next } # if left branch done do the right branch if(lrtrack[curloc, 1] == 1) { lrtrack[curloc, 2] <- 1 curloc <- Wbase[curloc, 2] next } # if right branch done do the left branch if(lrtrack[curloc, 2] == 1) { lrtrack[curloc, 1] <- 1 curloc <- Wbase[curloc, 1] next } # if niether branch done use weights to choose if(Wbase[Wbase[curloc, 1], 4] <= Wbase[Wbase[curloc, 2], 4]) { lrtrack[curloc, 1] <- 1 curloc <- Wbase[curloc, 1] } else { lrtrack[curloc, 2] <- 1 curloc <- Wbase[curloc, 2] } } # end of while loop Extreme <- (1:n)[exdata == 1] notExtreme <- (1:n)[exdata == 0] Ind <- c(notExtreme, Extreme) t1 <- rep(0, n) tt <- rep(0, n) d <- rep(0, n) h <- rep(0, n) j <- medio X <- cbind(1, X) j <- medio rr <- c(0) while((rr == 0) & (j <= (n - 1))) { # The loop for The Main Algorithm Jnd <- Ind[1:j] Xs <- X[Jnd, ] ys <- y[Jnd] beta <- solve(t(Xs) %*% Xs) %*% t(Xs) %*% ys e <- y - X %*% beta sind <- sqrt(t(e[Jnd]) %*% e[Jnd]/(length(Jnd) - p)) iXs <- solve(t(Xs) %*% Xs) for(i in (j + 1):n) { h[Ind[i]] <- X[Ind[i], ] %*% iXs %*% X[Ind[i], ] t1[Ind[i]] <- e[Ind[i]]/sqrt(1 + h[Ind[i]]) } for(i in 1:j) { h[Ind[i]] <- X[Ind[i], ] %*% iXs %*% X[Ind[i], ] t1[Ind[i]] <- e[Ind[i]]/sqrt(1 - h[Ind[i]]) } tt[Ind] <- t1[Ind]/sind Ind <- order(abs(tt)) u <- sort(abs(tt)) if(u[j + 1] > qt(1 - alpha/(2 * (j + 1)), j - p)) { rr <- 1 } else { j <- j + 1 } } # End of the loop for The Main Algorithm step K <- (1:n)[abs(tt) > qt(1 - alpha/(2 * (j + 1)), j - p)] y[K] <- NA X <- X[, 2:ncol(X)] out <- lm(y ~ X, na.action = na.omit) return(list(outliers = K, regline = out)) } # Auxiliary function if(exists("det", mode="function", where=1)) rm(det)# just in case library(Matrix) #-- defines 'det' det.default <- function(x,...) det(as.Matrix(x),...)