hs97<-function(X, y, alpha = 0.05) { # This S-Plus function computes estimates based on the procedure # of Hadi and Simonoff (1997) and finds the corresponding set of # outliers. This code is based on code written by Merrill Heddy. # # This code requires the Matrix library of S-Plus; before calling it, # enter the command # library(Matrix) # If you do not have access to the Matrix library, you will need to make a few # changes. First, create the following determinant function: # det <- function(M) Re(prod(eigen(M, only.values = T)$values)) # The form Det is used in the Hadi94 code, so also add the command # Det <- det # You will need to change one line in the hs97 function: # ddet <- det(t(Xs) %*% Xs, logarithm = F)$modulus # to # ddet <- det(t(Xs) %*% Xs) # # Input parameters # y is an nx1 response vector # X is an nxp matrix of explantatory variables (no column of 1's assumed in input) # alpha is the significance level for the test in step 3.a. of the Main Algorithm # # Output parameters # outliers: the set of outlier observations # regline: lm object of rejection-plus-least squares regression estimate # # Reference: A.S. Hadi and J.S. Simonoff, "A More Robust Outlier Identifier # for Regression Data" (1997), Bulletin of the International Statistical # Institute, 281-282. # X <- cbind(1, X) n <- nrow(X) p <- ncol(X) # # Using Hadi's (1994) robust distances as step 0 of algorithm # d <- Hadi94(X[, 2:p], alpha=alpha)$D2[, "rob.D"] t1 <- rep(0, length = n) tt <- rep(0, length = n) Ind <- order(d) medio <- floor((n + p - 1)/2) j <- p + 1 bdet<-0 while(bdet==0){ Jnd <- Ind[1:j] Xs <- X[Jnd, ] ddet<-det(t(Xs) %*% Xs,logarithm=F)$modulus if (abs(ddet<1.e-13)) j<-(j+1) else bdet<-1} # Beginning of while loop to find initial subset M while(j <= medio) { # beginning of while loop for finding subset M Jnd <- Ind[1:j] Xs <- X[Jnd, ] ys <- y[Jnd] beta <- solve(t(Xs) %*% Xs) %*% t(Xs) %*% ys e <- y - X %*% beta sind <- sqrt(t(e[Jnd]) %*% e[Jnd]/(length(Jnd) - p)) iXs <- solve(t(Xs) %*% Xs) h <- diag(X %*% iXs %*% t(X)) t1[Ind[1:j]] <- e[Ind[1:j]]/sqrt(1 - h[Ind[1:j]]) t1[Ind[(j+1):n]]<- e[Ind[(j + 1):n]]/sqrt(1 + h[Ind[(j + 1):n]]) Ind <- order(abs(t1)) j <- (j + 1) } # end of while loop for finding subset M rr <- c(0) while((rr == 0) & (j <= (n - 1))) { # The loop for The Main Algorithm Jnd <- Ind[1:j] Xs <- X[Jnd, ] ys <- y[Jnd] beta <- solve(t(Xs) %*% Xs) %*% t(Xs) %*% ys e <- y - X %*% beta sind <- sqrt(t(e[Jnd]) %*% e[Jnd]/(length(Jnd) - p)) iXs <- solve(t(Xs) %*% Xs) for(i in (j + 1):n) { h[Ind[i]] <- X[Ind[i], ] %*% iXs %*% X[Ind[i], ] t1[Ind[i]] <- e[Ind[i]]/sqrt(1 + h[Ind[i]]) } for(i in 1:j) { h[Ind[i]] <- X[Ind[i], ] %*% iXs %*% X[Ind[i], ] t1[Ind[i]] <- e[Ind[i]]/sqrt(1 - h[Ind[i]]) } tt[Ind] <- t1[Ind]/sind Ind <- order(abs(tt)) u <- sort(abs(tt)) if(u[j + 1] > qt(1 - alpha/(2 * (j + 1)), j - p)) { rr <- 1 } else { j <- j + 1 } } # End of the loop for The Main Algorithm step K <- (1:n)[abs(tt) > qt(1 - alpha/(2 * (j + 1)), j - p)] y[K] <- NA X <- X[, 2:ncol(X)] out <- lm(y ~ X, na.action = na.omit) return(list(outliers = K, regline = out)) } Hadi94 <- function(X, alpha = 0.05, rc.tol = 1e-7, print.diag = FALSE) { ## ------------------------------------------------------------------------- ## Hadi, Ali S. (1994), "A Modification of a Method for the ## Detection of Outliers in Multivariate Samples," JRSS(B), 2, 393-396 ## -------------------------------------------------------------------------- ## rc.tol : tolerance for reciprocal condition number of S, rcond(S). ## -------------------------------------------------------------------------- call <- match.call() X <- as.matrix(X)# does this the right thing for data.frames w/ factors? n <- dim(X) [1] p <- dim(X) [2] h <- (n + p + 1) %/% 2 if(n <= 3*p+1) stop("Hadi 94: Not enough observations (n <= 3p+1)") ##-- NOTE (MMae): n > 3p+1 ==> h >= 2p+1 id <- 1:n cf <- (1 + (p + 1)/(n - p) + 2/(n - 1 - 3*p) )^2 #-- correction factor ##cf <- (1 + (p + 1)/(n - p) + 1/(n - p - h) )^2 qChi <- qchisq(1- alpha/n, p) # Bonferroni corrected critical value tol <- 10^-min(p+5, 12)#- not really used anymore... prt.diag <- if(as.logical(print.diag)) function(txt, h,n,Z, do.Z = TRUE) { cat(" ..H94:", txt, "; h=", h) if(do.Z) cat(". Z[]=",Z[1:min(n,12)], if(n>12)"..") cat("\n") } else function(...) NULL ## ------------------------------------------------------------- ##**** Compute Mahalanobis distances **** ## ------------------------------------------------------------- Sx <- var(X) dS <- Det(Sx) rcS<- Rcond(Sx) if (dS < tol || rcS < rc.tol) { prt.diag(paste("MD: `near singular'", if(rcS >= rc.tol) "-- still ok: 1/cond(Sx)=",format(rcS)), h=h,n=n,Z=0, do.Z=FALSE) if(rcS < rc.tol) stop(paste("Hadi94, MD: 1/cond(Sx) < rc.tol=",format(rc.tol), "; cond(Sx)=",format(rcS))) } mX <- apply(X, 2, mean) MD <- mahalanobis(X, mX, Sx) ##\ will be mah.out <- id[MD >= qChi] ##/ returned ## ------------------------------------------------------------- ## **** Step 0 **** ## ------------------------------------------------------------- ## **** Compute Di(Cm, Sm) medX <- apply(X, 2, median) ##NO ! medX <- t(array(medX, dim = c(n, p))) S <- crossprod(X-medX)/(n-1) ## Y <- X - medX; S <- ((n - 1)^-1)*(t(Y) %*% Y) Z <- sort.list(D <- mahalanobis(X, medX, S)) prt.diag("after Mahala",h,n,Z) ## ------------------------------------------------------------- ## **** Compute Di(Cv, Sv) using (>=) h obs. === Step 0, (b) == repeat { S <- var(Y <- as.matrix(X[Z[1:h], ])) if (Rcond(S) > rc.tol) { D <- mahalanobis(X, apply(Y, 2, mean), S) Z <- sort.list(D) break } else h <- h + 1 } prt.diag("after step 0",h,n,Z) ## ------------------------------------------------------------- ## **** Step 1 **** (we know that h >= 2p + 1) ## ------------------------------------------------------------- for(r in (p+1):h) { S <- var(Y <- as.matrix(X[Z[1:r],])) # Correction in following two lines due to Tore Wentzel-Larsen rcr<-Rcond(S);if (is.na(rcr)==1) rcr<-0 if (rcr > rc.tol) Z <- sort.list(mahalanobis(X, apply(Y, 2, mean), S)) } prt.diag("after step 1",h,n,Z) ## ------------------------------------------------------------- ## **** Step 3 **** ## ------------------------------------------------------------- ## **** Compute Di(Cb, Sb) out <- integer(0); new.order <- FALSE while(h < n) { S <- var(Y <- as.matrix(X[Z[1:h],])) if (Rcond(S) > rc.tol) {##-- can it be that this is NEVER true? new.order <- TRUE D <- mahalanobis(X, mY <- apply(Y, 2, mean), S) Z <- sort.list(D) if (D[Z[h + 1]] >= cf*qChi) { ##-- outlier set out <- Z[(h + 1) : n] prt.diag(paste("St.3: Outl. Z[",h+1,":",n,"]",sep=""), h, n, Z)#, do.Z=F) ##cat("D[Z[h + 1]]=",D[Z[h + 1]],"\n") break } } else prt.diag("St.3: SINGULAR D",h,n,Z) h <- h+1 } if(!new.order) {##-- compute D^2[i] prt.diag("End: >> RARE !!! <<< new.order was FALSE\n\t",h,n,Z) mY <- apply(Y, 2, mean) D <- mahalanobis(X, mY, S) } rnam <- dimnames(dst2 <- cbind(MD = MD, rob.D = D/cf))[[1]] rnam <- if(length(rnam)!=n) paste(id) else paste(rnam, format(id), sep=':') dimnames(dst2)[1] <- list(rnam) structure(list(call=call, Xbar=mX, Covariance = Sx, Outliers = sort(out), Mah.outl = sort(mah.out), Cb=mY, Sb=S, D2 = dst2, #Distances = sqrt(dst) cf = cf, qChi = qChi ), class = "Hadi94") }## Hadi94 # Auxiliary function; delete these lines if you do not have access to the Matrix # library, and insert the code for a determinant function if(exists("det", mode="function", where=1)) rm(det)# just in case library(Matrix) #-- defines 'det' det.default <- function(x,...) det(as.Matrix(x),...) Det <- function(..., logarithm = FALSE) { d <- unclass(det(..., logarithm = logarithm))$modulus attr(d,"logarithm") <- NULL d } Rcond<-function(vv) { # inverse conditional number of square matrix vv, # supposed to be symmetric and non-negatively defite, # used in Hadi94. Thanks to Tore Wentzel-Larsen. vv.d<-svd(vv,nu=0,nv=0)$d nn<-length(vv.d) sqrt(as.vector(vv.d[nn]/vv.d[1])) }