3.2 Main loop
Loop over the individual sites, doing the following:
- compose the site .csv file name
- read the input data
- discard samples with missing ages
- discard samples with ages after 2020 CE
- initial minimax rescaling (for historical reasons) (`minimax`)
- set `alpha`, the Box-Cox transformation shift parameter
- maximum likelihood estimation of `lambda`
- Box-Cox transformation of data (see the formulas after this list)
- minimax rescaling (`tall`)
- calculate mean and standard deviation of data over base period
- calculate z-scores (`ztrans`)
- write out transformed data for this site
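In symbols, with $y$ the minimax-rescaled influx and $\alpha$ the shift parameter set in step 6, the transformation fitted and applied below is

$$
y(\lambda) = \begin{cases} \left((y+\alpha)^{\lambda}-1\right)/\lambda, & \lambda \neq 0, \\ \log(y+\alpha), & \lambda = 0, \end{cases}
$$

and the z-scores of step 11 are $z = (t - \bar{t})/s$, where $\bar{t}$ and $s$ are the mean and standard deviation of the rescaled transformed values over the base period.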
```r
# main loop
for (j in seq(1,nsites)) {
# 1. site .csv file name (input file)
sitenum <- sites[j,1]
sitename <- as.character(sites[j,5])
siteidchar <- as.character(sitenum)
siteid <- sprintf("%04d", sitenum) # zero-pad the site number to four digits
inputfile <- paste(sitecsvpath, siteid, "_data.csv", sep="")
print(j); print(sitenum); print(sitename); print(inputfile)
# 2. read the input data
sitedata <- read.csv(inputfile)
nsamp <- length(sitedata[,1])
nsampchar <- as.character(nsamp)
writeLines(paste("Site",siteidchar,nsampchar,"samples", sep=" "), con = debugfile, sep = "\n")
# 3. discard samples with missing (-9999) ages
sitedata <- sitedata[sitedata$est_age != -9999,]
# 4. discard samples with ages after 2020 CE (est_age <= -70 yr BP)
sitedata <- sitedata[sitedata$est_age > -70,]
# 5. initial minimax rescaling of data
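# (rescales influx to the interval [0,1]; the shift alpha below then keeps values positive for the transformation)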
minimax <- (sitedata$influx-min(sitedata$influx))/(max(sitedata$influx)-min(sitedata$influx))
# 6. set `alpha` the Box-Cox transformation shift parameter
alpha <- 0.01 # Box-Cox shift parameter
# alternative alpha: 0.5 times the smallest nonzero value of influx
# alpha <- 0.5*min(sitedata$influx[sitedata$influx != 0])
# 7. maximum likelihood estimation of lambda
# derived from the boxcox.R function in the Venables and Ripley MASS library included in R 2.6.1
npts <- 201 # number of estimates of lambda
y <- minimax+alpha
n <- length(y)
logy <- log(y)
ydot <- exp(mean(logy))
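# ydot is the geometric mean of y; dividing by ydot^(la-1) below puts the transformed values on a common scale so log-likelihoods are comparable across lambda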
lasave <- matrix(1:npts)
liksave <- matrix(1:npts)
for (i in 1:npts) {
la <- -2.0+(i-1)*(4/(npts-1))
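# Box-Cox transform of y; the series expansion (from the MASS boxcox.R code) equals log(y) when la = 0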
if (la != 0.0) yt <- (y^la-1)/la else yt <- logy*(1+(la*logy)/2*(1+(la*logy)/3*(1+(la*logy)/4)))
zt <- yt/ydot^(la-1)
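# profile log-likelihood of lambda (up to an additive constant)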
loglik <- -n/2*log(sum((zt - mean(zt))^2 ))
lasave[i] <- la
liksave[i] <- loglik
}
# save the maximum likelihood value and the associated lambda
maxlh <- liksave[which.max(liksave)]
lafit <- lasave[which.max(liksave)]
print (c(sitenum, maxlh, lafit))
# 8. Box-Cox transformation of data
if (lafit == 0.0) tall <- log(y) else tall <- (y^lafit - 1)/lafit
# 9. minimax rescaling
tall <- (tall - min(tall))/(max(tall)-min(tall))
# 10. calculate mean and standard deviation of data over base period
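# (basebeg and baseend, in cal yr BP, bound the base period and are assumed to be set earlier in the script)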
tmean <- mean(tall[sitedata$est_age >= basebeg & sitedata$est_age <= baseend])
tstdev <- sd(tall[sitedata$est_age >= basebeg & sitedata$est_age <= baseend])
# 11. calculate z-scores
ztrans <- (tall-tmean)/tstdev
# 12. write out transformed data for this site
siteout <- data.frame(cbind(sitedata[,1], sitedata$id_sample, sitedata$est_age,
sitedata$depth, sitedata$influx, minimax, tall, ztrans))
colnames(siteout) <- c("site_sample", "id_sample", "est_age", "depth", "influx", "influxmnx", "tall", "zt")
outputfile <- paste(transcsvpath, siteid, "_trans_influx_", basename, ".csv", sep="")
write.table(siteout, outputfile, col.names=TRUE, row.names=FALSE, sep=",")
sn_save[j] <- sitenum
lam_save[j] <- lafit
lik_save[j] <- maxlh
tmean_save[j] <- tmean
tstdev_save[j] <- tstdev
}
```
As the loop executes, one block of information for each site will be printed.
```
## [1] 1
## [1] 1
## [1] "Cygnet "
## [1] "/Projects/GPWG/GPWGv3/data/v3plus/v3plus_sites_csv/0001_data.csv "
## [1] 1.0000 -527.6608 0.2200
## ...
```
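The grid search in step 7 can be cross-checked against `MASS::boxcox()`, the function from which the code was derived and which profiles the same log-likelihood. The sketch below is not part of the workflow; it assumes the `minimax` and `alpha` objects for a single site are still in the workspace.

```r
# cross-check (not part of the original workflow): estimate lambda for one
# site with MASS::boxcox(), which profiles the same log-likelihood
library(MASS)
y <- minimax + alpha                            # shifted, minimax-rescaled influx
bc <- boxcox(lm(y ~ 1), lambda = seq(-2, 2, length.out = 201), plotit = FALSE)
lambda_hat <- bc$x[which.max(bc$y)]             # should agree with lafit above
print(lambda_hat)
```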