# trans-and-zscore.R
# 1-parameter Box-Cox transformation of charcoal quantities for a single site
# (alpha, the shift parameter, is specified; lambda, the power-transformation parameter, is estimated)
# input .csv files should contain at least the variables "est_age" and "influx"
# (plus "id_sample" and "depth", which are copied to the output), identified by labels in a header row
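# for reference, the transformation fitted and applied below (steps 7 and 8) is,
# for the minimax-rescaled influx values y:
#   t(y) = ((y + alpha)^lambda - 1)/lambda   when lambda != 0
#   t(y) = log(y + alpha)                    when lambda == 0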
# paths for input and output .csv files -- modify as appropriate
queryname <- "v3i"
datapath <- "/Projects/GPWG/GPWGv3/GCDv3Data/v3i/"
sitelistpath <- "/Projects/GPWG/GPWGv3/GCDv3Data/v3i/v3i_sitelists/"
sitelist <- "v3i_nsa_globe"
# set base period ages (or read a baseperiod info file)
basebeg <- 200
baseend <- 21000
basename <- "zt21k"
# debug/log file
debugpath <- "/Projects/GPWG/GPWGv3/GCDv3Data/v3i/v3i_debug/"
# if debug folder does not exist, create it
dir.create(file.path(debugpath), showWarnings=FALSE)
debugname <- "trans-and-zscore_debug.txt"
# open the debug/log file
debugfile <- file(paste(debugpath, debugname, sep=""), "w")
# no changes should be needed below this point
# various path and filenames
sitelistfile <- paste(sitelistpath, sitelist, ".csv", sep="")
print(sitelistfile) # echo the site-list file name
sitecsvpath <- paste(datapath,queryname,"_sites_csv/",sep="")
transcsvpath <- paste(datapath,queryname,"_trans_csv/",sep="")
# if output folder does not exist, create it
dir.create(file.path(datapath, paste(queryname,"_trans_csv/",sep="")), showWarnings=FALSE)
statscsvpath <- paste(datapath,queryname,"_stats/",sep="")
# if output folder does not exist, create it
dir.create(file.path(datapath, paste(queryname,"_stats/",sep="")), showWarnings=FALSE)
statsfile <- paste(statscsvpath,queryname,"_",basename,"_stats.csv", sep="")
# read list of sites
ptm <- proc.time()
sites <- read.csv(sitelistfile, stringsAsFactors=FALSE)
nsites <- nrow(sites)
print(nsites)
# storage for statistics
sn_save <- rep(0,nsites); lam_save <- rep(0,nsites); lik_save <- rep(0,nsites)
tmean_save <- rep(0,nsites); tstdev_save <- rep(0,nsites)
# main loop
for (j in seq_len(nsites)) {
# 1. site .csv file name (input file)
sitenum <- sites[j,1]
sitename <- as.character(sites[j,5])
siteidchar <- as.character(sitenum)
siteid <- sprintf("%04d", as.integer(sitenum)) # zero-pad the site number to four digits
inputfile <- paste(sitecsvpath, siteid, "_data.csv", sep="")
print(j); print(sitenum); print(sitename); print(inputfile)
# 2. read the input data
sitedata <- read.csv(inputfile)
nsamp <- nrow(sitedata)
nsampchar <- as.character(nsamp)
writeLines(paste("Site",siteidchar,nsampchar,"samples", sep=" "), con = debugfile, sep = "\n")
# 3. discard samples with missing (-9999) ages
sitedata <- sitedata[sitedata$est_age != -9999,]
# 4. discard samples with ages of -70 or less (i.e. keep only samples with est_age > -70)
sitedata <- sitedata[sitedata$est_age > -70,]
# 5. initial minimax rescaling of data
minimax <- (sitedata$influx-min(sitedata$influx))/(max(sitedata$influx)-min(sitedata$influx))
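# note (not in the original workflow): if influx is constant over the site,
# max - min is zero and minimax is NaN; a guard could log such sites:
# if (max(sitedata$influx) == min(sitedata$influx)) writeLines(paste("Site",
# siteidchar, "has constant influx -- minimax rescaling undefined"), con = debugfile)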
# 6. set `alpha` the Box-Cox transformation shift parameter
alpha <- 0.01 # Box-Cox shift parameter
# alternative alpha: 0.5 times the smallest nonzero value of influx
# alpha <- 0.5*min(sitedata$influx[sitedata$influx != 0])
# 7. maximum likelihood estimation of lambda
# derived from the boxcox() function in the MASS package of Venables and Ripley (as included with R 2.6.1)
npts <- 201 # number of estimates of lambda
y <- minimax+alpha
n <- length(y)
logy <- log(y)
ydot <- exp(mean(logy))
lasave <- numeric(npts)  # candidate values of lambda
liksave <- numeric(npts) # corresponding log-likelihood values
for (i in 1:npts) {
la <- -2.0+(i-1)*(4/(npts-1)) # lambda ranges from -2 to +2
# Box-Cox transform of y; the la = 0 case uses a series expansion of
# ((y^la)-1)/la, which reduces to log(y) exactly at la = 0
if (la != 0.0) {
yt <- (y^la-1)/la
} else {
yt <- logy*(1+(la*logy)/2*(1+(la*logy)/3*(1+(la*logy)/4)))
}
zt <- yt/ydot^(la-1) # scale by the geometric mean term, folding the Jacobian into the likelihood
loglik <- -n/2*log(sum((zt - mean(zt))^2 )) # profile log likelihood (up to a constant)
lasave[i] <- la
liksave[i] <- loglik
}
# save the maximum likelihood value and the associated lambda
maxlh <- liksave[which.max(liksave)]
lafit <- lasave[which.max(liksave)]
print (c(sitenum, maxlh, lafit))
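# optional cross-check (not part of the original workflow): the same profile
# likelihood can be computed with boxcox() from the MASS package; uncomment to compare
# library(MASS)
# bc <- boxcox(lm(y ~ 1), lambda=seq(-2, 2, length.out=npts), plotit=FALSE)
# print(c(bc$x[which.max(bc$y)], lafit)) # the two lambda estimates should agree closely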
# 8. Box-Cox transformation of data
if (lafit == 0.0) tall <- log(y) else tall <- (y^lafit - 1)/lafit
# 9. minimax rescaling
tall <- (tall - min(tall))/(max(tall)-min(tall))
# 10. calculate mean and standard deviation of data over base period
baseidx <- sitedata$est_age >= basebeg & sitedata$est_age <= baseend
tmean <- mean(tall[baseidx])
tstdev <- sd(tall[baseidx])
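# note (not in the original workflow): if no samples fall within the base period,
# tmean and tstdev are NA/NaN and the z-scores below inherit that; a guard could
# log such sites:
# if (!any(baseidx) || is.na(tstdev) || tstdev == 0) writeLines(paste("Site",
# siteidchar, "has no usable base-period samples"), con = debugfile)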
# 11. calculate z-scores
ztrans <- (tall-tmean)/tstdev
# 12. write out transformed data for this site
siteout <- data.frame(site_sample=sitedata[,1], id_sample=sitedata$id_sample,
est_age=sitedata$est_age, depth=sitedata$depth, influx=sitedata$influx,
influxmnx=minimax, tall=tall, zt=ztrans)
outputfile <- paste(transcsvpath, siteid, "_trans_influx_", basename, ".csv", sep="")
write.table(siteout, outputfile, col.names=TRUE, row.names=FALSE, sep=",")
sn_save[j] <- sitenum
lam_save[j] <- lafit
lik_save[j] <- maxlh
tmean_save[j] <- tmean
tstdev_save[j] <- tstdev
}
# write out a file of statistics
stats <- data.frame(site=sn_save, lambda=lam_save, likelihood=lik_save,
mean=tmean_save, stdev=tstdev_save)
write.table(stats, statsfile, col.names=TRUE, row.names=FALSE, sep=",")
proc.time() - ptm # elapsed time for the whole run
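# the script can be run non-interactively with, e.g.:
# Rscript trans-and-zscore.R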