# presample-bin.R
# presamples, or bins the transformed data into evenly spaced bins with no interpolation  

# paths for input and output .csv files -- modify as appropriate
queryname <- "v3i"
datapath <- "/Projects/GPWG/GPWGv3/GCDv3Data/v3i/"
sitelistpath <- "/Projects/GPWG/GPWGv3/GCDv3Data/v3plus/v3plus_sitelists/"
sitelist <- "v3i_nsa_globe"

## set basename and bin structure
basename <- "zt21k"
targbeg <- -60
targend <- 24000
targstep <- 10

# no changes below here

# various path and filenames
sitelistfile <- paste(sitelistpath,sitelistfile, ".csv", sep="")
transcsvpath <- paste(datapath,queryname,"_trans_csv/",sep="")
presampcsvpath <- paste(datapath,queryname,"_presamp_csv/",sep="")

# if output folder does not exist, create it
dir.create(file.path(datapath, paste(queryname,"_presamp_csv/",sep="")), showWarnings=FALSE)

# bin center (target points) definition
targage <- seq(targbeg, targend, by=targstep)

# read list of sites
ptm <- proc.time()
sites <- read.csv(sitelistfile, stringsAsFactors=FALSE)
nsites <- length(sitelist[,1])
print(nsites)

# main loop
for (j in seq(1,nsites)) {

  # 1. Compose the trans-and-zscore .csv file name
  sitenum <- sites[j,1]
  sitename <- as.character(sites[j,5])
  siteidchar <- as.character(sitenum)
  if (sitenum >= 1) siteid <- paste("000", siteidchar, sep="")
  if (sitenum >= 10) siteid <- paste("00", siteidchar, sep="")
  if (sitenum >= 100) siteid <- paste("0", siteidchar, sep="")
  if (sitenum >= 1000) siteid <- paste(    siteidchar, sep="")
  inputfile <- paste(transcsvpath, siteid, "_trans_influx_",basename,".csv", sep="")
  
  # 2. Read the input data
  sitedata <- read.csv(inputfile)
  nsamp <- length(sitedata$zt)
  
  # 3. Count the number of nonmissing (non-NA) and infinite influx values
  nonmiss <- na.omit(sitedata$zt)
  numnonmiss <- length(nonmiss)
  numinf <- sum(is.infinite(nonmiss))
  numnonmiss; numinf
  
  if (length(nonmiss) > 0 & numinf < numnonmiss) {
  
    # add a column of 1's for counting
    sitedata$one <- rep(1,length(sitedata[,1]))
    
    # 4. Find bin number of each sample
    # this definition of bin number seems to match that implicit in presample.f90
    binnum <- as.integer(ceiling((sitedata$est_age-targbeg-(targstep/2))/targstep))+1
    
    # uncommenting the following reveals how each sample is assigned to a bin
    # head(cbind(sitedata$est_age,sitedata$zt,binnum,targage[binnum]), 20)
    
    # 5. Get average zt values (and average ages) for the data in each bin
    binave <- tapply(sitedata$zt, binnum, mean)
    binaveage <- tapply(sitedata$est_age, binnum, mean)
    bincount <- tapply(sitedata$one, binnum, sum)
    
    # 6. Get bin numbers of each bin that had an average (or a single) value
    binsub <- as.numeric(unlist(dimnames(binave)))  
    
    # 7. Write output
    presampout <- data.frame(cbind(targage[binsub],binave,bincount))
    presampout <- na.omit(presampout)
    colnames(presampout) <- c("age", "zt", "np")
 
    outputfile <- paste(presampcsvpath, siteid, "_presamp_influx_",basename,"_bw",
      as.character(targstep),".csv", sep="")
    write.table(presampout, outputfile, col.names=TRUE, row.names=FALSE, sep=",")
  }
  
}
proc.time() - ptm