# mdb-to-query.R
# directly reads an Access database and creates individual site .csv files
# database name
dbname <- "GCDv03_Marlon_et_al_2015.mdb"
# query label and path to query and query name
datapath <- "/Projects/GPWG/GPWGv3/GCDv3Data/v3i/"
querypath <- "/Projects/GPWG/GPWGv3/GCDv3Data/v3i/v3i_query/"
# if the query output folder does not exist, create it
dir.create(file.path(querypath), showWarnings=FALSE)
# query file names
querysitename <- "v3i_sites.csv"
querydataname <- "v3i_data.csv"
# path to .csv output
csvpath <- "/Projects/GPWG/GPWGv3/GCDv3Data/v3i/v3i_sites_csv/"
# if output folder does not exist, create it
dir.create(file.path(csvpath), showWarnings=FALSE)
# path to sitelist output
sitelistpath <- "/Projects/GPWG/GPWGv3/GCDv3Data/v3i/v3i_sitelists/"
# if output folder does not exist, create it
dir.create(file.path(sitelistpath), showWarnings=FALSE)
# sitelist output label
sitelistname <- "v3i_all"
# debug/log file
debugpath <- "/Projects/GPWG/GPWGv3/GCDv3Data/v3i/v3i_debug/"
# if debug folder does not exist, create it
dir.create(file.path(debugpath), showWarnings=FALSE)
debugname <- "mdb-to-csv_debug.txt"
# open the debug/log file
debugfile <- file(paste(debugpath, debugname, sep=""), "w")
# setup
maxsites <- 2000
maxsamples <- 9000
miss <- -9999.0
# load RODBC library and connect to the database
library(RODBC)
gcdv3.db <- odbcConnect(dbname)
odbcGetInfo(gcdv3.db)
# check for existence of database site and data views
sqlTables(gcdv3.db, tableName="ALL_BART_SITES", tableType="VIEW")
sqlColumns(gcdv3.db, "ALL_BART_SITES")$COLUMN_NAME
sqlTables(gcdv3.db, tableName="ALL_BART_DATA", tableType="VIEW")
sqlColumns(gcdv3.db, "ALL_BART_DATA")$COLUMN_NAME
# site query
site_query <- sqlFetch(gcdv3.db, "ALL_BART_SITES")
names(site_query)[3] <- "LATITUDE"; names(site_query)[4] <- "LONGITUDE"
site_query$SITE_NAME <- as.character(site_query$SITE_NAME)
head(site_query)
str(site_query)
# data query
data_query <- sqlFetch(gcdv3.db, "ALL_BART_DATA")
names(data_query)[4] <- "DEPTH"
head(data_query)
str(data_query)
# close the database
odbcClose(gcdv3.db)
# site .csv file
sitecsvpath <- "/Projects/GPWG/GPWGv3/GCDv3Data/v3i/v3i_query/"
write.csv(site_query, paste(sitecsvpath, querysitename, sep=""), row.names=FALSE)
# data .csv file
datacsvpath <- "/Projects/GPWG/GPWGv3/GCDv3Data/v3i/v3i_query/"
write.csv(data_query, paste(datacsvpath, querydataname, sep=""), row.names=FALSE)
# rewrite sitefile as .csv file for sorting by region and depositional context
sitelist <- data.frame(site_query$ID_SITE, site_query$LATITUDE, site_query$LONGITUDE, site_query$ELEV,
site_query$ID_DEPO_CONTEXT, site_query$SITE_NAME, stringsAsFactors = FALSE)
names(sitelist) <- c("Site_ID", "Lat", "Lon", "Elev", "depo_context", "Site_Name")
head(sitelist)
str(sitelist)
sitelistfile <- paste(sitelistpath, sitelistname, ".csv", sep="")
write.table(sitelist, sitelistfile, row.names=FALSE, sep=",")
# loop over sites
for (j in 1:maxsites) {
nsamp <- 0
sitedata <- data_query[data_query$ID_SITE == j, ]
nsamp <- length(sitedata$ID_SITE)
head(sitedata)
tail(sitedata)
# local variables
if (nsamp > 0) {
jchar <- as.character(j)
nsampchar <- as.character(nsamp)
writeLines(paste("Site",jchar,nsampchar,"samples", sep=" "), con = debugfile, sep = "\n")
# local variables
depth <- sitedata$DEPTH; age <- sitedata$EST_AGE; quant <- sitedata$QUANTITY
depth[is.na(depth)] <- miss
age[is.na(age)] <- miss
quant[is.na(quant)] <- miss
thickness <- rep(miss, nsamp); dep_time <- rep(miss, nsamp); sed_rate <- rep(miss, nsamp)
unit_dep_time <- rep(miss, nsamp)
xst_level <- as.character(sitedata[1,9])
# sed rate and deposition time
# first (top) sample
if (depth[1] != miss && depth[2] != miss) {
thickness[1] <- (depth[2] - depth[1])*100.0 # meters to cm (depth in m, influx and conc in cm)
dep_time[1] <- age[2] - age[1]
if (dep_time[1] > 0.0) sed_rate[1] <- thickness[1]/dep_time[1]
if (sed_rate[1] != miss) unit_dep_time[1] <- 1.0/sed_rate[1]
}
# samples 2 to nsamp-1
for (i in 2:(nsamp-1)) {
if (depth[1] != miss && depth[2] != miss) {
thickness[i] <- (depth[i+1] - depth[i])*100.0
dep_time[i] <- ((age[i+1] + age[i])/2.0) - ((age[i] + age[i-1])/2.0)
if (dep_time[i] > 0.0) sed_rate[i] <- thickness[i]/dep_time[i]
if (sed_rate[i] != miss) unit_dep_time[i] <- 1.0/sed_rate[i]
}
}
# last (bottom) sample
if (depth[nsamp-1] != miss && depth[nsamp] != miss) {
thickness[nsamp] <- thickness[nsamp-1] # replicate thickness
dep_time[nsamp] <- age[nsamp] - age[nsamp-1]
sed_rate[nsamp] <- sed_rate[nsamp-1] # replicate sed_rate
unit_dep_time[nsamp] <- unit_dep_time[nsamp-1]
}
# counts of missing values
depth_count <- 0; age_count <- 0; quant_count <- 0; sed_rate_count <- 0; sed_rate_flag <- 1
depth_count <- sum(depth != miss)
age_count <- sum(age != miss)
quant_count <- sum(quant != miss)
sed_rate_count <- sum(sed_rate != miss)
if (sed_rate_count != nsamp) sed_rateflag = 0
# check for age or depth reversals, and zero or negative sed rates (in nonmissing data)
depth_reversal <- 0; age_reversal <- 0; sed_rate_zeroneg <- 0
for (i in 2:nsamp) {
if (age[i] != miss && age[i-1] != miss && age[i] <= age[i-1]) age_reversal=1
if (depth[i] != miss && depth[i-1] != miss) {
if (depth[i] <= depth[i-1]) depth_reversal=1
}
}
for (i in 2:nsamp) {
if (sed_rate[i] != miss && sed_rate[i] <= 0.0) sed_rate_zeroneg=1
}
# set and write out various flags
if (depth_count != 0 && depth_count != nsamp) {
writeLines(paste("**** has a missing depth when some are nonmissing", sep=" "), con = debugfile, sep = "\n")
}
if (age_count != 0 && age_count != nsamp) {
writeLines(paste("**** has a missing age when some are nonmissing", sep=" "), con = debugfile, sep = "\n")
}
if (quant_count != 0 && quant_count != nsamp) {
writeLines(paste("**** has a missing quantity when some are nonmissing", sep=" "), con = debugfile, sep = "\n")
}
if (sed_rate_count != 0 && sed_rate_count != nsamp) {
writeLines(paste("**** has a missing sed rate when some are nonmissing", sep=" "), con = debugfile, sep = "\n")
}
if (depth_reversal != 0) {
writeLines(paste("**** has a depth reversal", sep=" "), con = debugfile, sep = "\n")
}
if (age_reversal != 0) {
writeLines(paste("**** has an age reversal", sep=" "), con = debugfile, sep = "\n")
}
if (sed_rate_zeroneg != 0) {
writeLines(paste("**** has zero or negative sed rates", sep=" "), con = debugfile, sep = "\n")
}
# alternative quantities
conc <- rep(miss, nsamp); influx <- rep(miss, nsamp)
influx_source <- rep("none", nsamp) ; conc_source <- rep("none", nsamp)
# select case based on xst_level
if (xst_level == "INFL") # adopt influx values as they are, calculate concentration
{
influx <- quant
influx_source <- "data"
if (influx != miss && unit_dep_time != miss && sed_rate != 0.0) {
conc <- influx * unit_dep_time
conc_source <- "calculated from influx "
} else {
conc <- quant
conc_source <- "copied from quant "
}
writeLines("INFL", con = debugfile, sep = "\n")
}
else if (xst_level == "CONC") # calculate influx, adopt conc values as they are
{
conc <- quant
conc_source <- "data"
if (conc != miss && sed_rate != miss && sed_rate != 0.0) {
influx <- quant * sed_rate
influx_source <- "calculated from conc "
} else {
influx <- quant
influx_source <- "copied from quant "
}
writeLines("CONC", con = debugfile, sep = "\n")
}
else if (xst_level == "C0P0") # assume quantity is concentration like
{
conc <- quant
conc_source <- "C0P0"
if (sed_rate != miss && sed_rate != 0.0) {
influx <- quant * sed_rate
influx_source <- "calculated from C0P0 (conc) "
} else {
influx <- quant
influx_source <- "copied from quant "
}
writeLines("C0P0", con = debugfile, sep = "\n")
}
else if (xst_level == "SOIL") # just copy
{
conc <- quant
conc_source <- "copied from quant "
influx <- quant
influx_source <- "copied from quant "
writeLines("SOIL", con = debugfile, sep = "\n")
}
else if (xst_level == "OTHE") # just copy
{
conc <- quant
conc_source <- "copied from quant "
influx <- quant
influx_source <- "copied from quant "
writeLines("OTHE", con = debugfile, sep = "\n")
}
else
{
conc <- quant
conc_source <- "copied from quant "
influx <- quant
influx_source <- "copied from quant "
writeLines("Unknown", con = debugfile, sep = "\n")
}
}
# check for influx == 0.0 everywhere
nzero <- 0
nzero <- sum(influx != 0.0)
if (nzero == 0) {
writeLines(paste("**** has no non-zero influx values", sep=" "), con = debugfile, sep = "\n")
}
# .csv out
if (nsamp > 0 && nzero > 0) {
# get siteid string
siteidchar <- as.character(j)
if (j >= 1) siteid <- paste("000", siteidchar, sep="")
if (j >= 10) siteid <- paste("00", siteidchar, sep="")
if (j >= 100) siteid <- paste("0", siteidchar, sep="")
if (j >= 1000) siteid <- paste( siteidchar, sep="")
sitehdr <- paste("site", siteid, sep="")
# assemble output data and write it out
samplenum <- seq(1:nsamp)
outdata <- data.frame(samplenum,sitedata$ID_SAMPLE, depth, age, sed_rate, quant, conc,
influx, xst_level, conc_source, influx_source)
names(outdata) <- c(sitehdr, "id_sample", "depth", "est_age", "sed_rate", "quant", "conc",
"influx", "xst_level", "conc_source", "influx_source" )
csvfile <- paste(csvpath,siteid,"_data.csv", sep="")
write.csv(outdata, csvfile, row.names=FALSE)
}
}
close(debugfile)