This script reads a GCD Microsoft Access database, in this case GCDv03_Marlon_et_al_2015.mdb, downloaded from https://paleofire.org. These data were the basis of the analyses in Marlon et al. (2016, Biogeosciences). This database includes the results of two queries stored as database “views” (as opposed to tables), one for a list of sites and the other for the data. The views are read directly from the .mdb file. The files have the prefix v3i, this being the i-th (in alphabetical order, i.e. v3a, v3b, …, v3i) version, incorporating successive fix-ups of data problems.
The data query (i.e. the charcoal data for all sites) is parsed into individual “site” .csv files, while also applying various fix-ups, calculating sedimentation rates, and converting, for example, charcoal concentrations to charcoal influx. The result is a set of .csv files that can be examined individually, or processed further by transformation, presampling or binning, and the development of composite curves. While aimed at producing the .csv files, the script also looks for and flags various inconsistencies in the data, such as age reversals or zero or negative sedimentation rates. In addition to the individual .csv files, one per site, the script also generates a “site-query” .csv file listing the information for each site, and a “data-query” .csv file of all of the charcoal data.
The script also rewrites the site list query into a .csv file for convenient sorting into regional subsets.
Set up folder paths and filenames for the various input and output data. Set the path to the Access database (.mdb) file.
dbname <- "GCDv03_Marlon_et_al_2015.mdb"
Create paths and folders for the query files, the individual .csv files, and the sitelist .csv file.
# paths to the data folder and the query output folder
datapath <- "/Projects/GPWG/GPWGv3/GCDv3Data/v3i/"
querypath <- "/Projects/GPWG/GPWGv3/GCDv3Data/v3i/v3i_query/"
# if the query output folder does not exist, create it
dir.create(file.path(querypath), showWarnings=FALSE)
# query file names
querysitename <- "v3i_sites.csv"
querydataname <- "v3i_data.csv"
# path to .csv output
csvpath <- "/Projects/GPWG/GPWGv3/GCDv3Data/v3i/v3i_sites_csv/"
# if output folder does not exist, create it
dir.create(file.path(csvpath), showWarnings=FALSE)
# path to sitelist output
sitelistpath <- "/Projects/GPWG/GPWGv3/GCDv3Data/v3i/v3i_sitelists/"
# if output folder does not exist, create it
dir.create(file.path(sitelistpath), showWarnings=FALSE)
# sitelist output label
sitelistname <- "v3i_all"
Also create a “debug” or log file:
# debug/log file
debugpath <- "/Projects/GPWG/GPWGv3/GCDv3Data/v3i/v3i_debug/"
# if debug folder does not exist, create it
dir.create(file.path(debugpath), showWarnings=FALSE)
debugname <- "mdb-to-csv_debug.txt"
# open the debug/log file
debugfile <- file(paste(debugpath, debugname, sep=""), "w")
Other setup:
# setup
maxsites <- 2000     # maximum number of sites to process
maxsamples <- 9000   # maximum number of samples per site
miss <- -9999.0      # missing-value code
Note that the connection to the particular Access database (GCDv03_Marlon_et_al_2015.mdb) is established externally to R using (on Windows) the Data Sources tool (i.e. Control Panel > Administrative Tools > Data Sources (ODBC)). On the Mac, the Actual ODBC Driver for Access at http://www.actualtech.com/ works, but requires some configuration, as well as compilation of the RODBC package. See the code file mdb-to-csv_osx.R.
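Alternatively, on Windows the connection can be made without defining a DSN by passing a connection string to odbcDriverConnect(). A minimal sketch, assuming the Microsoft Access ODBC driver is installed; the path used here is an assumption based on the folder layout above:
# DSN-less connection on Windows (sketch; assumes the Access ODBC driver is installed)
library(RODBC)
mdbpath <- "e:/Projects/GPWG/GPWGv3/GCDv3Data/v3i/v3i_mdb/GCDv03_Marlon_et_al_2015.mdb"
gcdv3.db <- odbcDriverConnect(paste0("Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=", mdbpath))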
Load the RODBC library and connect to the downloaded database. Get some basic information.
# load RODBC library and connect to the database
library(RODBC)
gcdv3.db <- odbcConnect(dbname)
odbcGetInfo(gcdv3.db)
## DBMS_Name DBMS_Ver Driver_ODBC_Ver
## "ACCESS" "04.00.0000" "03.51"
## Data_Source_Name Driver_Name Driver_Ver
## "GCDv03_Marlon_et_al_2015.mdb" "ACEODBC.DLL" "Microsoft Access database engine"
## ODBC_Ver Server_Name
## "03.80.0000" "ACCESS"
Check that the two queries are present in the database as views, and list their column names. (This is not strictly necessary, because the presence of the appropriate data can be verified simply by opening the database.) The data are not in intrinsic tables in the database, but are defined using two queries, and are externally available as table-like “views”. The names of the queries, ALL_BART_SITES and ALL_BART_DATA, are legacies of the workflow developed for analyzing the first two versions of the database.
# check for existence of database site and data views
sqlTables(gcdv3.db, tableName="ALL_BART_SITES", tableType="VIEW")
## TABLE_CAT TABLE_SCHEM
## 1 e:\\Projects\\GPWG\\GPWGv3\\GCDv3Data\\v3i\\v3i_mdb\\GCDv03_Marlon_et_al_2015.mdb <NA>
## TABLE_NAME TABLE_TYPE REMARKS
## 1 ALL_BART_SITES VIEW <NA>
sqlColumns(gcdv3.db, "ALL_BART_SITES")$COLUMN_NAME
## [1] "ID_SITE" "SITE_NAME" "Expr1" "Expr2" "ELEV"
## [6] "PREF_UNITS" "ID_DEPO_CONTEXT" "ID_STATUS" "1ST_LEVEL"
sqlTables(gcdv3.db, tableName="ALL_BART_DATA", tableType="VIEW")
## TABLE_CAT TABLE_SCHEM
## 1 e:\\Projects\\GPWG\\GPWGv3\\GCDv3Data\\v3i\\v3i_mdb\\GCDv03_Marlon_et_al_2015.mdb <NA>
## TABLE_NAME TABLE_TYPE REMARKS
## 1 ALL_BART_DATA VIEW <NA>
sqlColumns(gcdv3.db, "ALL_BART_DATA")$COLUMN_NAME
## [1] "ID_SITE" "PREF_UNITS" "ID_SAMPLE" "Expr1" "EST_AGE"
## [6] "QUANTITY" "ID_DEPO_CONTEXT" "ID_STATUS" "1ST_LEVEL"
Note that for the particular way the database was constructed, the variables LATITUDE and LONGITUDE in the site query and DEPTH in the data query are not correctly named. Those columns will be renamed after fetching the data.
Use the sqlFetch() function to get the two tables. For the site query, convert SITE_NAME from a factor to a character string. Close the database when done.
# site query
site_query <- sqlFetch(gcdv3.db, "ALL_BART_SITES")
names(site_query)[3] <- "LATITUDE"; names(site_query)[4] <- "LONGITUDE"
site_query$SITE_NAME <- as.character(site_query$SITE_NAME)
head(site_query)
## ID_SITE SITE_NAME LATITUDE LONGITUDE ELEV PREF_UNITS ID_DEPO_CONTEXT ID_STATUS 1ST_LEVEL
## 1 1 Cygnet 44.6628 -110.6161 2530 X125 LASE BART INFL
## 2 2 Slough Creek Pond 44.9183 -110.3467 1884 X125 LASE BART INFL
## 3 3 Burnt Knob 45.7044 -114.9867 2250 X125 LASE BART INFL
## 4 4 Baker 45.8919 -114.2619 2300 X125 LASE BART INFL
## 5 5 Hoodoo 46.3206 -114.6503 1770 X125 LASE BART INFL
## 6 6 Pintlar 45.8406 -113.4403 1921 X125 LASE BART INFL
str(site_query)
## 'data.frame': 736 obs. of 9 variables:
## $ ID_SITE : int 1 2 3 4 5 6 7 8 9 10 ...
## $ SITE_NAME : chr "Cygnet" "Slough Creek Pond" "Burnt Knob" "Baker" ...
## $ LATITUDE : num 44.7 44.9 45.7 45.9 46.3 ...
## $ LONGITUDE : num -111 -110 -115 -114 -115 ...
## $ ELEV : num 2530 1884 2250 2300 1770 ...
## $ PREF_UNITS : Factor w/ 132 levels "%125","%DWT",..: 103 103 103 103 103 103 103 103 103 103 ...
## $ ID_DEPO_CONTEXT: Factor w/ 14 levels "ARCH","BOSE",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ ID_STATUS : Factor w/ 1 level "BART": 1 1 1 1 1 1 1 1 1 1 ...
## $ 1ST_LEVEL : Factor w/ 5 levels "C0P0","CONC",..: 3 3 3 3 3 3 3 3 3 3 ...
#data query
data_query <- sqlFetch(gcdv3.db, "ALL_BART_DATA")
names(data_query)[4] <- "DEPTH"
head(data_query)
## ID_SITE PREF_UNITS ID_SAMPLE DEPTH EST_AGE QUANTITY ID_DEPO_CONTEXT ID_STATUS 1ST_LEVEL
## 1 1 X125 1 0.01 46.36 0.931 LASE BART INFL
## 2 1 X125 22 0.02 92.54 1.248 LASE BART INFL
## 3 1 X125 23 0.03 138.55 1.196 LASE BART INFL
## 4 1 X125 24 0.04 184.37 1.069 LASE BART INFL
## 5 1 X125 25 0.05 230.01 0.853 LASE BART INFL
## 6 1 X125 26 0.06 275.47 0.808 LASE BART INFL
str(data_query)
## 'data.frame': 118832 obs. of 9 variables:
## $ ID_SITE : int 1 1 1 1 1 1 1 1 1 1 ...
## $ PREF_UNITS : Factor w/ 132 levels "%125","%DWT",..: 103 103 103 103 103 103 103 103 103 103 ...
## $ ID_SAMPLE : int 1 22 23 24 25 26 27 28 29 30 ...
## $ DEPTH : num 0.01 0.02 0.03 0.04 0.05 0.06 0.07 0.08 0.09 0.1 ...
## $ EST_AGE : num 46.4 92.5 138.6 184.4 230 ...
## $ QUANTITY : num 0.931 1.248 1.196 1.069 0.853 ...
## $ ID_DEPO_CONTEXT: Factor w/ 14 levels "ARCH","BOSE",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ ID_STATUS : Factor w/ 1 level "BART": 1 1 1 1 1 1 1 1 1 1 ...
## $ 1ST_LEVEL : Factor w/ 5 levels "C0P0","CONC",..: 3 3 3 3 3 3 3 3 3 3 ...
# close the database
odbcClose(gcdv3.db)
Write out two “query” .csv files, one for the sites and one for the data. These may be examined, externally edited, or augmented by additional data.
# site .csv file
sitecsvpath <- "/Projects/GPWG/GPWGv3/GCDv3Data/v3i/v3i_query/"
write.csv(site_query, paste(sitecsvpath, querysitename, sep=""), row.names=FALSE)
# data .csv file
datacsvpath <- "/Projects/GPWG/GPWGv3/GCDv3Data/v3i/v3i_query/"
write.csv(data_query, paste(datacsvpath, querydataname, sep=""), row.names=FALSE)
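In a later session, these query files can be read back in with read.csv(), avoiding the ODBC step entirely; a minimal sketch (note that, depending on the R version, text columns may come back as character rather than factor):
# reread the site and data queries from the .csv files (alternative to refetching via ODBC)
site_query <- read.csv(paste(sitecsvpath, querysitename, sep=""))
data_query <- read.csv(paste(datacsvpath, querydataname, sep=""))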
Also write a .csv file of site locations and site names for easy editing to make site lists for regional queries:
# rewrite sitefile as .csv file for sorting by region and depositional context
sitelist <- data.frame(site_query$ID_SITE, site_query$LATITUDE, site_query$LONGITUDE, site_query$ELEV,
site_query$ID_DEPO_CONTEXT, site_query$SITE_NAME, stringsAsFactors = FALSE)
names(sitelist) <- c("Site_ID", "Lat", "Lon", "Elev", "depo_context", "Site_Name")
head(sitelist)
## Site_ID Lat Lon Elev depo_context Site_Name
## 1 1 44.6628 -110.6161 2530 LASE Cygnet
## 2 2 44.9183 -110.3467 1884 LASE Slough Creek Pond
## 3 3 45.7044 -114.9867 2250 LASE Burnt Knob
## 4 4 45.8919 -114.2619 2300 LASE Baker
## 5 5 46.3206 -114.6503 1770 LASE Hoodoo
## 6 6 45.8406 -113.4403 1921 LASE Pintlar
str(sitelist)
## 'data.frame': 736 obs. of 6 variables:
## $ Site_ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Lat : num 44.7 44.9 45.7 45.9 46.3 ...
## $ Lon : num -111 -110 -115 -114 -115 ...
## $ Elev : num 2530 1884 2250 2300 1770 ...
## $ depo_context: Factor w/ 14 levels "ARCH","BOSE",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ Site_Name : chr "Cygnet" "Slough Creek Pond" "Burnt Knob" "Baker" ...
sitelistfile <- paste(sitelistpath, sitelistname, ".csv", sep="")
write.table(sitelist, sitelistfile, row.names=FALSE, sep=",")
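As an example of how the sitelist can be used to define a regional subset, the following sketch extracts sites within a hypothetical North American latitude-longitude window and writes them to a separate sitelist file; the bounding box and output file name are illustrative only:
# example: subset the sitelist by a (hypothetical) latitude-longitude window
na_sites <- sitelist[sitelist$Lat >= 25.0 & sitelist$Lat <= 70.0 &
  sitelist$Lon >= -170.0 & sitelist$Lon <= -50.0, ]
write.table(na_sites, paste(sitelistpath, "v3i_northamerica.csv", sep=""), row.names=FALSE, sep=",")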
The main part of the script loops over the individual sites and does various checks and calculations, including the calculation of sedimentation rates and deposition times, checks for missing values and for age or depth reversals, and the conversion between charcoal concentration and influx. In the example here, only the first site is processed, although the script as written loops over all sites.
for (j in 1:maxsites) {
Get the data for the j-th site using “square bracket” extraction:
nsamp <- 0
sitedata <- data_query[data_query$ID_SITE == j, ]
nsamp <- length(sitedata$ID_SITE)
The first site, Cygnet L., contains 668 records:
## ID_SITE PREF_UNITS ID_SAMPLE DEPTH EST_AGE QUANTITY ID_DEPO_CONTEXT ID_STATUS 1ST_LEVEL
## 1 1 X125 1 0.01 46.36 0.931 LASE BART INFL
## 2 1 X125 22 0.02 92.54 1.248 LASE BART INFL
## 3 1 X125 23 0.03 138.55 1.196 LASE BART INFL
## 4 1 X125 24 0.04 184.37 1.069 LASE BART INFL
## 5 1 X125 25 0.05 230.01 0.853 LASE BART INFL
## 6 1 X125 26 0.06 275.47 0.808 LASE BART INFL
## ID_SITE PREF_UNITS ID_SAMPLE DEPTH EST_AGE QUANTITY ID_DEPO_CONTEXT ID_STATUS 1ST_LEVEL
## 663 1 X125 691 6.91 16585.48 0.091 LASE BART INFL
## 664 1 X125 692 6.92 16627.40 0.081 LASE BART INFL
## 665 1 X125 693 6.93 16669.50 0.052 LASE BART INFL
## 666 1 X125 694 6.94 16711.76 0.047 LASE BART INFL
## 667 1 X125 695 6.95 16754.19 0.099 LASE BART INFL
## 668 1 X125 696 6.96 16796.80 0.070 LASE BART INFL
Define some local variables, and replace NA’s with the missing-value code.
if (nsamp > 0) {
jchar <- as.character(j)
nsampchar <- as.character(nsamp)
writeLines(paste("Site",jchar,nsampchar,"samples", sep=" "), con = debugfile, sep = "\n")
# local variables
depth <- sitedata$DEPTH; age <- sitedata$EST_AGE; quant <- sitedata$QUANTITY
depth[is.na(depth)] <- miss
age[is.na(age)] <- miss
quant[is.na(quant)] <- miss
thickness <- rep(miss, nsamp); dep_time <- rep(miss, nsamp); sed_rate <- rep(miss, nsamp)
unit_dep_time <- rep(miss, nsamp)
xst_level <- as.character(sitedata[1,9]) # 1ST_LEVEL (type of quantity, e.g. INFL, CONC) for this site
The first (top) and last (bottom) samples require some fixing up in order to obtain the “thickness” values used to calculate sedimentation rate as sample thickness divided by deposition time. Otherwise, thickness, sedimentation rate, and deposition time for each sample are calculated using obvious approaches. Note the inline conversion of depths, recorded in the database in metres, to centimetres, the standard length unit in the units of concentration and influx.
# sed rate and deposition time
# first (top) sample
if (depth[1] != miss && depth[2] != miss) {
thickness[1] <- (depth[2] - depth[1])*100.0 # meters to cm (depth in m, influx and conc in cm)
dep_time[1] <- age[2] - age[1]
if (dep_time[1] > 0.0) sed_rate[1] <- thickness[1]/dep_time[1]
if (sed_rate[1] != miss) unit_dep_time[1] <- 1.0/sed_rate[1]
}
# samples 2 to nsamp-1
for (i in 2:(nsamp-1)) {
if (depth[i] != miss && depth[i+1] != miss) {
thickness[i] <- (depth[i+1] - depth[i])*100.0
dep_time[i] <- ((age[i+1] + age[i])/2.0) - ((age[i] + age[i-1])/2.0)
if (dep_time[i] > 0.0) sed_rate[i] <- thickness[i]/dep_time[i]
if (sed_rate[i] != miss) unit_dep_time[i] <- 1.0/sed_rate[i]
}
}
# last (bottom) sample
if (depth[nsamp-1] != miss && depth[nsamp] != miss) {
thickness[nsamp] <- thickness[nsamp-1] # replicate thickness
dep_time[nsamp] <- age[nsamp] - age[nsamp-1]
sed_rate[nsamp] <- sed_rate[nsamp-1] # replicate sed_rate
unit_dep_time[nsamp] <- unit_dep_time[nsamp-1]
}
## age depth thickness dep_time sed_rate unit_dep_time
## [1,] 46.36 0.01 1 46.180 0.02165440 46.180
## [2,] 92.54 0.02 1 46.095 0.02169433 46.095
## [3,] 138.55 0.03 1 45.915 0.02177937 45.915
## [4,] 184.37 0.04 1 45.730 0.02186748 45.730
## [5,] 230.01 0.05 1 45.550 0.02195390 45.550
## [6,] 275.47 0.06 1 45.375 0.02203857 45.375
## age depth thickness dep_time sed_rate unit_dep_time
## [663,] 16585.48 6.91 1 41.840 0.02390057 41.840
## [664,] 16627.40 6.92 1 42.010 0.02380386 42.010
## [665,] 16669.50 6.93 1 42.180 0.02370792 42.180
## [666,] 16711.76 6.94 1 42.345 0.02361554 42.345
## [667,] 16754.19 6.95 1 42.520 0.02351834 42.520
## [668,] 16796.80 6.96 1 42.610 0.02351834 42.520
It’s the case for this site that samples were taken as contiguous 1-cm slices, which explains why the thickness values are all equal to 1.0 (cm).
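As a quick check on the calculations, the first-sample values in the listing above can be reproduced directly (1 cm of sediment deposited over 46.18 yr):
# verify the first-sample values shown above
thickness[1] / dep_time[1]  # 1.0 / 46.18 = 0.02165440 (sed_rate[1])
1.0 / sed_rate[1]           #             = 46.180     (unit_dep_time[1])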
Count non-missing values:
# counts of non-missing values
depth_count <- 0; age_count <- 0; quant_count <- 0; sed_rate_count <- 0; sed_rate_flag <- 1
depth_count <- sum(depth != miss)
age_count <- sum(age != miss)
quant_count <- sum(quant != miss)
sed_rate_count <- sum(sed_rate != miss)
if (sed_rate_count != nsamp) sed_rate_flag <- 0
## depth_count age_count quant_count sed_rate_count
## [1,] 668 668 668 668
Check for age or depth reversals (sometimes natural, but often related to mechanical and/or transcription errors). Because this is an analysis of a finished or published data set, no warnings should be generated. In practice, for “new” versions of a database, several iterations will be required to clean everything up.
# check for age or depth reversals, and zero or negative sed rates (in nonmissing data)
depth_reversal <- 0; age_reversal <- 0; sed_rate_zeroneg <- 0
for (i in 2:nsamp) {
if (age[i] != miss && age[i-1] != miss && age[i] <= age[i-1]) age_reversal=1
if (depth[i] != miss && depth[i-1] != miss) {
if (depth[i] <= depth[i-1]) depth_reversal=1
}
}
for (i in 2:nsamp) {
if (sed_rate[i] != miss && sed_rate[i] <= 0.0) sed_rate_zeroneg=1
}
## depth_reversal age_reversal sed_rate_zeroneg
## [1,] 0 0 0
Check for various issues like partial records, age reversals, and so on, and write notes into the debug file.
# set and write out various flags
if (depth_count != 0 && depth_count != nsamp) {
writeLines(paste("**** has a missing depth when some are nonmissing", sep=" "), con = debugfile, sep = "\n")
}
if (age_count != 0 && age_count != nsamp) {
writeLines(paste("**** has a missing age when some are nonmissing", sep=" "), con = debugfile, sep = "\n")
}
if (quant_count != 0 && quant_count != nsamp) {
writeLines(paste("**** has a missing quantity when some are nonmissing", sep=" "), con = debugfile, sep = "\n")
}
if (sed_rate_count != 0 && sed_rate_count != nsamp) {
writeLines(paste("**** has a missing sed rate when some are nonmissing", sep=" "), con = debugfile, sep = "\n")
}
if (depth_reversal != 0) {
writeLines(paste("**** has a depth reversal", sep=" "), con = debugfile, sep = "\n")
}
if (age_reversal != 0) {
writeLines(paste("**** has an age reversal", sep=" "), con = debugfile, sep = "\n")
}
if (sed_rate_zeroneg != 0) {
writeLines(paste("**** has zero or negative sed rates", sep=" "), con = debugfile, sep = "\n")
}
The next step is to get alternative quantities for each record, i.e. if the records for a particular site are influx values, get concentrations, and if the records are concentrations, get influx. For charcoal:pollen ratios (C0P0), treat the data as though they were concentrations. Where no translations are possible, simply copy the data.
# alternative quantities
conc <- rep(miss, nsamp); influx <- rep(miss, nsamp)
influx_source <- rep("none", nsamp) ; conc_source <- rep("none", nsamp)
# select case based on xst_level
if (xst_level == "INFL") # adopt influx values as they are, calculate concentration
{
influx <- quant
influx_source <- "data"
if (influx != miss && unit_dep_time != miss && sed_rate != 0.0) {
conc <- influx * unit_dep_time
conc_source <- "calculated from influx "
} else {
conc <- quant
conc_source <- "copied from quant "
}
writeLines("INFL", con = debugfile, sep = "\n")
}
else if (xst_level == "CONC") # calculate influx, adopt conc values as they are
{
conc <- quant
conc_source <- "data"
if (conc != miss && sed_rate != miss && sed_rate != 0.0) {
influx <- quant * sed_rate
influx_source <- "calculated from conc "
} else {
influx <- quant
influx_source <- "copied from quant "
}
writeLines("CONC", con = debugfile, sep = "\n")
}
else if (xst_level == "C0P0") # assume quantity is concentration like
{
conc <- quant
conc_source <- "C0P0"
if (sed_rate != miss && sed_rate != 0.0) {
influx <- quant * sed_rate
influx_source <- "calculated from C0P0 (conc) "
} else {
influx <- quant
influx_source <- "copied from quant "
}
writeLines("C0P0", con = debugfile, sep = "\n")
}
else if (xst_level == "SOIL") # just copy
{
conc <- quant
conc_source <- "copied from quant "
influx <- quant
influx_source <- "copied from quant "
writeLines("SOIL", con = debugfile, sep = "\n")
}
else if (xst_level == "OTHE") # just copy
{
conc <- quant
conc_source <- "copied from quant "
influx <- quant
influx_source <- "copied from quant "
writeLines("OTHE", con = debugfile, sep = "\n")
}
else
{
conc <- quant
conc_source <- "copied from quant "
influx <- quant
influx_source <- "copied from quant "
writeLines("Unknown", con = debugfile, sep = "\n")
}
}
The results look like the following:
## site0001 id_sample depth est_age sed_rate quant conc influx xst_level conc_source
## 1 1 1 0.01 46.36 0.02165440 0.931 42.99358 0.931 INFL calculated from influx
## 2 2 22 0.02 92.54 0.02169433 1.248 57.52656 1.248 INFL calculated from influx
## 3 3 23 0.03 138.55 0.02177937 1.196 54.91434 1.196 INFL calculated from influx
## 4 4 24 0.04 184.37 0.02186748 1.069 48.88537 1.069 INFL calculated from influx
## 5 5 25 0.05 230.01 0.02195390 0.853 38.85415 0.853 INFL calculated from influx
## 6 6 26 0.06 275.47 0.02203857 0.808 36.66300 0.808 INFL calculated from influx
## influx_source
## 1 data
## 2 data
## 3 data
## 4 data
## 5 data
## 6 data
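The concentration values follow directly from the influx values and the unit deposition times; for the first sample, for example:
# concentration for the first sample, calculated from influx
influx[1] * unit_dep_time[1]  # 0.931 * 46.180 = 42.99358 (conc[1])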
Check for and report instances where influx values are 0.0 everywhere.
# check for influx == 0.0 everywhere
nzero <- 0
nzero <- sum(influx != 0.0)  # nzero = number of non-zero influx values
if (nzero == 0) {
writeLines(paste("**** has no non-zero influx values", sep=" "), con = debugfile, sep = "\n")
}
Then, for each site, assemble the appropriate data, and write out a .csv file for the current site:
# .csv out
if (nsamp > 0 && nzero > 0) {
# get siteid string
siteidchar <- as.character(j)
if (j >= 1) siteid <- paste("000", siteidchar, sep="")
if (j >= 10) siteid <- paste("00", siteidchar, sep="")
if (j >= 100) siteid <- paste("0", siteidchar, sep="")
if (j >= 1000) siteid <- paste( siteidchar, sep="")
sitehdr <- paste("site", siteid, sep="")
# assemble output data and write it out
samplenum <- seq(1:nsamp)
outdata <- data.frame(samplenum,sitedata$ID_SAMPLE, depth, age, sed_rate, quant, conc,
influx, xst_level, conc_source, influx_source)
names(outdata) <- c(sitehdr, "id_sample", "depth", "est_age", "sed_rate", "quant", "conc",
"influx", "xst_level", "conc_source", "influx_source" )
csvfile <- paste(csvpath,siteid,"_data.csv", sep="")
write.csv(outdata, csvfile, row.names=FALSE)
}
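The chain of if statements that zero-pads the site number could also be written as a single formatted-print call; an equivalent sketch:
# equivalent zero-padded site id using sprintf()
siteid <- sprintf("%04d", j)
sitehdr <- paste("site", siteid, sep="")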
At the bottom of the loop, after all sites have been processed, close the debug file:
}
close(debugfile)
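After the loop finishes and the debug file has been closed, its contents can be reviewed within R (or in any text editor); a minimal sketch:
# list the contents of the debug/log file
readLines(paste(debugpath, debugname, sep=""))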