diff --git a/.RData b/.RData new file mode 100644 index 00000000..06153a80 Binary files /dev/null and b/.RData differ diff --git a/R/02_genericDefinitions.R b/R/02_genericDefinitions.R index af406817..978e6aaa 100644 --- a/R/02_genericDefinitions.R +++ b/R/02_genericDefinitions.R @@ -42,9 +42,8 @@ setGeneric("showMatch", function(x) { #' @param x sites object #' @returns data.frame with record information at sample level #' @export -setGeneric("samples", function(x) { - standardGeneric(f = "samples") - }) +setGeneric("samples", + function(object, chronname=NULL) standardGeneric("samples")) #' @title Obtain speleothems from a record or multiple records. #' @param x sites object diff --git a/R/get_sites.R b/R/get_sites.R index bee10f6a..62323168 100644 --- a/R/get_sites.R +++ b/R/get_sites.R @@ -139,6 +139,9 @@ get_sites.default <- function(...) { cl <- as.list(match.call()) cl[[1]] <- NULL cl <- lapply(cl, eval, envir = parent.frame()) + if ("gpid" %in% names(cl)) { + cl$gpid = paste(cl$gpid,collapse=",") + } if ("siteid" %in% names(cl)) { # redirect to numeric method if ("all_data" %in% names(cl)) { @@ -159,7 +162,7 @@ get_sites.default <- function(...) 
{ on.exit(options(oo)) baseURL <- paste0("data/sites") result <- tryCatch( - parseURL(baseURL, ...), + do.call(parseURL, c(list(baseURL), cl)), error = function(e) { stop("API call failed: ", e$message) NULL diff --git a/R/parse_site.R b/R/parse_site.R index 97c7c09c..bac24b08 100644 --- a/R/parse_site.R +++ b/R/parse_site.R @@ -94,6 +94,7 @@ parse_site <- function(result, verbose = FALSE) { depositionalenvironment = use_na(y$depositionalenvironment, "char"), defaultchronology = use_na(y$defaultchronology, "int"), + notes = use_na(y$notes,"char"), speleothems = speleothems) do.call(build_collunits, cu_l) }) diff --git a/R/samples.R b/R/samples.R index ee1e0e5d..f6c27254 100644 --- a/R/samples.R +++ b/R/samples.R @@ -3,13 +3,10 @@ #' @author Simon Goring \email{goring@wisc.edu} #' @param x sites object #' @description Obtain all samples within a sites object -#' @examples { -#' tryCatch({ +#' @examples \dontrun{ +#' # Get full data download from API and create a long table with samples data. #' dw <- get_downloads(1) #' pollen <- samples(dw) -#' }, error = function(e) { -#' message("Neotoma server not responding. Try again later.") -#' }) #' } #' @importFrom dplyr bind_rows left_join rename mutate #' @importFrom purrr map @@ -17,147 +14,194 @@ #' @md #' @export setMethod(f = "samples", - signature = "sites", - definition = function(x) { - output <- map(x@sites, function(y) samples(y)) %>% - bind_rows() # %>% - # Handle NAs to allow distinct to work properly - #distinct(.data$sampleid, .keep_all = TRUE) - if (nrow(output) == 0) { - warnsite <- sprintf("No assigned samples. 
Did you run get_downloads()?") - warning(warnsite) - } - return(output) - } + signature = "sites", + definition = function(object,chronname = NULL) { + output <- map(object@sites, function(y) samples(y, chronname = chronname)) %>% + bind_rows() # %>% + # Handle NAs to allow distinct to work properly + #distinct(.data$sampleid, .keep_all = TRUE) + if (nrow(output) == 0) { + warnsite <- sprintf("No assigned samples. Did you run get_downloads()?") + warning(warnsite) + } + return(output) + } ) #' @rdname samples #' @export setMethod(f = "samples", - signature = "site", - definition = function(x) { - allids <- getids(x) - siteinfo <- as.data.frame(x) %>% - left_join(allids, by = "siteid") - sampset <- map(x@collunits@collunits, - function(y) samples(y)) %>% - bind_rows() %>% - bind_rows() %>% - left_join(siteinfo, by = "datasetid") %>% - rename(sitenotes = .data$notes) - return(sampset) - } + signature = "site", + definition = function(object,chronname = NULL) { + allids <- getids(object) + siteinfo <- as.data.frame(object) %>% + left_join(allids, by = "siteid") + sampset <- map(object@collunits@collunits, + function(y) samples(y, chronname = chronname)) %>% + bind_rows() %>% + bind_rows() %>% + left_join(siteinfo, by = "datasetid") %>% + rename(sitenotes = .data$notes) + return(sampset) + } ) #' @rdname samples #' @export setMethod(f = "samples", - signature = "collunits", - definition = function(x) { - map(x@collunits, function(x) samples(x)) %>% - bind_rows() - } + signature = "collunits", + definition = function(object, chronname = NULL) { + map(object@collunits, function(object) samples(object, chronname=chronname)) %>% + bind_rows() + } ) #' @rdname samples #' @export +#' setMethod(f = "samples", - signature = "collunit", - definition = function(x) { - precedence <- c("Calendar years BP", - "Calibrated radiocarbon years BP", - "Radiocarbon years BP", "Varve years BP") - ids <- getids(x) - # Check the chronologies to make sure everything is okay: - if 
(length(chronologies(x)) > 0) { - # This pulls the chronology IDs, then applies the Neotoma - # age model precedence (see get_table('agetypes')). - # It returns a value that is larger when your age reporting is - # better. - defaultchron <- map(chronologies(x)@chronologies, - function(y) { - data.frame(chronologyid = y@chronologyid, - isdefault = y@isdefault, - modelagetype = y@modelagetype, - chronologyname = y@chronologyname, - dateprepared = y@dateprepared) - }) %>% - bind_rows() %>% - mutate(modelrank = match(.data$modelagetype, rev(precedence)), - order = .data$isdefault * match(.data$modelagetype, - rev(precedence))) - # Validation of default chrons, we want to check whether there - # exists either multiple default chronologies for the same - # time-frame or, alternately, no default chronology. - all_na <- all(is.na(defaultchron$order)) - max_order <- max(defaultchron$order, na.rm = TRUE) - if (sum(defaultchron$order == max_order, na.rm = TRUE) > 1) { - if (any(is.na(defaultchron$dateprepared))) { - high_chron <- defaultchron$order == max_order - newmax_order <- which.max(defaultchron$chronologyid[high_chron]) - defaultchron$order[high_chron][newmax_order] <- max_order + 1 - } else { - newmax_order <- which.max(defaultchron$dateprepared[ - defaultchron$order == max_order]) - defaultchron$order[defaultchron$order == max_order][ - newmax_order] <- max_order + 1 - } - } - if (all_na == TRUE) { - warnsite <- sprintf("The dataset %s has no default chronologies.", - ids$datasetid[1]) - warning(warnsite) - } else if (sum(defaultchron$order == max_order, na.rm = TRUE) > 1) { - warnsite <- sprintf("The dataset %s has multiple default chronologies. 
- Chronology %s has been used.", ids$datasetid[1], - defaultchron$chronologyid[ - which.max(defaultchron$order)]) - warning(warnsite) - defaultchron <- defaultchron[which.max(defaultchron$order), ] - } else { - defaultchron <- defaultchron[which.max(defaultchron$order), ] - } - } else { - defaultchron <- data.frame(chronologyid = NULL) - } - sampset <- map(datasets(x)@datasets, - function(y) { - dsid <- y$datasetid - allsamp <- - map(y@samples@samples, - function(z) { - whichage <- - which(z@ages$chronologyid == - defaultchron$chronologyid) - if (length(whichage) == 0) { - whichage <- 1 - } - if (dim(z@datum)[1] > 0) { - df <- - data.frame(z@ages[whichage,], - z@datum, - analysisunitid = z@analysisunitid, - sampleanalyst = - toString(unique(unlist( - z@sampleanalyst, - use.names = FALSE))), - sampleid = z@sampleid, - depth = z@depth, - thickness = z@thickness, - samplename = z@samplename, - row.names = NULL) - } else { - df <- data.frame() - } - return(df) + signature = "collunit", + definition = function(object, chronname) { + chron_exists <- FALSE + precedence <- c("Calendar years BP", + "Calibrated radiocarbon years BP", + "Radiocarbon years BP", "Varve years BP") + ids <- getids(object) + # Check the chronologies to make sure everything is okay: + if (length(chronologies(object)) > 0) { + # This pulls the chronology IDs, then applies the Neotoma + # age model precedence (see get_table('agetypes')). + # It returns a value that is larger when your age reporting is + # better. 
+ defaultchron <- map(chronologies(object)@chronologies, + function(y) { + data.frame(chronologyid = y@chronologyid, + isdefault = y@isdefault, + modelagetype = y@modelagetype, + chronologyname = y@chronologyname, + dateprepared = y@dateprepared) + }) %>% + bind_rows() %>% + mutate(modelrank = match(.data$modelagetype, rev(precedence)), + order = .data$isdefault * match(.data$modelagetype, + rev(precedence))) + #print(defaultchron) + #print(unique(defaultchron$modelagetype)) + #print(unique(defaultchron$isdefault)) + # Validation of default chrons, we want to check whether there + # exists either multiple default chronologies for the same + # time-frame or, alternately, no default chronology. + all_na <- all(is.na(defaultchron$order)) + #print(all_na) + #print(defaultchron$order) + max_order <- max(defaultchron$order, na.rm = TRUE) + if (sum(defaultchron$order == max_order, na.rm = TRUE) > 1) { + if (any(is.na(defaultchron$dateprepared))) { + high_chron <- defaultchron$order == max_order + newmax_order <- which.max(defaultchron$chronologyid[high_chron]) + defaultchron$order[high_chron][newmax_order] <- max_order + 1 + } else { + newmax_order <- which.max(defaultchron$dateprepared[ + defaultchron$order == max_order]) + defaultchron$order[defaultchron$order == max_order][ + newmax_order] <- max_order + 1 + } + } + if (all_na == TRUE) { + warnsite <- sprintf("The dataset %s has no default chronologies.", + ids$datasetid[1]) + warning(warnsite) + + } + + chron_exists <- !is.null(chronname) && + chronname %in% defaultchron$chronologyname + + if (all_na == TRUE) { + + warnsite <- sprintf("The dataset %s has no default chronologies.", + ids$datasetid[1]) + warning(warnsite) + + } else if (chron_exists) { + + # chronology explicitly requested and exists + matches <- defaultchron$chronologyname == chronname + + if (sum(matches, na.rm = TRUE) == 1) { + defaultchron <- defaultchron[matches, ] + } + + if (sum(matches, na.rm = TRUE) > 1) { + defaultchron <- 
defaultchron[matches, ][1,] + warnsite <- sprintf( + "The dataset %s has multiple chronologies matching chronname %s. Chronology %s has been used.", + ids$datasetid[1], chronname, defaultchron$chronologyid) + warning(warnsite) + } + + } else { + + # either no chronname OR chronname invalid + defaultchron <- defaultchron[which.max(defaultchron$order), ] + + if (!is.null(chronname)) { + warnsite <- sprintf( + "The dataset %s has no chronology matching chronname %s. Default chronology %s has been used.", + ids$datasetid[1], chronname, defaultchron$chronologyid) + warning(warnsite) + } + + } + } else { + defaultchron <- data.frame(chronologyid = NULL) + } + #print(paste0("default chron: ", defaultchron)) + sampset <- map(datasets(object)@datasets, + function(y) { + dsid <- y$datasetid + allsamp <- + map(y@samples@samples, + function(z) { + whichage <- + which(z@ages$chronologyid == + defaultchron$chronologyid) + if (length(whichage) == 0) { + if (chron_exists) { + whichage <- NA + } else { + whichage <- 1 + } + } + if (dim(z@datum)[1] > 0) { + df <- + data.frame(z@ages[whichage,], + z@datum, + analysisunitid = z@analysisunitid, + sampleanalyst = + toString(unique(unlist( + z@sampleanalyst, + use.names = FALSE))), + sampleid = z@sampleid, + depth = z@depth, + thickness = z@thickness, + samplename = z@samplename, + row.names = NULL) + #print(z$ages) + } else { + df <- data.frame() + } + return(df) + }) %>% + bind_rows() %>% + mutate(datasetid = dsid) + return(allsamp) }) %>% - bind_rows() %>% - mutate(datasetid = dsid) - return(allsamp) - }) %>% - bind_rows() %>% - left_join(as.data.frame(datasets(x)), by = "datasetid") %>% - rename(datasetnotes = .data$notes) - return(sampset) - } + bind_rows() %>% + left_join(as.data.frame(datasets(object)), by = "datasetid") %>% + rename(datasetnotes = .data$notes) + if (!is.null(chronname) && sum(defaultchron$chronologyname == chronname, na.rm = TRUE) != 0) { + sampset = sampset %>% dplyr::filter(!is.na(chronologyid))} + 
return(sampset) + } ) \ No newline at end of file diff --git a/inst/doc/neotoma2-package.html b/inst/doc/neotoma2-package.html deleted file mode 100644 index 2c939b7d..00000000 --- a/inst/doc/neotoma2-package.html +++ /dev/null @@ -1,5198 +0,0 @@ - - - - -
- - - - - - - - - -The Neotoma Paleoecology
-Database is a domain-specific data resource containing millions of
-fossil records from around the globe, covering the last 5.4 million
-years. The neotoma2 R package simplifies some of the data
-structures and concepts to facilitate statistical analysis and
-visualization. Users may wish to gain a deeper understanding of the
-resource itself, or build more complex data objects and relationships.
-For those users a partial list is provided here, including a table of
-code examples focusing on different geographic regions, languages and
-dataset types.
Data in Neotoma is associated with sites, specific locations with -lat/long coordinates. Within a site, there may be one or more collection -units – locations at which samples are physically collected -within the site. For example, an archaeological site -may have one or more collection units, pits within a -broader dig site; a pollen sampling site on a lake may -have multiple collection units – core sites within the -lake basin. Collection units may have higher resolution GPS locations, -but are considered to be part of the broader site. Within a -collection unit, data is collected at various -analysis units from which samples -are obtained.
-Because Neotoma is made up of a number of constituent databases -(e.g., the Indo-Pacific Pollen Database, NANODe, FAUNMAP), a set of -samples associated with a collection -unit are assigned to a single dataset -associated with a particular dataset type (e.g., -pollen, diatom, vertebrate fauna) and constituent -database.
-Researchers often begin by searching for sites within a particular -study area, whether that is defined by geographic or political -boundaries. From there they interrogate the available datasets for their -particular dataset type of interest. When they find records of interest, -they will then often call for the data and associated chronologies.
-The neotoma2 R package is intended to act as the
-intermediary to support these research activities using the Neotoma
-Paleoecology Database. Because R is not a relational database, we needed
-to modify the data structures of the objects. To do this the package
-uses a set of S4 objects to represent different elements within the
-database.
neotoma2 R package, and the way the elements are
-related to one another. Individual boxes represent the major classes
-(sites, site, collectionunits, etc.). Each box then has a list of the
-specific metadata contained within the class, and the variable type
-(e.g., siteid: integer). Below these are the functions that can
-be applied to the object (e.g., [[<-).It is important to note, here and elsewhere: Almost
-everything you will interact with is a sites
-object. A sites object is the general currency of
-this package. sites may have more or less metadata
-associated with them, but they are the primary object, and, as you can
-see in the diagram above, they have the most functions associated with
-them.
The earlier neotoma package tried to use base R as much
-as possible. The neotoma2 package now draws primarily on
-dplyr and purrr packages from the
-tidyverse, and on the sf spatial data package.
-The choice to integrate tidyverse packages was made largely
-because of the current ubiquity of the tidyverse in R
-education.
The highest level object in Neotoma is the site. -Sites have spatial coordinates and, in many cases, additional metadata -related to lake parameters, or other site-specific properties.
-Sites can be searched using the get_sites() function,
-or, can be created using the set_site() function. A single
-site object is a special object in R, that can be combined
-with other sites into a sites object. A sites
-object is effectively a list() of site objects
-with special methods for printing, plotting and exporting
-information.
All sites in Neotoma have a unique numeric identifier. With the
-neotoma2 package you can search for a site using the
-get_sites() function by its unique site id
-(siteid), by name (sitename), by altitude
-(altmin, altmax), by geopolitical name
-(gpid), location (loc) or age bounds.
If we’re looking for a site and we know its specific identifier, we
-can use the simplest implementation of get_sites(). Here we
-are searching for a site (Alexander Lake), where we know that the siteid
-for the record in Neotoma is 24. We can get these siteids
-using the Neotoma
-Explorer web application, or if we have some familiarity with the
-site records already.
# Search for site by a single numeric ID:
-alex <- get_sites(24)
-alex
-#> siteid sitename lat long altitude
-#> 24 Alexander Lake 53.33333 -60.58333 73
-
-# Search for sites with multiple IDs using c():
-multiple_sites <- get_sites(c(24, 47))
-multiple_sites
-#> siteid sitename lat long altitude
-#> 24 Alexander Lake 53.33333 -60.58333 73
-#> 47 Liberty 43.52000 -90.78000 353Once you search for a site, the neotoma2 R package makes
-a call to the Neotoma Database, and returns a structured
-sites object that contains metadata about the sites, and
-some additional metadata about collection units and datasets at those
-sites. This limited metadata helps speed up further searches, but is not
-complete, for the purposes of analysis.
get_sites() call, is a sites object containing
-two individual site objects. Each site object
-contains a collunits object with some limited metadata. The
-top site appears to have two collection units, while the
-lower site has only a single collection unit. Each of the top two
-collection units appear to contain two datasets, while the bottom site
-has only the one collection unit with only one dataset.Often we do not know the particular siteid. If we’re
-looking for a site and we know its name or a part of its name, we can
-search using the function with the sitename argument,
-get_site(sitename = 'XXX'), where 'XXX' is the
-site name. This does not support multiple text strings (i.e., you can’t
-use c()).
alex <- get_sites(sitename = "Alexander Lake")
-alex
-#> siteid sitename lat long altitude
-#> 24 Alexander Lake 53.33333 -60.58333 73Neotoma uses a Postgres Database to manage data. Postgres uses the
-% sign as a general wildcard, so we can use the
-% in the sitename argument operator to help us
-find sites when we’re not sure the exact match. Note that the search is
-case insensitive so a search for alex% or
-Alex% will return the same results.
alex <- get_sites(sitename = 'Alex%')
-alex
-#> siteid sitename lat long altitude
-#> 24 Alexander Lake 53.33333 -60.58333 73
-#> 25 Alexis Lake 52.51667 -57.03333 193
-#> 4478 Alexander [3CN117] 35.25000 -92.61667 180
-#> 26226 Alexandra Lake 43.29030 -74.16966 351Since this new sites object has 4 elements that belong
-to site, we may want to access only one of the objects, or
-sets of metadata from our variable alex.
There are several ways of searching for sites using age parameters. -These are represented below:
-ageof, ageyoung, ageold,
-maxage and minage.We offer several methods of searching because different users have
-different requirements. A user might be only interested in one specific
-point in time in the past, for example the 8.2ka event. In this instance
-they would search get_sites(ageof = 8200). They may want
-sites with records that completely span a time period, for example the
-Atlantic chronozone of the Holocene:
-get_sites(ageyounger = 5000, ageolder = 8000). These sites
-would have samples both within and outside the defined age range, so
-that the user could track change into and out of the time period. A user
-may also be interested in any record within a time bin, regardless of
-whether the site spans that time zone or not. They would query
-get_sites(minage = 5000, maxage = 8000).
We can see how these age bounds differ:
-# Note, we are using the `all_data = TRUE` flag here to avoid the default limit of 25 records, discussed below.
-# Because these queries are searching through every record they are slow and are not
-# run in knitting this vignette.
-get_sites(ageof = 8200, all_data = TRUE) %>% length()
-get_sites(ageyounger = 5000, ageolder = 8000, all_data = TRUE) %>% length()
-get_sites(minage = 5000, maxage = 8000, all_data = TRUE) %>% length()It is possible to pass all parameters (ageof,
-minage, maxage, ageyounger, . . .
-), but it is likely that these will conflict and result in an empty set
-of records. To avoid this, be aware of the relationships among these
-search parameters, and how they might affect your search window.
sites metadataAlthough the sites are structured using S4 objects (see
-Hadley Wickham’s S4
-documentation), we’ve added helper functions to make accessing
-elements easier for users.
The alex object is composed of several smaller objects
-of class site. We can call any individual site using
-[[ ]], placing the index of the desired object between the
-brackets. Then we can also call the particular variable we want using
-the $ symbol.
The elements within a site are the same as the defined
-columns within the Neotoma ndb.sites
-table, with the exception of the collunits slot, which
-contains the collection units and associated datasets that are found
-within a site. You can see all the site slots using the
-names() function. You can select individual elements of a
-site, and you can assign values to these parameters:
names(alex[[1]])
-#> [1] "siteid" "sitename" "geography" "altitude" "geopolitical"
-#> [6] "area" "notes" "description" "collunits"
-
-# Modify a value using $<- assignment:
-alex[[1]]$area
-#> [1] NA
-alex[[1]]$area <- 100
-alex[[1]]$area
-#> [1] 100
-
-# Modify a value using [<- assignment:
-alex[[1]]["area"] <- 30
-alex[[1]]$area
-#> [1] 30
-# alex[[1]][7] <- 30 This fails because the `Notes` field expects a character string.Using assignment, we can add information programmatically, for -example, by working interactively with a digital elevation model or -hydrographic data to obtain lake area measurements. Although not -currently implemented, the goal is to support direct upload of updated -information by users.
-As explained above, a site is the fundamental unit of
-the Neotoma Database. If you are working with your own data, you might
-want to create a site object to allow it to interact with
-other data within Neotoma. You can create a site with the
-set_site() function. It will ask you to provide important
-information such as sitename, lat, and
-long attributes.
my_site <- set_site(sitename = "My Lake",
- geography = st_sf(a = 3, st_sfc(st_point(1:2))),
- description = "my lake",
- altitude = 30)
-my_site
-#> siteid sitename lat long altitude
-#> 35960541 My Lake 2 1 30If we have a set of sites that we are analyzing, we can add the new
-site to the set of sites, either by appending it to the end, using
-c(), or by replacing a particular element using
-[[<-.
This method allows us to begin modifying site information for -existing sites if we have updated knowledge about site properties.
-# Add a new site that's been edited using set_site()
-longer_alex <- c(alex, my_site)
-# Or replace an element within the existing list of sites
-# with the newly created site.
-longer_alex[[2]] <- my_site
-
-# Or append to the `sites` list with assignment:
-longer_alex[[3]] <- my_siteWe can also use set_sites() as a tool to update the
-metadata associated with an existing site object:
If you need to get to a deeper level of the sites object, you may
-want to look at the get_datasets() function. You can use
-get_datasets() using search parameters, or you can use it
-on an existing sites object, such as our prior
-alex dataset.
get_datasets() adds additional metadata to the
-site objects, letting us know which
-datasettypes are associated with a site, and the dataset
-sample locations at the site.
get_datasets() provides more
-complete metadata about a record, including the addition of
-chronological information, and more complete metadata about the
-datasets, compared to the get_sites() call, shown above.
-The objects here are the same as above, but now have chronology
-metadata, and contact metadata for the records. Note that there is still
-no sample or taxonomic information about these records. This comes from
-the get_downloads() function.Getting the datasets by id is the easiest call, you can also pass a
-vector of IDs or, if you already have a sites object, you
-can pass a sites object.
# Getting datasets by ID
-my_datasets <- get_datasets(c(5, 10, 15, 20))
-my_datasets
-#> siteid sitename lat long altitude
-#> 5 17/2 55.25000 -74.93333 300
-#> 10 Site 1 (Cohen unpublished) 30.83000 -82.33000 36
-#> 15 Aguilar -23.83333 -65.75000 3828
-#> 20 Akuvaara 69.12326 27.67406 159You can also retrieve datasets by type directly from the API.
-# Getting datasets by type
-my_pollen_datasets <- get_datasets(datasettype = "pollen", limit = 25)
-my_pollen_datasets
-#> siteid sitename lat long altitude
-#> 7 Three Pines Bog 47.00000 -80.11667 329
-#> 8 Abalone Rocks Marsh 33.95639 -119.97667 9
-#> 9 Adange 43.30556 41.33333 2065
-#> 11 Konus Exposure, Adycha River 67.75000 135.58333 137
-#> 12 Ageröds Mosse 55.93329 13.42559 47
-#> 13 Aguas Calientes -23.08333 -67.40000 4233
-#> 14 Aguas Calientes 2 -23.50000 -67.58333 4198
-#> 15 Aguilar -23.83333 -65.75000 3828
-#> 16 Ahlenmoor 53.69908 8.74688 5
-#> 17 Ajata -18.25000 -69.20000 4773
-#> 18 South Soefje Bog 29.60000 -97.51694 100
-#> 19 Akulinin Exposure P1282 47.11667 138.55000 367
-#> 20 Akuvaara 69.12326 27.67406 159
-#> 21 Alazeya River Exposure, 8 m Terrace 68.50000 154.50000 50
-#> 22 Alazeya River Exposure, 9 m Terrace 64.33333 154.50000 125
-#> 24 Alexander Lake 53.33333 -60.58333 73
-#> 25 Alexis Lake 52.51667 -57.03333 193
-#> 27 Aliuk Pond 54.58333 -57.36667 9
-#> 29 Lake Allie 44.80156 -94.55982 320
-#> 30 Almora Lake 46.20611 -95.29361 437
-#> 31 Alut Lake 60.13667 152.31278 488
-#> 32 Amarete -15.23333 -68.98333 3755
-#> 33 Amba River Exposure 596 43.31667 131.81667 0
-#> 68 Amguema River Valley Exposure 1 67.75000 178.70000 493
-#> 69 Amguema River Valley Exposure 2 67.66667 178.60000 376It can be computationally intensive to obtain the full set of records
-for sites or datasets. By default the
-limit for all queries is 25. The default
-offset is 0. To capture all results we can use
-the all_data = TRUE flag in our calls.
-However, this is hard on the Neotoma servers. We tend
-to prefer that users use all_data = TRUE once their
-analytic workflow is mostly complete.
We can use that all_data = TRUE in R in the following
-way:
allSites_dt <- get_sites(datasettype = "diatom")
-allSites_dt_all <- get_sites(datasettype = "diatom", all_data = TRUE)
-
-# Because we used the `all_data = TRUE` flag, there will be more sites
-# in allSites_dt_all, because it represents all sites containing diatom datasets.
-length(allSites_dt_all) > length(allSites_dt)You can get the coordinates to create a GeoJson bounding box from here, or you can use
-pre-existing objects within R, for example, country-level data within
-the spData package:
Accessing datasets by bounding box:
-brazil <- '{"type": "Polygon",
- "coordinates": [[
- [-73.125, -9.102],
- [-56.953, -33.138],
- [-36.563, -7.711],
- [-68.203, 13.923],
- [-73.125, -9.102]
- ]]}'
-
-# We can make the geojson a spatial object if we want to use the
-# functionality of the `sf` package.
-brazil_sf <- geojsonsf::geojson_sf(brazil)
-
-
-brazil_datasets <- get_datasets(loc = brazil_sf)Now we have an object called brazil_datasets that
-contains 19 datasets.
You can plot these findings!
- - - -Sometimes we take a large number of records, do some analysis, and
-then choose to select a subset. For example, we may want to select all
-sites in a region, and then subset those by dataset type. If we want to
-look at only the geochronological datasets from Brazil, we can start
-with the set of records returned from our get_datasets()
-query, and then use the filter function in
-neotoma2 to select only those datasets that are
-geochronologic:
-brazil_dates <- neotoma2::filter(brazil_datasets,
- datasettype == "geochronologic")
-
-# or:
-brazil_dates <- brazil_datasets %>%
- neotoma2::filter(datasettype == "geochronologic")
-
-# With boolean operators:
-brazil_space <- brazil_datasets %>% neotoma2::filter(lat > -18 & lat < -16)The filter() function takes as the first argument, a
-datasets object, followed by the criteria we want to use to filter.
-Current supported criteria includes:
latlongelevdatasettypeYou also need to make sure that you accompany any of these terms with
-the following boolean operators: <, > or
-==, !=. datasettype has to be of
-type string, while the other terms must be numeric. If you need to
-filter by the same argument, let’s say, you need to filter
-“geochronologic” and “pollen” data types, then you will also make use of
-& and | operators.
Once we have the set of records we wish to examine, we then want to
-recover the actual sample data. This will provide us with information
-about the kinds of elements found at the site, within the dataset, their
-sample ages, and their counts or measurements. To do this we use the
-get_downloads() call. Note, as before, we are returning a
-sites objects, but this time with the most complete
-metadata.
get_downloads() returns a
-sites object, but one that contains dataset objects with
-filled samples slots. The samples slot is
-often very large relative to the other metadata associated with
-sites, and so it is commonly held back until a direct
-request is provided. Helper functions at the sites level
-can pull out sample data once get_downloads()
-has been called.Assuming we continue with our example from Brazil, we want to extract -records from the country, filter to only pollen records with samples -covering the last 10,000 years, and then look at the relative frequency -of taxa across sites. We might do something like this:
-brazil <- '{"type": "Polygon",
- "coordinates": [[
- [-73.125, -9.102],
- [-56.953, -33.138],
- [-36.563, -7.711],
- [-68.203, 13.923],
- [-73.125, -9.102]
- ]]}'
-
-# We can make the geojson a spatial object if we want to use the
-# functionality of the `sf` package.
-brazil_sf <- geojsonsf::geojson_sf(brazil)
-
-brazil_records <- get_datasets(loc = brazil_sf, all_data=TRUE) %>%
- neotoma2::filter(datasettype == "pollen" & age_range_young <= 1000 & age_range_old >= 10000) %>%
- get_downloads()
-
-
-count_by_site <- samples(brazil_records) %>%
- dplyr::filter(elementtype == "pollen" & units == "NISP") %>%
- group_by(siteid, variablename) %>%
- summarise(n = n()) %>%
- group_by(variablename) %>%
- summarise(n = n()) %>%
- arrange(desc(n))
-#> `summarise()` has grouped output by 'siteid'. You can override using the
-#> `.groups` argument.In this code chunk we define the bounding polygon for our sites,
-filter by time and dataset type, and then return the full records for
-those sites. We get a sites object with dataset and sample
-information (because we used get_downloads()). We execute
-the samples() function to extract all the samples from the
-sites objects, and then filter the resulting
-data.frame to pull only pollen (a pollen dataset may
-contain spores and other elements that are not, strictly speaking,
-pollen) that are counted using the number of identified specimens (or
-NISP). We then group_by() the unique site identifiers
-(siteid) and the taxa (variablename) to get a
-count of the number of times each taxon appears in each site. We then
-want to summarize() to a higher level, just trying to
-understand how many sites each taxon appears in. After that we
-arrange() so that the records show the most common taxa
-first in the resulting variable count_by_site.
Many Neotoma records have publications associated with them. The
-publication object (and the publications
-collection) provide the opportunity to do this. The publication
-table in Neotoma contains an extensive number of fields. The methods for
-publications in the neotoma2 package provide
-us with tools to retrieve publication data from Neotoma, to set and
-manipulate publication data locally, and to retrieve publication data
-from external sources (e.g., using a DOI).
get_publications() from NeotomaThe most simple case is a search for a publication based on one or -more publication IDs. Most people do not know the unique publication ID -of individual articles, but this provides a simple method to highlight -the way Neotoma retrieves and presents publication information.
-We can use a single publication ID or multiple IDs. In either case
-the API returns the publication(s) and creates a new
-publications object (which consists of multiple individual
-publications).
From there we can then subset and extract elements from the list
-using the standard [[ format. For example:
Will return the second publication in the list, corresponding to the
-publication with publicationid 14 in this case.
We can also use search elements to search for publications. The
-get_publication method uses the Neotoma API to search for
-publications. We can search using the following properties:
publicationiddatasetidsiteidfamilynamepubtypeyearsearchlimitoffsetThis results in a set of 2 publications from Neotoma, equal to the
-limit. If the number of matching publications is less than
-the limit then the length() will be smaller.
Text matching in Neotoma is approximate, meaning it is a measure of -the overall similarity between the search string and the set of article -titles. This means that using a nonsense string may still return
- -This returns a result set of length 5.
-This returns the (Neotoma) ID, the citation and the publication DOI
-(if that is stored in Neotoma). We can get the first publication using
-the standard [[ nomenclature:
The output will look similar to the output for two
-above, however you will see that only a single result will be returned
-and the class (for a single publication) will be of type
-publication (as opposed to publications).
We can select an array of publication objects using the
-[[ method, either as a sequence (1:10, or as a
-numeric vector (c(1, 2, 3))):
Just as we can use the set_sites() function to set new
-site information, we can also create new publication information using
-set_publications(). With set_publications()
-you can enter as much or as little of the article metadata as you’d
-like, but it’s designed (in part) to use the CrossRef API to return
-information from a DOI.
new_pub <- set_publications(
-articletitle = "Myrtle Lake: a late- and post-glacial pollen diagram from northern Minnesota",
-journal = "Canadian Journal of Botany",
-volume = 46)A publication has a large number of slots that can be
-defined. These may be left blank, they may be set directly after the
-publication is defined: