Reproducible Cached Retrieval and Batching

This vignette shows reproducible high-throughput retrieval using:

  • pc_config() for centralized policy.
  • pc_request(..., cache = TRUE) for deterministic caching.
  • pc_batch() for chunked execution.
library(PubChemR)
library(tibble)
library(dplyr)

Configure defaults

# Set the request policy in one place. Only three settings are passed; the
# returned list also carries settings that were not supplied here (for
# example pause_base, cache_dir and cache_ttl).
# NOTE(review): rate_limit is presumably requests per second and timeout
# presumably seconds -- confirm against the pc_config() documentation.
cfg <- pc_config(rate_limit = 5, retries = 3, timeout = 60)

# Auto-print the resulting configuration so the active policy is recorded
# alongside the results.
cfg
#> $rate_limit
#> [1] 5
#> 
#> $timeout
#> [1] 60
#> 
#> $retries
#> [1] 3
#> 
#> $pause_base
#> [1] 1
#> 
#> $pause_cap
#> [1] 8
#> 
#> $user_agent
#> [1] "PubChemR/3.0.0"
#> 
#> $cache_dir
#> [1] "/tmp/RtmpzUrX9P/PubChemR_cache"
#> 
#> $cache_ttl
#> [1] 86400
#> 
#> $offline
#> [1] FALSE

Clear cache for a clean run

# Start from an empty cache so the results below do not depend on entries
# left behind by an earlier run.
# NOTE(review): presumably this clears the directory reported as
# cfg$cache_dir -- confirm against the pc_cache_clear() documentation.
pc_cache_clear()

Cached property retrieval

# Issue the same property request twice. Both calls draw on one shared set
# of arguments, so the second is an exact repeat of the first.
cids <- c(2244, 3672, 2519)
props <- c("MolecularWeight", "XLogP", "TPSA")

first <- pc_property(
  identifier = cids, properties = props, namespace = "cid", cache = TRUE
)
second <- pc_property(
  identifier = cids, properties = props, namespace = "cid", cache = TRUE
)

# The repeat should be answered from the cache: second$from_cache is
# expected to be TRUE.
as_tibble(second)

Chunked batch execution

# Compound names to resolve; they are handed to pc_batch() below.
ids <- c("aspirin", "ibuprofen", "caffeine", "acetaminophen", "naproxen")

# Worker applied by pc_batch(): maps the names it receives to compound CIDs,
# with caching enabled.
map_names_to_cids <- function(chunk_ids) {
  pc_identifier_map(
    identifier = chunk_ids,
    namespace = "name",
    to = "cids",
    domain = "compound",
    cache = TRUE
  )
}

# NOTE(review): chunk_size = 2 presumably splits the five names into chunks
# of at most two -- confirm against the pc_batch() documentation.
batch <- pc_batch(ids = ids, fn = map_names_to_cids, chunk_size = 2)

# Show the batch object as returned, then its tibble form.
batch
as_tibble(batch)