Advanced Next-Generation Workflows in PubChemR

This vignette covers advanced features that are not the focus of the legacy get_*/PUG REST/PUG View tutorials. It focuses on resilient, typed, and high-throughput pc_* workflows:

Typed result contracts and error handling.
Unified request layer and endpoint wrappers.
Caching, offline replay, and runtime policy control.
Async listkey workflows.
Checkpointed batching and resume patterns.
Benchmark harness and threshold-gated performance checks.
Analysis-layer helpers, sparse matrices, and model export.
Optional cheminformatics bridges and lifecycle policy.

library(PubChemR)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tibble)

1) Typed result contract

The pc_* layer returns structured objects (PubChemResult, PubChemBatchResult, PubChemBenchmarkReport) with explicit success/error fields.

# Deterministic local payload -> typed successful result.
# `ok_text` is a literal JSON string holding one property row (CID 2244).
ok_text <- '{"PropertyTable":{"Properties":[{"CID":2244,"MolecularWeight":180.16}]}}'
# NOTE(review): `request` presumably records the query this payload answers;
# confirm against pc_response() documentation.
res <- pc_response(
  ok_text,
  request = list(
    domain = "compound",
    namespace = "cid",
    identifier = 2244,
    operation = "property/MolecularWeight",
    output = "JSON"
  )
)

# Explicit success flag on the typed result.
res$success
#> [1] TRUE
# One-row tibble: status columns followed by the payload's property columns.
as_tibble(res)
#> # A tibble: 1 × 6
#>   success status from_cache pending   CID MolecularWeight
#>   <lgl>    <int> <lgl>      <lgl>   <dbl>           <dbl>
#> 1 TRUE        NA FALSE      FALSE    2244            180.

Fault payloads are normalized into the same typed structure, with success set to FALSE and the fault code exposed under error:

# A literal "Fault" payload carrying a code and a message.
fault_text <- '{"Fault":{"Code":"PUGREST.BadRequest","Message":"Invalid request"}}'
fault_res <- pc_response(fault_text, request = list(url = "https://example.org"))
# The typed result reports failure rather than success.
fault_res$success
#> [1] FALSE
# The payload's "Code" value is available as error$code.
fault_res$error$code
#> [1] "PUGREST.BadRequest"

2) Runtime policy and transport controls

Use pc_profile() for common policies, then override with pc_config().

# Show the settings list of the built-in "default" profile (printed below).
pc_profile("default")
#> $rate_limit
#> [1] 5
#> 
#> $timeout
#> [1] 60
#> 
#> $retries
#> [1] 3
#> 
#> $pause_base
#> [1] 1
#> 
#> $pause_cap
#> [1] 8
#> 
#> $user_agent
#> [1] "PubChemR/3.0.0"
#> 
#> $cache_dir
#> [1] "/tmp/RtmpFNZ8uW/PubChemR_cache"
#> 
#> $cache_ttl
#> [1] 86400
#> 
#> $offline
#> [1] FALSE
# Override individual settings. The values passed here equal those of the
# "default" profile printed above, so the configuration printed below is
# the same as that profile.
cfg <- pc_config(
  rate_limit = 5,
  timeout = 60,
  retries = 3,
  pause_base = 1,
  pause_cap = 8,
  # 24 * 60 * 60 = 86400 (one day, presumably in seconds — confirm the unit).
  cache_ttl = 24 * 60 * 60
)
# Inspect the resulting configuration list.
cfg
#> $rate_limit
#> [1] 5
#> 
#> $timeout
#> [1] 60
#> 
#> $retries
#> [1] 3
#> 
#> $pause_base
#> [1] 1
#> 
#> $pause_cap
#> [1] 8
#> 
#> $user_agent
#> [1] "PubChemR/3.0.0"
#> 
#> $cache_dir
#> [1] "/tmp/RtmpFNZ8uW/PubChemR_cache"
#> 
#> $cache_ttl
#> [1] 86400
#> 
#> $offline
#> [1] FALSE

Input validation is strict for chunk sizes, worker counts, and numeric controls.

# Both calls pass an out-of-range control value and are expected to be
# rejected by input validation. Catch the error and report its message so the
# rejection is visible when the vignette runs, instead of discarding it with
# a silent try().
tryCatch(
  pc_config(rate_limit = 0),
  error = function(e) {
    message("pc_config(rate_limit = 0): ", conditionMessage(e))
  }
)
tryCatch(
  pc_request(cache_ttl = Inf, offline = TRUE),
  error = function(e) {
    message("pc_request(cache_ttl = Inf): ", conditionMessage(e))
  }
)

3) Unified request layer and typed wrappers

pc_request() is the generic entry point. Wrappers provide typed intent.

# Generic request: domain, namespace, identifier, operation and output format
# are all passed explicitly.
r0 <- pc_request(
  domain = "compound",
  namespace = "cid",
  identifier = 2244,
  operation = "property/MolecularWeight,XLogP",
  output = "JSON"
)

# Domain-oriented wrappers: the function name carries the intent.
cmp <- pc_compound(identifier = 2244, namespace = "cid", operation = "record")
# Substance records are keyed by SID in PUG REST; "cid" is a compound-domain
# namespace and is not an accepted substance input.
sub <- pc_substance(identifier = 2244, namespace = "sid", operation = "record")
asy <- pc_assay(identifier = 1000, namespace = "aid", operation = "summary")
prp <- pc_property(
  identifier = c(2244, 3672),
  properties = c("MolecularWeight", "XLogP")
)
idm <- pc_identifier_map(
  identifier = "aspirin",
  namespace = "name",
  to = "cids"
)

# Tabular views of the property and identifier-mapping results.
as_tibble(prp)
as_tibble(idm)

4) Caching and offline replay

PubChemR supports memory+disk caching, diagnostics, and cache-only replay mode.

# Empty the cache before the demonstration.
pc_cache_clear()

# First call with caching enabled.
first <- pc_property(
  identifier = 2244,
  properties = c("MolecularWeight", "XLogP"),
  namespace = "cid",
  cache = TRUE
)

# The identical call repeated; `from_cache` below reports whether this one
# was answered from the cache.
second <- pc_property(
  identifier = 2244,
  properties = c("MolecularWeight", "XLogP"),
  namespace = "cid",
  cache = TRUE
)

second$from_cache
# Cache diagnostics.
pc_cache_info()

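Offline replay mode is useful for deterministic pipelines. A request that is not already in the cache comes back as a typed failure with the code OfflineCacheMiss: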

# Cache-only request (`offline = TRUE`). As the printed values below show, it
# comes back as a typed failure whose error code is "OfflineCacheMiss".
offline_miss <- pc_request(
  domain = "compound",
  namespace = "cid",
  identifier = 2244,
  output = "JSON",
  cache = TRUE,
  offline = TRUE
)

# The miss is reported through the result fields.
offline_miss$success
#> [1] FALSE
offline_miss$error$code
#> [1] "OfflineCacheMiss"

5) Async listkey workflows (pc_submit -> pc_poll -> pc_collect)

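For asynchronous queries, submit the request with pc_submit(), then either poll for the result explicitly with pc_poll() or use the pc_collect() convenience collector: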

# Submit the asynchronous similarity search. PUG REST structure searches take
# a structure input (smiles, inchi, sdf or cid), not a compound name, so the
# query compound is given by CID (2244 is aspirin).
q <- pc_submit(
  domain = "compound",
  namespace = "cid",
  identifier = 2244,
  searchtype = "similarity",
  operation = "cids",
  output = "JSON",
  options = list(Threshold = 95, MaxRecords = 200)
)

# Options A and B are alternatives shown side by side; a real pipeline needs
# only one of them.

# Option A: explicit polling
final_a <- pc_poll(q, interval = 1.5, max_attempts = 20)

# Option B: convenience collector
final_b <- pc_collect(q, interval = 1.5, max_attempts = 20)

# Tabular view of the collected result.
as_tibble(final_b)

6) Checkpointed batching and resume

pc_batch() supports chunking, optional parallelism, and resumable checkpoints.

# Five compound names to map to CIDs.
ids <- c("aspirin", "ibuprofen", "caffeine", "acetaminophen", "naproxen")
# Checkpoint files go under the session temp directory.
cp_dir <- file.path(tempdir(), "pc_batch_checkpoint")

# `fn` receives the identifiers of one chunk (`chunk_ids`) and maps those
# names to CIDs. `checkpoint_dir` and `checkpoint_id` identify the saved
# state that the resume example further down refers to.
batch <- pc_batch(
  ids = ids,
  fn = function(chunk_ids) {
    pc_identifier_map(
      identifier = chunk_ids,
      namespace = "name",
      to = "cids",
      domain = "compound",
      cache = TRUE
    )
  },
  chunk_size = 2,
  checkpoint_dir = cp_dir,
  checkpoint_id = "name_to_cid_demo"
)

# Print the batch result, then view it as a tibble.
batch
as_tibble(batch)

Resume from existing checkpoint state:

# Resume using the same `checkpoint_dir` and `checkpoint_id` that were passed
# to pc_batch(), with the same per-chunk function.
# NOTE(review): no `ids` are passed here, so they are presumably read back
# from the checkpoint — confirm.
resumed <- pc_resume_batch(
  fn = function(chunk_ids) {
    pc_identifier_map(
      identifier = chunk_ids,
      namespace = "name",
      to = "cids",
      domain = "compound",
      cache = TRUE
    )
  },
  checkpoint_dir = cp_dir,
  checkpoint_id = "name_to_cid_demo"
)

# Resume indicator stored with the checkpoint information, then the tabular
# view of the resumed batch.
resumed$checkpoint$resumed
as_tibble(resumed)

7) Benchmarking and threshold gates

pc_benchmark() compares chunk and parallel scenarios.
pc_benchmark_harness() scales this into scenario gates (10/1k/100k by default).

# Workload function for the benchmarks: requests MolecularWeight for the
# given compound CIDs.
#
# ids: compound identifiers, forwarded unchanged as `identifier`.
# Returns whatever pc_request() returns for that query.
probe <- function(ids) {
  pc_request(
    domain = "compound",
    namespace = "cid",
    identifier = ids,
    operation = "property/MolecularWeight",
    output = "JSON",
    # Caching is switched off, presumably so timings are not served from the
    # cache — confirm.
    cache = FALSE
  )
}

# Benchmark `probe` over 200 copies of CID 2244 at three chunk sizes, with
# the single parallel option FALSE.
bm <- pc_benchmark(
  ids = rep(2244, times = 200),
  fn = probe,
  chunk_sizes = c(25, 50, 100),
  parallel_options = FALSE
)
# Print the benchmark report.
bm

# Per-scenario gates, keyed by scenario size ("10", "1000", "100000"):
# one limit on elapsed_sec and one on failed_chunk_ratio for each size.
thresholds <- list(
  elapsed_sec = c("10" = 30, "1000" = 300, "100000" = 3600),
  failed_chunk_ratio = c("10" = 0, "1000" = 0.01, "100000" = 0.05)
)

# Run the harness over three scenario sizes and two chunk sizes, gated by
# `thresholds`, and write a markdown report into the session temp directory.
# `ids` supplies 100000 copies of CID 2244, matching the largest scenario.
h <- pc_benchmark_harness(
  fn = probe,
  ids = rep(2244, 100000),
  scenario_sizes = c(10L, 1000L, 100000L),
  chunk_sizes = c(100L, 1000L),
  thresholds = thresholds,
  report_path = file.path(tempdir(), "pubchemr-benchmark.md"),
  report_format = "markdown"
)

# Summary of the harness run.
h$summary

8) Analysis-layer helpers for assay modeling

8.1 Similarity retrieval and long assay activity table

# Similarity search seeded with a SMILES string, threshold 90, at most 200
# records requested.
sim <- pc_similarity_search(
  identifier = "CC(=O)OC1=CC=CC=C1C(=O)O",
  namespace = "smiles",
  threshold = 90,
  max_records = 200,
  cache = TRUE
)

# Reduce the hits to one row per non-missing CID, stored as character.
sim_tbl <- as_tibble(sim) %>%
  filter(!is.na(CID)) %>%
  mutate(CID = as.character(CID)) %>%
  distinct(CID)

# Assay activity for those CIDs, requested with a chunk size of 25.
assay_long <- pc_assay_activity_long(
  identifier = sim_tbl$CID,
  namespace = "cid",
  chunk_size = 25,
  cache = TRUE
)

pc_assay_activity_long() now fails explicitly if any chunk fails, instead of silently dropping failed chunks.

8.2 Activity matrix (dense or sparse)

# Dense form: same inputs as the sparse call below, returned as a tibble.
# NOTE(review): `aggregate = "max"` presumably resolves several outcomes for
# one CID/AID pair by taking the maximum, and `fill` is the value used for
# pairs with no data — confirm.
activity_dense <- pc_activity_matrix(
  assay_long,
  cid_col = "CID",
  aid_col = "AID",
  outcome_col = "ActivityOutcome",
  aggregate = "max",
  fill = NA_real_,
  output = "tibble"
)

# Sparse form: identical arguments except `output = "sparse"`.
activity_sparse <- pc_activity_matrix(
  assay_long,
  cid_col = "CID",
  aid_col = "AID",
  outcome_col = "ActivityOutcome",
  aggregate = "max",
  fill = NA_real_,
  output = "sparse"
)

# Print the sparse result.
activity_sparse

8.3 Feature table, matrix conversion, and export

# Property features for every distinct CID in the assay table.
feat <- pc_feature_table(
  identifier = unique(assay_long$CID),
  properties = c("MolecularWeight", "XLogP", "TPSA"),
  namespace = "cid",
  cache = TRUE
)

# Combine compound features with the CID/AID/ActivityOutcome columns of the
# assay table. `joined` is not used again in this vignette.
joined <- pc_cross_domain_join(
  compounds = feat,
  assays = assay_long %>% select(CID, AID, ActivityOutcome)
)

# Model matrix built from the features alone: no outcome column, CID treated
# as an identifier column, `na_fill = 0`, scaling switched on.
mm <- pc_model_matrix(
  x = feat,
  outcome = NULL,
  id_cols = c("CID"),
  na_fill = 0,
  scale = TRUE
)

# Write the model data to the session temp directory in two formats.
pc_export_model_data(mm, path = file.path(tempdir(), "pubchemr_model.csv"), format = "csv")
pc_export_model_data(mm, path = file.path(tempdir(), "pubchemr_model.rds"), format = "rds")

9) Optional ecosystem bridges

These helpers require optional packages:

pc_to_rcdk() requires rcdk
pc_to_chemminer() requires ChemmineR

# Both calls read SMILES strings from a "CanonicalSMILES" column.
# NOTE(review): `feat` as built in section 8.3 requests only MolecularWeight,
# XLogP and TPSA, so it presumably has no such column; request a SMILES
# property there (or pass a different table) before running this — confirm.
mols <- pc_to_rcdk(feat, smiles_col = "CanonicalSMILES", id_col = "CID")
sdf  <- pc_to_chemminer(feat, smiles_col = "CanonicalSMILES")

10) Targeted PUG View helper for bio test sections

Besides the generic get_pug_view(), get_biological_test_results() directly extracts the "Biological Test Results" section.

# Request the "Biological Test Results" heading for compound identifier
# "2244".
# NOTE(review): `.all = TRUE` presumably returns every matching entry rather
# than only the first — confirm.
bio <- get_biological_test_results(
  identifier = "2244",
  domain = "compound",
  heading = "Biological Test Results",
  .all = TRUE
)

# Print the extracted section.
bio

11) Lifecycle policy (legacy vs nextgen)

# Policy table with one row per stream (legacy, nextgen), printed below.
pc_lifecycle_policy()
#> # A tibble: 2 × 5
#>   stream  stability   support_window deprecation_notice  breaking_change_window
#>   <chr>   <chr>       <chr>          <chr>               <chr>                 
#> 1 legacy  maintenance bugfix-only    >= 1 minor release  major release only    
#> 2 nextgen stable      minor+patch    >= 2 minor releases major release only

12) Recommended production pattern

For robust pipelines:

Set policy with pc_profile()/pc_config().
Enable cache and inspect with pc_cache_info().
Use typed wrappers (pc_property(), pc_identifier_map(), etc.).
Run heavy jobs through pc_batch() with checkpoints.
Use pc_resume_batch() for interruptions.
Gate performance with pc_benchmark_harness().
Export model-ready artifacts with pc_export_model_data().