This vignette covers advanced
features that are not the focus of the legacy get_*/PUG
REST/PUG View tutorials. It focuses on resilient, typed, and
high-throughput pc_* workflows:
library(PubChemR)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tibble)

The pc_* layer returns structured objects
(PubChemResult, PubChemBatchResult,
PubChemBenchmarkReport) with explicit
success/error fields.
# Typed result built from a fixed JSON string held locally; the `request`
# list carries the query fields passed alongside the payload.
ok_text <- '{"PropertyTable":{"Properties":[{"CID":2244,"MolecularWeight":180.16}]}}'
res <- pc_response(
  ok_text,
  request = list(
    domain = "compound",
    namespace = "cid",
    identifier = 2244,
    operation = "property/MolecularWeight",
    output = "JSON"
  )
)

# Explicit success flag on the result object.
res$success
#> [1] TRUE

# Tabular view: status columns followed by the payload columns.
as_tibble(res)
#> # A tibble: 1 × 6
#> success status from_cache pending CID MolecularWeight
#> <lgl> <int> <lgl> <lgl> <dbl> <dbl>
#> 1 TRUE NA FALSE FALSE 2244 180.

You can also normalize raw payload text into the same typed structure:

Use pc_profile() for common policies, then override with
pc_config().
# Show the settings associated with the "default" profile.
pc_profile("default")
#> $rate_limit
#> [1] 5
#>
#> $timeout
#> [1] 60
#>
#> $retries
#> [1] 3
#>
#> $pause_base
#> [1] 1
#>
#> $pause_cap
#> [1] 8
#>
#> $user_agent
#> [1] "PubChemR/3.0.0"
#>
#> $cache_dir
#> [1] "/tmp/RtmpzUrX9P/PubChemR_cache"
#>
#> $cache_ttl
#> [1] 86400
#>
#> $offline
#> [1] FALSE

# Set individual fields explicitly and keep the returned configuration.
cfg <- pc_config(
  rate_limit = 5,
  timeout = 60,
  retries = 3,
  pause_base = 1,
  pause_cap = 8,
  cache_ttl = 24 * 60 * 60  # 86400; one day if the TTL is in seconds
)
cfg
#> $rate_limit
#> [1] 5
#>
#> $timeout
#> [1] 60
#>
#> $retries
#> [1] 3
#>
#> $pause_base
#> [1] 1
#>
#> $pause_cap
#> [1] 8
#>
#> $user_agent
#> [1] "PubChemR/3.0.0"
#>
#> $cache_dir
#> [1] "/tmp/RtmpzUrX9P/PubChemR_cache"
#>
#> $cache_ttl
#> [1] 86400
#>
#> $offline
#> [1] FALSE

Input validation is strict for chunk sizes, worker counts, and numeric controls.

pc_request() is the generic entry point. Wrappers
provide typed intent.
# Generic entry point: every part of the query is spelled out.
r0 <- pc_request(
  domain = "compound",
  namespace = "cid",
  identifier = 2244,
  operation = "property/MolecularWeight,XLogP",
  output = "JSON"
)

# Domain-oriented wrappers: one call per kind of query.
cmp <- pc_compound(identifier = 2244, namespace = "cid", operation = "record")
# NOTE(review): substance records are usually addressed by SID; confirm that
# namespace = "cid" is intended for pc_substance().
sub <- pc_substance(identifier = 2244, namespace = "cid", operation = "record")
asy <- pc_assay(identifier = 1000, namespace = "aid", operation = "summary")
prp <- pc_property(
  identifier = c(2244, 3672),
  properties = c("MolecularWeight", "XLogP")
)
idm <- pc_identifier_map(
  identifier = "aspirin",
  namespace = "name",
  to = "cids"
)

# Tabular view of the property result.
as_tibble(prp)
as_tibble(idm)

PubChemR supports memory+disk caching, diagnostics, and cache-only replay mode.
# Clear the cache first so the two identical calls below can be compared.
pc_cache_clear()

# First of two identical cached requests.
first <- pc_property(
  identifier = 2244,
  properties = c("MolecularWeight", "XLogP"),
  namespace = "cid",
  cache = TRUE
)

# Same request again; `from_cache` below reports whether it was a cache hit.
second <- pc_property(
  identifier = 2244,
  properties = c("MolecularWeight", "XLogP"),
  namespace = "cid",
  cache = TRUE
)
second$from_cache
pc_cache_info()

Offline replay mode is useful for deterministic pipelines:

(pc_submit -> pc_poll -> pc_collect)

For asynchronous queries, use:
# Submit a similarity query for "aspirin"; `q` is the handle passed to
# pc_poll() / pc_collect() below.
q <- pc_submit(
  domain = "compound",
  namespace = "name",
  identifier = "aspirin",
  searchtype = "similarity",
  operation = "cids",
  output = "JSON",
  options = list(Threshold = 95, MaxRecords = 200)
)

# Option A: explicit polling
final_a <- pc_poll(q, interval = 1.5, max_attempts = 20)

# Option B: convenience collector
final_b <- pc_collect(q, interval = 1.5, max_attempts = 20)
as_tibble(final_b)

pc_batch() supports chunking, optional parallelism, and
resumable checkpoints.
# Five compound names, processed two per chunk, with checkpoint files kept
# under the session's temporary directory.
ids <- c("aspirin", "ibuprofen", "caffeine", "acetaminophen", "naproxen")
cp_dir <- file.path(tempdir(), "pc_batch_checkpoint")

batch <- pc_batch(
  ids = ids,
  # Called with one chunk of names; maps that chunk to CIDs.
  fn = function(chunk_ids) {
    pc_identifier_map(
      identifier = chunk_ids,
      namespace = "name",
      to = "cids",
      domain = "compound",
      cache = TRUE
    )
  },
  chunk_size = 2,
  checkpoint_dir = cp_dir,
  checkpoint_id = "name_to_cid_demo"
)
batch
as_tibble(batch)

Resume from existing checkpoint state:
pc_benchmark() compares chunk and parallel
scenarios.
pc_benchmark_harness() scales this into scenario gates
(10/1k/100k by default).
# Benchmark workload: request the MolecularWeight property as JSON for the
# given compound CIDs, with caching switched off.
probe <- function(ids) {
  pc_request(
    domain = "compound",
    namespace = "cid",
    identifier = ids,
    operation = "property/MolecularWeight",
    output = "JSON",
    cache = FALSE
  )
}
# Benchmark `probe` on 200 repeats of CID 2244 across three chunk sizes,
# with FALSE as the only parallel option.
bm <- pc_benchmark(
  ids = rep(2244, 200),
  fn = probe,
  chunk_sizes = c(25, 50, 100),
  parallel_options = c(FALSE)
)
bm

# Limits handed to the harness. The names match the `scenario_sizes` values
# below (presumably looked up per scenario; confirm against the
# pc_benchmark_harness() documentation).
thresholds <- list(
  elapsed_sec = c(`10` = 30, `1000` = 300, `100000` = 3600),
  failed_chunk_ratio = c(`10` = 0, `1000` = 0.01, `100000` = 0.05)
)

# Run the harness at three scenario sizes and write a markdown report into
# the session's temporary directory.
h <- pc_benchmark_harness(
  fn = probe,
  ids = rep(2244, 100000),
  scenario_sizes = c(10L, 1000L, 100000L),
  chunk_sizes = c(100L, 1000L),
  thresholds = thresholds,
  report_path = file.path(tempdir(), "pubchemr-benchmark.md"),
  report_format = "markdown"
)
# Summary component of the harness result.
h$summary

# Similarity search seeded with a SMILES string, at threshold 90 and at most
# 200 records.
sim <- pc_similarity_search(
  identifier = "CC(=O)OC1=CC=CC=C1C(=O)O",
  namespace = "smiles",
  threshold = 90,
  max_records = 200,
  cache = TRUE
)

# One row per distinct, non-missing CID, stored as character.
sim_tbl <- as_tibble(sim) %>%
  filter(!is.na(CID)) %>%
  mutate(CID = as.character(CID)) %>%
  distinct(CID)
# Assay activity for the CIDs found above, requested 25 identifiers per chunk.
assay_long <- pc_assay_activity_long(
  identifier = sim_tbl$CID,
  namespace = "cid",
  chunk_size = 25,
  cache = TRUE
)

pc_assay_activity_long() now fails explicitly if any
chunk fails, instead of silently dropping failed chunks.
# Activity matrix from the long assay table, returned as a tibble.
# Both calls use the same columns, "max" aggregation and NA fill; only
# `output` differs.
activity_dense <- pc_activity_matrix(
  assay_long,
  cid_col = "CID",
  aid_col = "AID",
  outcome_col = "ActivityOutcome",
  aggregate = "max",
  fill = NA_real_,
  output = "tibble"
)

# Same matrix, requested with output = "sparse".
activity_sparse <- pc_activity_matrix(
  assay_long,
  cid_col = "CID",
  aid_col = "AID",
  outcome_col = "ActivityOutcome",
  aggregate = "max",
  fill = NA_real_,
  output = "sparse"
)
# Print the sparse activity matrix.
activity_sparse

# Property table for every distinct CID in the assay data.
feat <- pc_feature_table(
  identifier = unique(assay_long$CID),
  properties = c("MolecularWeight", "XLogP", "TPSA"),
  namespace = "cid",
  cache = TRUE
)

# Combine compound features with the CID/AID/outcome columns of the assays.
joined <- pc_cross_domain_join(
  compounds = feat,
  assays = assay_long %>% select(CID, AID, ActivityOutcome)
)

# Model matrix from the features: no outcome column, CID as the id column,
# NA filled with 0, scaling enabled.
mm <- pc_model_matrix(
  x = feat,
  outcome = NULL,
  id_cols = c("CID"),
  na_fill = 0,
  scale = TRUE
)

# Write the model data as CSV into the session's temporary directory.
pc_export_model_data(
  mm,
  path = file.path(tempdir(), "pubchemr_model.csv"),
  format = "csv"
)
pc_export_model_data(mm, path = file.path(tempdir(), "pubchemr_model.rds"), format = "rds")

These helpers require optional packages:

- pc_to_rcdk() requires rcdk
- pc_to_chemminer() requires ChemmineR

Besides generic get_pug_view(),
get_biological_test_results() directly extracts the
"Biological Test Results" section.
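For robust pipelines:

- Set request policies with pc_profile()/pc_config().
- Inspect the cache with pc_cache_info().
- Prefer the typed wrappers (pc_property(), pc_identifier_map(), etc.).
- Run large jobs through pc_batch() with checkpoints.
- Use pc_resume_batch() for interruptions.
- Gate performance with pc_benchmark_harness().
- Export modeling data with pc_export_model_data().
- Use the asynchronous flow for long-running queries (pc_submit ->
  pc_poll -> pc_collect).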