This vignette demonstrates a reproducible pattern for working with PubChem data:
# Request four properties for three compounds, each identified by name.
props <- get_properties(
properties = c("MolecularWeight", "MolecularFormula", "XLogP", "TPSA"),
identifier = c("aspirin", "ibuprofen", "caffeine"),
namespace = "name"
)
# Collect the results into one table and print it; the output that follows
# shows one row per requested compound.
# NOTE(review): the exact semantics of `.combine.all` and `.to.data.frame`
# are defined by retrieve() -- confirm against its documentation.
prop_tbl <- retrieve(props, .combine.all = TRUE, .to.data.frame = TRUE)
prop_tbl
#> # A tibble: 3 × 6
#> Identifier CID MolecularFormula MolecularWeight XLogP TPSA
#> <chr> <dbl> <chr> <chr> <dbl> <dbl>
#> 1 aspirin 2244 C9H8O4 180.16 1.2 63.6
#> 2 ibuprofen 3672 C13H18O2 206.28 3.5 37.3
#> 3 caffeine 2519 C8H10N4O2 194.19 -0.1 58.4

# Build a modelling-ready copy of the property table with numeric columns.
# The printed table above shows MolecularWeight as <chr>, so it must be
# converted before numeric use; XLogP and TPSA already print as <dbl>, so
# converting them is a defensive no-op.
model_tbl <- prop_tbl %>%
  mutate(
    MolecularWeight = as.numeric(MolecularWeight),
    XLogP = as.numeric(XLogP),
    TPSA = as.numeric(TPSA)
  )
model_tbl
#> # A tibble: 3 × 6
#> Identifier CID MolecularFormula MolecularWeight XLogP TPSA
#> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 aspirin 2244 C9H8O4 180. 1.2 63.6
#> 2 ibuprofen 3672 C13H18O2 206. 3.5 37.3
#> 3 caffeine 2519 C8H10N4O2 194. -0.1 58.4

# Benchmark thresholds, one named entry per scenario size. The names
# ("10", "1000", "100000") match the `scenario_sizes` passed to
# pc_benchmark_harness() further down.
# NOTE(review): presumably `elapsed_sec` is the maximum acceptable wall time
# in seconds and `failed_chunk_ratio` the maximum acceptable share of failed
# chunks -- confirm against the harness documentation.
thresholds <- list(
  elapsed_sec = c(`10` = 30, `1000` = 300, `100000` = 3600),
  failed_chunk_ratio = c(`10` = 0, `1000` = 0.01, `100000` = 0.05)
)
# Probe function handed to the benchmark harness: issues one uncached request
# for the MolecularWeight property of a compound CID, asking for JSON output.
#
# NOTE(review): `ids` is accepted but never used -- every call requests the
# fixed CID 2244 regardless of the identifiers supplied. Confirm this is
# intentional (e.g. to keep the probe cheap) rather than an oversight.
probe <- function(ids) {
pc_request(
domain = "compound",
namespace = "cid",
identifier = 2244,
operation = "property/MolecularWeight",
output = "JSON",
# Caching is switched off for this request.
cache = FALSE
)
}
# Run the benchmark harness with `probe` over 100000 copies of CID 2244,
# using scenario sizes of 10, 1000 and 100000 identifiers and a chunk size of
# 1000. The thresholds defined earlier are passed in, and a markdown report is
# requested at a path inside the session's temporary directory.
# NOTE(review): how the harness slices `ids` per scenario and applies the
# thresholds is not visible here -- confirm against its documentation.
bench <- pc_benchmark_harness(
fn = probe,
ids = rep(2244, 100000),
scenario_sizes = c(10, 1000, 100000),
chunk_sizes = 1000,
thresholds = thresholds,
report_path = file.path(tempdir(), "pubchemr-benchmark.md"),
report_format = "markdown"
)
bench$summary

The nightly workflow live-pubchem-smoke.yml runs this
harness against live PubChem, publishes artifacts, and maintains
calibrated threshold recommendations from rolling history.
For high-throughput workflows, combine this pattern with deterministic identifier sets and saved intermediate outputs for reproducibility.