Setting up toxEval package data

toxEval includes ToxCast data to help simplify EAR calculations. Here are the instructions for getting the necessary columns from the ToxCast database using the tcpl package. This is NOT necessary for all toxEval users. However, if you are interested in extending the data or exploring additional fields, this might be useful. It is also a way to remember how to update the toxEval package when there are new ToxCast database updates.

First you would need to download the database and set up a MySQL server/database. This is beyond the scope of this vigentte right now.

Next, you need to connect to that local database:

library(tcpl)
library(toxEval)
library(tidyverse)


tcplConf(drvr = "MySQL", 
         user = "root",
         pass = "my_super_secret_password",
         host = "localhost", 
         db = "prod_internal_invitrodb_v4_1")

ToxCast_ACC

First, we’ll need to load several tables to get our necessary columns:

mc5_raw <- tcplQuery("SELECT   * FROM mc5")
mc4_raw <- tcplQuery("SELECT   * FROM mc4")
mc5_chid <- tcplQuery("SELECT   * FROM mc5_chid")
mc6_raw <- tcplQuery("SELECT   * FROM mc6")
sample_raw <- tcplQuery("SELECT   * FROM sample")
chemical_raw <- tcplQuery("SELECT   * FROM chemical")
mc5_param <- tcplQuery("SELECT   * FROM mc5_param")

Next, we’ll use dplyr to join:

ToxCast_ACC <- mc5_raw |> 
  filter(hitc >= 0.9) |> 
  select(-created_date, -modified_date, -modified_by) |> 
  left_join(mc4_raw |> 
              select(-created_date, -modified_date, -modified_by),
            by = join_by(m4id, aeid)) |> 
  left_join(mc5_chid, by = join_by(m5id)) |> 
  filter(chid_rep == 1) |> 
  left_join(mc6_raw |> 
              select(-created_date, -modified_date, -modified_by),
            by = join_by(m5id, m4id, aeid)) |> 
  left_join(sample_raw,
            by = join_by(spid)) |> 
  left_join(chemical_raw,
            by = join_by(chid)) |> 
  left_join(mc5_param |> 
              filter(hit_param == "acc"),
            by = join_by(m5id, aeid)) |> 
  select(casn, hit_val, aeid, mc6_mthd_id) |> 
  group_by(casn, hit_val, aeid) |> # 1-to-many flags
  summarise(flags = list(mc6_mthd_id)) |> 
  ungroup()

names(ToxCast_ACC)
[1] "casn"    "hit_val" "aeid"    "flags" 

end_point_info

Next, we’ll populate the end_point_info file:

assay_component_endpoint <- tcplQuery("SELECT * FROM assay_component_endpoint;")
assay <- tcplQuery("SELECT * FROM assay;")
assay_component <- tcplQuery("SELECT * FROM assay_component;")
assay_source <- tcplQuery("SELECT * FROM assay_source;")
gene <- tcplQuery("SELECT * FROM gene;")
intended_target <- tcplQuery("SELECT * FROM intended_target;")

end_point_info_41 <- assay_component_endpoint |> 
  left_join(assay_component, by = join_by(acid)) |> 
  left_join(assay, by = join_by(aid)) |> 
  left_join(assay_source, by = join_by(asid)) |> 
  left_join(intended_target, by = join_by(aeid)) |> 
  left_join(gene, by = c("target_id" = "gene_id")) |> 
  filter(!is.na(aeid)) |> 
  select(aeid, acid, assay_component_endpoint_name, 
         assay_component_endpoint_desc, intended_target_type, 
         intended_target_family_sub, intended_target_family,
         biological_process_target, tissue, 
         gene_symbol, assay_source_name) |> 
  group_by(across(c(-gene_symbol))) |> # 1-to-many genes
  summarise(gene_symbol = paste(gene_symbol, collapse = ", ")) |> 
  ungroup()

tox_chemicals

tox_chemicals41 <- mc5_raw |> 
  select(-created_date, -modified_date, -modified_by) |> 
  left_join(mc4_raw |> 
              select(-created_date, -modified_date, -modified_by),
            by = join_by(m4id, aeid)) |> 
  left_join(mc5_chid, by = join_by(m5id)) |> 
  filter(chid_rep == 1) |> 
  left_join(mc6_raw |> 
              select(-created_date, -modified_date, -modified_by),
            by = join_by(m5id, m4id, aeid)) |> 
  left_join(sample_raw,
            by = join_by(spid)) |> 
  left_join(chemical_raw,
            by = join_by(chid)) |> 
  left_join(mc5_param |> 
              filter(hit_param == "acc"),
            by = join_by(m5id, aeid)) |> 
  group_by(casn, chnm, dsstox_substance_id) |> 
  summarise(Total_tested = length(unique(aeid)),
            Active = length(unique(aeid[hitc >= 0.9]))) |> 
  ungroup() |> 
  left_join(tox_chemicals_35 |> 
              select(casn = Substance_CASRN,
                     Structure_MolWt),
            by = "casn")

We need molecular weights to convert the ACC values to concentrations. We can either join previous versions of the toxEval package, and/or get new values from the CompTox Dashboard via the batch search:

https://comptox.epa.gov/dashboard/batch-search\

To create a list of chemicals to input to the dashboard:

need_mlwt <- tox_chemicals41$dsstox_substance_id[is.na(tox_chemicals41$Structure_MolWt)]
#Bring this to Comptox:
data.table::fwrite(data.frame(need_mlwt), "need_mlwt.csv")  
# Import the file from Comptox:
more_mlwts <- data.table::fread("CCD-Batch-Search.csv",
                                data.table = FALSE) |> 
  mutate(new_Structure_MolWt = as.numeric(AVERAGE_MASS)) |> 
  select(casn = CASRN,
         dsstox_substance_id = DTXSID,
         new_Structure_MolWt)

tox_chemicals41 <- tox_chemicals41 |> 
  left_join(more_mlwts, 
              by = c("casn", "dsstox_substance_id")) |> 
  mutate(Structure_MolWt = if_else(is.na(Structure_MolWt),
                                        new_Structure_MolWt,
                                        Structure_MolWt)) |> 
  select(-new_Structure_MolWt)

Flags

A small table to describe ToxCast flags is also included:

flags <- mc6_raw |> 
  select(flag_id = mc6_mthd_id,
         flag_full = flag) |> 
  distinct()

sysdata.rda

Finally we can save that data in the package:

ToxCast_ACC <- ToxCast_ACC_41
tox_chemicals <- tox_chemicals41
end_point_info <- end_point_info_41_rel

save(ToxCast_ACC, tox_chemicals, end_point_info, flags,
     file = "sysdata.rda", compress = "xz")