SPsimSeq

There is no individual estimation step using SPsimSeq as the estimation is combined with simulation step.

library(simmethods)
library(SingleCellExperiment)
# Load data
ref_data <- simmethods::data

Simulating datasets using SPsimSeq

  1. Datasets with default parameters
  2. Determin the number of cells and genes
  3. Simulate groups
  4. Simulate batches

Datasets with default parameters

The reference data contains 160 cells and 4000 genes, if we simulate datasets with default parameters and then we will obtain a new data which has the same size as the reference data.

simulate_result <- simmethods::SPsimSeq_simulation(
  ref_data = ref_data,
  other_prior = NULL,
  return_format = "SCE",
  seed = 111
)
# 
# nCells: 160
# nGenes: 4000
# nGroups: 1
# de.prob: 0.1
# fc.group: 2
# nBatches: 1
# Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
SCE_result <- simulate_result[["simulate_result"]]
dim(SCE_result)
# [1] 4000  160
head(colData(SCE_result))
# DataFrame with 6 rows and 3 columns
#         cell_name       group       batch
#       <character> <character> <character>
# Cell1       Cell1      Group1      Batch1
# Cell2       Cell2      Group1      Batch1
# Cell3       Cell3      Group1      Batch1
# Cell4       Cell4      Group1      Batch1
# Cell5       Cell5      Group1      Batch1
# Cell6       Cell6      Group1      Batch1
head(rowData(SCE_result))
# DataFrame with 6 rows and 1 column
#         gene_name
#       <character>
# Gene1       Gene1
# Gene2       Gene2
# Gene3       Gene3
# Gene4       Gene4
# Gene5       Gene5
# Gene6       Gene6

Determin the number of cells and genes

simulate_result <- simmethods::SPsimSeq_simulation(
  ref_data = ref_data,
  other_prior = list(nCells = 500,
                     nGenes = 1000),
  return_format = "SCE",
  seed = 111
)
# nCells: 500
# nGenes: 1000
# nGroups: 1
# de.prob: 0.1
# fc.group: 2
# nBatches: 1
# Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
SCE_result <- simulate_result[["simulate_result"]]
dim(SCE_result)
# [1] 1000  500

Simulate groups

The number of groups simulated by SPsimSeq is determined by the group information used in simulation step. If the group information of two cell states is provided, then the simulated dataset will contain two groups. SPsimSeq also provides other parameters related to DEGs such as the proportion of DEGs (de.prob) and the fold change of DGEs (fc.group).

For demonstration, we will simulate two groups using the learned parameters. We can set de.prob = 0.2 to simulate 20% genes as DEGs.

group_condition <- as.numeric(simmethods::group_condition)
simulate_result <- simmethods::SPsimSeq_simulation(
  ref_data = ref_data,
  other_prior = list(group.condition = group_condition,
                     de.prob = 0.2,
                     fc.group = 4),
  return_format = "list",
  seed = 111
)
# nCells: 160
# nGenes: 4000
# nGroups: 2
# de.prob: 0.2
# fc.group: 4
# nBatches: 1
# Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
# Note: The number of DE genes detected in the source data is 5 and the number of DE genes required to be included in the simulated data is 800. Therefore, candidiate DE genes are sampled with replacement.
col_data <- simulate_result[["simulate_result"]][["col_meta"]]
table(col_data$group)
# 
# Group1 Group2 
#     80     80
## gene information
gene_info <- simulate_result[["simulate_result"]][["row_meta"]]
### the proportion of DEGs
table(gene_info$de_gene)[2]/nrow(gene_info) ## de.prob = 0.2
# yes 
# 0.2

Simulate batches

Users can simulate batches when batch.condition parameter is activated and just input the numeric vectors that specify the batch labels of cells.

set.seed(111)
ref_data <- scater::mockSCE(ncells = 160,
                            ngenes = 4000)
ref_data <- SingleCellExperiment::counts(ref_data)

For demonstration, we will simulate two batches.

simulate_result <- simmethods::SPsimSeq_simulation(
  ref_data = ref_data,
  other_prior = list(batch.condition = sample(1:3, 160, replace = TRUE)),
  return_format = "list",
  seed = 111
)
# nCells: 160
# nGenes: 4000
# nGroups: 1
# de.prob: 0.1
# fc.group: 2
# nBatches: 3
## cell information
col_data <- simulate_result[["simulate_result"]][["col_meta"]]
table(col_data$batch)
# 
# Batch1 Batch2 Batch3 
#     50     58     52