iscream compatible data structures

The examples here show how iscream output can be converted into other data structures for further analysis.

library(iscream)

## iscream using 1 thread by default but parallelly::availableCores() detects 4 possibly available threads. See `?set_threads` for information on multithreading before trying to use more.

data_dir <- system.file("extdata", package = "iscream")
bedfiles <- list.files(data_dir, pattern = "[a|b|c|d].bed.gz$", full.names = TRUE)
regions <- c(A = "chr1:1-6", B = "chr1:7-10", C = "chr1:11-14")

GenomicRanges

library(GenomicRanges)

GRanges objects can be used as the input regions to all of iscream’s functions and can be returned by tabix_gr() and make_mat_gr().

From `tabix` queries

tabix_gr() returns a GenomicRanges object. The regions parameter can be a string vector, a data frame or a GRanges object. If given a GRanges object with metadata, those columns will be preserved in the output.

gr <- GRanges(regions)
values(gr) <- DataFrame(
    gene = c("gene1", "gene2", "gene3"),
    some_metadata = c("s1", "s2", "s3")
)
gr

## GRanges object with 3 ranges and 2 metadata columns:
##     seqnames    ranges strand |        gene some_metadata
##        <Rle> <IRanges>  <Rle> | <character>   <character>
##   A     chr1       1-6      * |       gene1            s1
##   B     chr1      7-10      * |       gene2            s2
##   C     chr1     11-14      * |       gene3            s3
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

tabix_gr(bedfiles[1], gr)

## GRanges object with 7 ranges and 4 metadata columns:
##       seqnames    ranges strand |        V1        V2        gene some_metadata
##          <Rle> <IRanges>  <Rle> | <numeric> <integer> <character>   <character>
##   [1]     chr1       1-2      * |       1.0         1       gene1            s1
##   [2]     chr1       3-4      * |       1.0         1       gene1            s1
##   [3]     chr1       5-6      * |       0.0         2       gene1            s1
##   [4]     chr1       7-8      * |       0.0         1       gene2            s2
##   [5]     chr1      9-10      * |       0.5         2       gene2            s2
##   [6]     chr1     11-12      * |       1.0         2       gene3            s3
##   [7]     chr1     13-14      * |       1.0         3       gene3            s3
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

The data.table output of tabix() can also be piped into GRanges, but does not preserve input metadata.

tabix(bedfiles[1], gr) |>
  makeGRangesFromDataFrame(
    starts.in.df.are.0based = TRUE,
    keep.extra.columns = TRUE
  )

## GRanges object with 7 ranges and 2 metadata columns:
##       seqnames    ranges strand |        V1        V2
##          <Rle> <IRanges>  <Rle> | <numeric> <integer>
##   [1]     chr1       1-2      * |       1.0         1
##   [2]     chr1       3-4      * |       1.0         1
##   [3]     chr1       5-6      * |       0.0         2
##   [4]     chr1       7-8      * |       0.0         1
##   [5]     chr1      9-10      * |       0.5         2
##   [6]     chr1     11-12      * |       1.0         2
##   [7]     chr1     13-14      * |       1.0         3
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

If the input BED file is not zero-based (e.g. Bismark coverage files), set zero_based = FALSE in the tabix() call to get the correct conversion from data frame to GenomicRanges.

From `summarize_regions()`

summarize_regions() returns a data frame with a feature column identifying each summary row’s genomic region.

If the region features are not named (see ?summarize_regions), pass the feature column with the genomic regions as the GRanges input regions. Here, since the regions vector is named, using unname will cause the feature column to populate with regions strings:

(summary <- summarize_meth_regions(
  bedfiles,
  unname(regions),
  fun = c("sum", "mean"))
)

## [13:21:28.562685] [iscream::summarize_regions] [info] Summarizing 3 regions from 4 bedfiles
## [13:21:28.562715] [iscream::summarize_regions] [info] using sum, mean
## [13:21:28.562719] [iscream::summarize_regions] [info] with columns 4, 5 as coverage, M

##       feature file coverage.sum M.sum coverage.mean    M.mean
## 1    chr1:1-6    a            4     2      1.333333 0.6666667
## 2   chr1:7-10    a            3     1      1.500000 0.5000000
## 3  chr1:11-14    a            5     5      2.500000 2.5000000
## 4    chr1:1-6    b            4     2      2.000000 1.0000000
## 5   chr1:7-10    b            1     1      1.000000 1.0000000
## 6  chr1:11-14    b            3     1      1.500000 0.5000000
## 7    chr1:1-6    c            2     2      2.000000 2.0000000
## 8   chr1:7-10    c            3     1      1.500000 0.5000000
## 9  chr1:11-14    c           NA    NA            NA        NA
## 10   chr1:1-6    d            3     3      1.500000 1.5000000
## 11  chr1:7-10    d            3     1      1.500000 0.5000000
## 12 chr1:11-14    d            1     1      1.000000 1.0000000

GRanges(summary$feature, summary = summary[, -1])

## GRanges object with 12 ranges and 5 metadata columns:
##        seqnames    ranges strand | summary.file summary.coverage.sum
##           <Rle> <IRanges>  <Rle> |  <character>            <numeric>
##    [1]     chr1       1-6      * |            a                    4
##    [2]     chr1      7-10      * |            a                    3
##    [3]     chr1     11-14      * |            a                    5
##    [4]     chr1       1-6      * |            b                    4
##    [5]     chr1      7-10      * |            b                    1
##    ...      ...       ...    ... .          ...                  ...
##    [8]     chr1      7-10      * |            c                    3
##    [9]     chr1     11-14      * |            c                   NA
##   [10]     chr1       1-6      * |            d                    3
##   [11]     chr1      7-10      * |            d                    3
##   [12]     chr1     11-14      * |            d                    1
##        summary.M.sum summary.coverage.mean summary.M.mean
##            <numeric>             <numeric>      <numeric>
##    [1]             2               1.33333       0.666667
##    [2]             1               1.50000       0.500000
##    [3]             5               2.50000       2.500000
##    [4]             2               2.00000       1.000000
##    [5]             1               1.00000       1.000000
##    ...           ...                   ...            ...
##    [8]             1                   1.5            0.5
##    [9]            NA                    NA             NA
##   [10]             3                   1.5            1.5
##   [11]             1                   1.5            0.5
##   [12]             1                   1.0            1.0
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

If the input regions are named use set_region_rownames = TRUE so that the genomic regions strings are preserved and use them as the GRanges input regions.

(summary <- summarize_regions(
  bedfiles,
  regions,
  column = 4,
  set_region_rownames = TRUE,
  fun = c("sum", "mean"))
)

## [13:21:28.679647] [iscream::summarize_regions] [info] Summarizing 3 regions from 4 bedfiles
## [13:21:28.679670] [iscream::summarize_regions] [info] using sum, mean
## [13:21:28.679674] [iscream::summarize_regions] [info] with columns 4 as V1

##            feature file V1.sum   V1.mean
## chr1:1-6         A    a    2.0 0.6666667
## chr1:7-10        B    a    0.5 0.2500000
## chr1:11-14       C    a    2.0 1.0000000
## chr1:1-6         A    b    1.0 0.5000000
## chr1:7-10        B    b    1.0 1.0000000
## chr1:11-14       C    b    1.0 0.5000000
## chr1:1-6         A    c    1.0 1.0000000
## chr1:7-10        B    c    1.0 0.5000000
## chr1:11-14       C    c     NA        NA
## chr1:1-6         A    d    2.0 1.0000000
## chr1:7-10        B    d    0.5 0.2500000
## chr1:11-14       C    d    1.0 1.0000000

GRanges(rownames(summary), summary = summary)

## GRanges object with 12 ranges and 4 metadata columns:
##        seqnames    ranges strand | summary.feature summary.file summary.V1.sum
##           <Rle> <IRanges>  <Rle> |     <character>  <character>      <numeric>
##    [1]     chr1       1-6      * |               A            a            2.0
##    [2]     chr1      7-10      * |               B            a            0.5
##    [3]     chr1     11-14      * |               C            a            2.0
##    [4]     chr1       1-6      * |               A            b            1.0
##    [5]     chr1      7-10      * |               B            b            1.0
##    ...      ...       ...    ... .             ...          ...            ...
##    [8]     chr1      7-10      * |               B            c            1.0
##    [9]     chr1     11-14      * |               C            c             NA
##   [10]     chr1       1-6      * |               A            d            2.0
##   [11]     chr1      7-10      * |               B            d            0.5
##   [12]     chr1     11-14      * |               C            d            1.0
##        summary.V1.mean
##              <numeric>
##    [1]        0.666667
##    [2]        0.250000
##    [3]        1.000000
##    [4]        0.500000
##    [5]        1.000000
##    ...             ...
##    [8]            0.50
##    [9]              NA
##   [10]            1.00
##   [11]            0.25
##   [12]            1.00
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

From `make_mat`

make_mat_gr() returns a GRanges object for dense matrices.

make_mat_gr(bedfiles, regions, column = 4, mat_name = "beta")

## [13:21:28.793173] [iscream::query_all] [info] Querying 3 regions from 4 bedfiles
## 
## [13:21:28.793508] [iscream::query_all] [info] Creating metadata vectors
## [13:21:28.793546] [iscream::query_all] [info] 7 loci found - 9993 extra rows allocated with 0 resizes
## [13:21:28.793549] [iscream::query_all] [info] Creating dense matrix

## GRanges object with 7 ranges and 4 metadata columns:
##       seqnames    ranges strand |         a         b         c         d
##          <Rle> <IRanges>  <Rle> | <numeric> <numeric> <numeric> <numeric>
##   [1]     chr1         0      * |       1.0         0         0       1.0
##   [2]     chr1         2      * |       1.0         0         1       1.0
##   [3]     chr1         4      * |       0.0         1         0       0.0
##   [4]     chr1         6      * |       0.0         1         0       0.0
##   [5]     chr1         8      * |       0.5         0         1       0.5
##   [6]     chr1        10      * |       1.0         0         0       0.0
##   [7]     chr1        12      * |       1.0         1         0       1.0
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

SummarizedExperiment

make_mat_se() returns a RangedSummarizedExperiment for both sparse and dense matrices.

make_mat_se(bedfiles, regions, column = 4, mat_name = "beta", sparse = TRUE)

## [13:21:30.945429] [iscream::query_all] [info] Querying 3 regions from 4 bedfiles
## 
## [13:21:30.945813] [iscream::query_all] [info] Creating metadata vectors
## [13:21:30.945836] [iscream::query_all] [info] 7 loci found - 9993 extra rows allocated with 0 resizes
## [13:21:30.945843] [iscream::query_all] [info] Creating sparse matrix

## class: RangedSummarizedExperiment 
## dim: 7 4 
## metadata(0):
## assays(1): beta
## rownames: NULL
## rowData names(0):
## colnames(4): a b c d
## colData names(0):

Making `BSseq` objects

A bsseq object is a type of SummarizedExperiment, but it cannot handle sparse matrices:

library(bsseq)
do.call(BSseq, mats)

Session info

sessionInfo()

## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] GenomicRanges_1.60.0 GenomeInfoDb_1.44.1  IRanges_2.42.0      
## [4] S4Vectors_0.46.0     BiocGenerics_0.54.0  generics_0.1.4      
## [7] iscream_0.99.1       BiocStyle_2.36.0    
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10                 SparseArray_1.8.1          
##  [3] lattice_0.22-7              digest_0.6.37              
##  [5] evaluate_1.0.4              grid_4.5.1                 
##  [7] bookdown_0.43               fastmap_1.2.0              
##  [9] jsonlite_2.0.0              Matrix_1.7-3               
## [11] BiocManager_1.30.26         httr_1.4.7                 
## [13] UCSC.utils_1.4.0            textshaping_1.0.1          
## [15] jquerylib_0.1.4             abind_1.4-8                
## [17] cli_3.6.5                   rlang_1.1.6                
## [19] crayon_1.5.3                XVector_0.48.0             
## [21] Biobase_2.68.0              parallelly_1.45.1          
## [23] DelayedArray_0.34.1         cachem_1.1.0               
## [25] yaml_2.3.10                 S4Arrays_1.8.1             
## [27] tools_4.5.1                 parallel_4.5.1             
## [29] GenomeInfoDbData_1.2.14     SummarizedExperiment_1.38.1
## [31] R6_2.6.1                    matrixStats_1.5.0          
## [33] lifecycle_1.0.4             fs_1.6.6                   
## [35] stringfish_0.17.0           ragg_1.4.0                 
## [37] desc_1.4.3                  pkgdown_2.1.3              
## [39] RcppParallel_5.1.10         bslib_0.9.0                
## [41] data.table_1.17.8           Rcpp_1.1.0                 
## [43] systemfonts_1.2.3           xfun_0.52                  
## [45] MatrixGenerics_1.20.0       knitr_1.50                 
## [47] htmltools_0.5.8.1           rmarkdown_2.29             
## [49] compiler_4.5.1

GenomicRanges

From tabix queries

From summarize_regions()