perf: "two-pass" seurat hvg via `scanpy.get.aggregate` by ilan-gold · Pull Request #4013 · scverse/scanpy

ilan-gold · 2026-03-26T13:49:28Z

An idea that popped into my head for disk-bound datasets but likely also normal ones. This should, in theory, greatly improve on-disk access and produce speed ups for disk bound data by reducing the amount of i/o in the worst case, unordered scenario (while, I would guess, leaving in-memory datasets untocuhed or maybe improved thanks to memory access + more efficient mean/var).

Closes #
Tests included or not required because:

Release notes not necessary because:

codecov · 2026-03-26T14:11:05Z

❌ 1 Tests Failed:

Tests completed	Failed	Passed	Skipped
2399	1	2398	339

View the top 1 failed test(s) by shortest run time

tests/test_highly_variable_genes.py::test_compare_to_seurat_v3

Stack Traces | 733s run time

#x1B[0m#x1B[37m@needs#x1B[39;49;00m.skmisc#x1B[90m#x1B[39;49;00m
    #x1B[94mdef#x1B[39;49;00m#x1B[90m #x1B[39;49;00m#x1B[92mtest_compare_to_seurat_v3#x1B[39;49;00m():#x1B[90m#x1B[39;49;00m
        #x1B[90m### test without batch#x1B[39;49;00m#x1B[90m#x1B[39;49;00m
        seurat_hvg_info = pd.read_csv(FILE_V3)#x1B[90m#x1B[39;49;00m
    #x1B[90m#x1B[39;49;00m
        pbmc = pbmc3k()#x1B[90m#x1B[39;49;00m
        sc.pp.filter_cells(pbmc, min_genes=#x1B[94m200#x1B[39;49;00m)  #x1B[90m# this doesnt do anything btw#x1B[39;49;00m#x1B[90m#x1B[39;49;00m
        sc.pp.filter_genes(pbmc, min_cells=#x1B[94m3#x1B[39;49;00m)#x1B[90m#x1B[39;49;00m
    #x1B[90m#x1B[39;49;00m
        pbmc_dense = pbmc.copy()#x1B[90m#x1B[39;49;00m
        pbmc_dense.X = pbmc_dense.X.toarray()#x1B[90m#x1B[39;49;00m
    #x1B[90m#x1B[39;49;00m
        sc.pp.highly_variable_genes(pbmc, n_top_genes=#x1B[94m1000#x1B[39;49;00m, flavor=#x1B[33m"#x1B[39;49;00m#x1B[33mseurat_v3#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m)#x1B[90m#x1B[39;49;00m
        sc.pp.highly_variable_genes(pbmc_dense, n_top_genes=#x1B[94m1000#x1B[39;49;00m, flavor=#x1B[33m"#x1B[39;49;00m#x1B[33mseurat_v3#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m)#x1B[90m#x1B[39;49;00m
    #x1B[90m#x1B[39;49;00m
        np.testing.assert_allclose(#x1B[90m#x1B[39;49;00m
            seurat_hvg_info[#x1B[33m"#x1B[39;49;00m#x1B[33mvariance#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m],#x1B[90m#x1B[39;49;00m
            pbmc.var[#x1B[33m"#x1B[39;49;00m#x1B[33mvariances#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m],#x1B[90m#x1B[39;49;00m
            rtol=#x1B[94m2e-05#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
            atol=#x1B[94m2e-05#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
        )#x1B[90m#x1B[39;49;00m
        np.testing.assert_allclose(#x1B[90m#x1B[39;49;00m
            seurat_hvg_info[#x1B[33m"#x1B[39;49;00m#x1B[33mvariance.standardized#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m],#x1B[90m#x1B[39;49;00m
            pbmc.var[#x1B[33m"#x1B[39;49;00m#x1B[33mvariances_norm#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m],#x1B[90m#x1B[39;49;00m
            rtol=#x1B[94m2e-05#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
            atol=#x1B[94m2e-05#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
        )#x1B[90m#x1B[39;49;00m
        np.testing.assert_allclose(#x1B[90m#x1B[39;49;00m
            pbmc_dense.var[#x1B[33m"#x1B[39;49;00m#x1B[33mvariances_norm#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m],#x1B[90m#x1B[39;49;00m
            pbmc.var[#x1B[33m"#x1B[39;49;00m#x1B[33mvariances_norm#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m],#x1B[90m#x1B[39;49;00m
            rtol=#x1B[94m2e-05#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
            atol=#x1B[94m2e-05#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
        )#x1B[90m#x1B[39;49;00m
    #x1B[90m#x1B[39;49;00m
        #x1B[90m### test with batch#x1B[39;49;00m#x1B[90m#x1B[39;49;00m
        #x1B[90m# introduce a dummy "technical covariate"; this is used in Seurat's SelectIntegrationFeatures#x1B[39;49;00m#x1B[90m#x1B[39;49;00m
        pbmc.obs[#x1B[33m"#x1B[39;49;00m#x1B[33mdummy_tech#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m] = (#x1B[90m#x1B[39;49;00m
            #x1B[33m"#x1B[39;49;00m#x1B[33msource_#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m + pd.array([*#x1B[96mrange#x1B[39;49;00m(#x1B[94m1#x1B[39;49;00m, #x1B[94m6#x1B[39;49;00m), #x1B[94m5#x1B[39;49;00m]).repeat(#x1B[94m500#x1B[39;49;00m).astype(#x1B[33m"#x1B[39;49;00m#x1B[33mstring#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m)#x1B[90m#x1B[39;49;00m
        )[: pbmc.n_obs]#x1B[90m#x1B[39;49;00m
    #x1B[90m#x1B[39;49;00m
        seurat_v3_paper = sc.pp.highly_variable_genes(#x1B[90m#x1B[39;49;00m
            pbmc,#x1B[90m#x1B[39;49;00m
            n_top_genes=#x1B[94m2000#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
            flavor=#x1B[33m"#x1B[39;49;00m#x1B[33mseurat_v3_paper#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
            batch_key=#x1B[33m"#x1B[39;49;00m#x1B[33mdummy_tech#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
            inplace=#x1B[94mFalse#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
        )#x1B[90m#x1B[39;49;00m
    #x1B[90m#x1B[39;49;00m
        seurat_v3 = sc.pp.highly_variable_genes(#x1B[90m#x1B[39;49;00m
            pbmc,#x1B[90m#x1B[39;49;00m
            n_top_genes=#x1B[94m2000#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
            flavor=#x1B[33m"#x1B[39;49;00m#x1B[33mseurat_v3#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
            batch_key=#x1B[33m"#x1B[39;49;00m#x1B[33mdummy_tech#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
            inplace=#x1B[94mFalse#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
        )#x1B[90m#x1B[39;49;00m
    #x1B[90m#x1B[39;49;00m
        seurat_hvg_info_batch = pd.read_csv(FILE_V3_BATCH)#x1B[90m#x1B[39;49;00m
        seu = pd.Index(seurat_hvg_info_batch[#x1B[33m"#x1B[39;49;00m#x1B[33mx#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m].to_numpy())#x1B[90m#x1B[39;49;00m
    #x1B[90m#x1B[39;49;00m
        gene_intersection_paper = seu.intersection(#x1B[90m#x1B[39;49;00m
            seurat_v3_paper[seurat_v3_paper[#x1B[33m"#x1B[39;49;00m#x1B[33mhighly_variable#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m]].index#x1B[90m#x1B[39;49;00m
        )#x1B[90m#x1B[39;49;00m
        gene_intersection_impl = seu.intersection(#x1B[90m#x1B[39;49;00m
            seurat_v3[seurat_v3[#x1B[33m"#x1B[39;49;00m#x1B[33mhighly_variable#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m]].index#x1B[90m#x1B[39;49;00m
        )#x1B[90m#x1B[39;49;00m
>       #x1B[94massert#x1B[39;49;00m #x1B[96mlen#x1B[39;49;00m(gene_intersection_paper) / #x1B[94m2000#x1B[39;49;00m > #x1B[94m0.95#x1B[39;49;00m#x1B[90m#x1B[39;49;00m
#x1B[1m#x1B[31mE       AssertionError: assert (1867 / 2000) > 0.95#x1B[0m
#x1B[1m#x1B[31mE        +  where 1867 = len(Index(['LYZ', 'GNLY', 'S100A9', 'FTL', 'FTH1', 'S100A8', 'HLA-DRA', 'CST3',\n       'CD74', 'NKG7',\n       ...\n       'CR1', 'SLC20A2', 'HBEGF', 'IL15', 'DNAJC9', 'LRRC59', 'ALG12', 'DPP4',\n       'CLUAP1', 'HCFC2'],\n      dtype='object', length=1867))#x1B[0m

#x1B[1m#x1B[31mtests/test_highly_variable_genes.py#x1B[0m:492: AssertionError

To view more test analytics, go to the Test Analytics Dashboard
_{📋 Got 3 mins? Take this short survey to help us improve Test Analytics.}

scverse-benchmark · 2026-03-26T14:29:37Z

Benchmark changes

Change	Before [`b5efe54`]	After [`fdc5653`]	Ratio	Benchmark (Parameter)
-	450±2μs	388±3μs	0.86	preprocessing_counts.FastSuite.time_log1p('pbmc68k_reduced', 'counts')
!	8.57G	failed	n/a	preprocessing_log.HVGSuite.peakmem_highly_variable_genes('seurat_v3')
!	1.97±0s	failed	n/a	preprocessing_log.HVGSuite.time_highly_variable_genes('seurat_v3')

Warning

Some benchmarks failed

Comparison: https://github.com/scverse/scanpy/compare/b5efe54e062ceb3653f4a5732f19f227057bb916..fdc5653bb78715b8ba085a43c51ef102e9e66a4f
Last changed: Thu, 26 Mar 2026 17:14:34 +0000

More details: https://github.com/scverse/scanpy/pull/4013/checks?check_run_id=68723399697

ilan-gold added 2 commits March 26, 2026 14:44

perf: "two-pass" seurat hvg3 via scanpy.get.aggregate

a625c55

chore: hvg v3 benchmark

d839e98

ilan-gold added this to the 1.12.1 milestone Mar 26, 2026

ilan-gold added the benchmark label Mar 26, 2026

ilan-gold changed the title ~~perf: "two-pass" seurat hvg3 via scanpy.get.aggregate~~ perf: "two-pass" seurat hvg via scanpy.get.aggregate Mar 26, 2026

ilan-gold added 2 commits March 26, 2026 14:56

fix: use counts

86db499

fix: use a batch key

d5a6a78

fix: not again

fdc5653

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

perf: "two-pass" seurat hvg via `scanpy.get.aggregate`#4013

perf: "two-pass" seurat hvg via `scanpy.get.aggregate`#4013
ilan-gold wants to merge 5 commits intomainfrom
ig/two_pass_hvg_v3

ilan-gold commented Mar 26, 2026

Uh oh!

codecov bot commented Mar 26, 2026 •

edited

Loading

Uh oh!

scverse-benchmark bot commented Mar 26, 2026 •

edited

Loading

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

1 participant

Conversation

ilan-gold commented Mar 26, 2026

Uh oh!

codecov bot commented Mar 26, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

❌ 1 Tests Failed:

Uh oh!

scverse-benchmark bot commented Mar 26, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Benchmark changes

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

1 participant

codecov bot commented Mar 26, 2026 •

edited

Loading

scverse-benchmark bot commented Mar 26, 2026 •

edited

Loading