Skip to content

perf: "two-pass" seurat hvg via scanpy.get.aggregate#4013

Draft
ilan-gold wants to merge 5 commits intomainfrom
ig/two_pass_hvg_v3
Draft

perf: "two-pass" seurat hvg via scanpy.get.aggregate#4013
ilan-gold wants to merge 5 commits intomainfrom
ig/two_pass_hvg_v3

Conversation

@ilan-gold
Copy link
Copy Markdown
Contributor

An idea that popped into my head for disk-bound datasets but likely also normal ones. This should, in theory, greatly improve on-disk access and produce speed ups for disk bound data by reducing the amount of i/o in the worst case, unordered scenario (while, I would guess, leaving in-memory datasets untocuhed or maybe improved thanks to memory access + more efficient mean/var).

  • Closes #
  • Tests included or not required because:

@ilan-gold ilan-gold added this to the 1.12.1 milestone Mar 26, 2026
@ilan-gold ilan-gold changed the title perf: "two-pass" seurat hvg3 via scanpy.get.aggregate perf: "two-pass" seurat hvg via scanpy.get.aggregate Mar 26, 2026
@codecov
Copy link
Copy Markdown

codecov bot commented Mar 26, 2026

❌ 1 Tests Failed:

Tests completed Failed Passed Skipped
2399 1 2398 339
View the top 1 failed test(s) by shortest run time
tests/test_highly_variable_genes.py::test_compare_to_seurat_v3
Stack Traces | 733s run time
#x1B[0m#x1B[37m@needs#x1B[39;49;00m.skmisc#x1B[90m#x1B[39;49;00m
    #x1B[94mdef#x1B[39;49;00m#x1B[90m #x1B[39;49;00m#x1B[92mtest_compare_to_seurat_v3#x1B[39;49;00m():#x1B[90m#x1B[39;49;00m
        #x1B[90m### test without batch#x1B[39;49;00m#x1B[90m#x1B[39;49;00m
        seurat_hvg_info = pd.read_csv(FILE_V3)#x1B[90m#x1B[39;49;00m
    #x1B[90m#x1B[39;49;00m
        pbmc = pbmc3k()#x1B[90m#x1B[39;49;00m
        sc.pp.filter_cells(pbmc, min_genes=#x1B[94m200#x1B[39;49;00m)  #x1B[90m# this doesnt do anything btw#x1B[39;49;00m#x1B[90m#x1B[39;49;00m
        sc.pp.filter_genes(pbmc, min_cells=#x1B[94m3#x1B[39;49;00m)#x1B[90m#x1B[39;49;00m
    #x1B[90m#x1B[39;49;00m
        pbmc_dense = pbmc.copy()#x1B[90m#x1B[39;49;00m
        pbmc_dense.X = pbmc_dense.X.toarray()#x1B[90m#x1B[39;49;00m
    #x1B[90m#x1B[39;49;00m
        sc.pp.highly_variable_genes(pbmc, n_top_genes=#x1B[94m1000#x1B[39;49;00m, flavor=#x1B[33m"#x1B[39;49;00m#x1B[33mseurat_v3#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m)#x1B[90m#x1B[39;49;00m
        sc.pp.highly_variable_genes(pbmc_dense, n_top_genes=#x1B[94m1000#x1B[39;49;00m, flavor=#x1B[33m"#x1B[39;49;00m#x1B[33mseurat_v3#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m)#x1B[90m#x1B[39;49;00m
    #x1B[90m#x1B[39;49;00m
        np.testing.assert_allclose(#x1B[90m#x1B[39;49;00m
            seurat_hvg_info[#x1B[33m"#x1B[39;49;00m#x1B[33mvariance#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m],#x1B[90m#x1B[39;49;00m
            pbmc.var[#x1B[33m"#x1B[39;49;00m#x1B[33mvariances#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m],#x1B[90m#x1B[39;49;00m
            rtol=#x1B[94m2e-05#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
            atol=#x1B[94m2e-05#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
        )#x1B[90m#x1B[39;49;00m
        np.testing.assert_allclose(#x1B[90m#x1B[39;49;00m
            seurat_hvg_info[#x1B[33m"#x1B[39;49;00m#x1B[33mvariance.standardized#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m],#x1B[90m#x1B[39;49;00m
            pbmc.var[#x1B[33m"#x1B[39;49;00m#x1B[33mvariances_norm#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m],#x1B[90m#x1B[39;49;00m
            rtol=#x1B[94m2e-05#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
            atol=#x1B[94m2e-05#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
        )#x1B[90m#x1B[39;49;00m
        np.testing.assert_allclose(#x1B[90m#x1B[39;49;00m
            pbmc_dense.var[#x1B[33m"#x1B[39;49;00m#x1B[33mvariances_norm#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m],#x1B[90m#x1B[39;49;00m
            pbmc.var[#x1B[33m"#x1B[39;49;00m#x1B[33mvariances_norm#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m],#x1B[90m#x1B[39;49;00m
            rtol=#x1B[94m2e-05#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
            atol=#x1B[94m2e-05#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
        )#x1B[90m#x1B[39;49;00m
    #x1B[90m#x1B[39;49;00m
        #x1B[90m### test with batch#x1B[39;49;00m#x1B[90m#x1B[39;49;00m
        #x1B[90m# introduce a dummy "technical covariate"; this is used in Seurat's SelectIntegrationFeatures#x1B[39;49;00m#x1B[90m#x1B[39;49;00m
        pbmc.obs[#x1B[33m"#x1B[39;49;00m#x1B[33mdummy_tech#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m] = (#x1B[90m#x1B[39;49;00m
            #x1B[33m"#x1B[39;49;00m#x1B[33msource_#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m + pd.array([*#x1B[96mrange#x1B[39;49;00m(#x1B[94m1#x1B[39;49;00m, #x1B[94m6#x1B[39;49;00m), #x1B[94m5#x1B[39;49;00m]).repeat(#x1B[94m500#x1B[39;49;00m).astype(#x1B[33m"#x1B[39;49;00m#x1B[33mstring#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m)#x1B[90m#x1B[39;49;00m
        )[: pbmc.n_obs]#x1B[90m#x1B[39;49;00m
    #x1B[90m#x1B[39;49;00m
        seurat_v3_paper = sc.pp.highly_variable_genes(#x1B[90m#x1B[39;49;00m
            pbmc,#x1B[90m#x1B[39;49;00m
            n_top_genes=#x1B[94m2000#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
            flavor=#x1B[33m"#x1B[39;49;00m#x1B[33mseurat_v3_paper#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
            batch_key=#x1B[33m"#x1B[39;49;00m#x1B[33mdummy_tech#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
            inplace=#x1B[94mFalse#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
        )#x1B[90m#x1B[39;49;00m
    #x1B[90m#x1B[39;49;00m
        seurat_v3 = sc.pp.highly_variable_genes(#x1B[90m#x1B[39;49;00m
            pbmc,#x1B[90m#x1B[39;49;00m
            n_top_genes=#x1B[94m2000#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
            flavor=#x1B[33m"#x1B[39;49;00m#x1B[33mseurat_v3#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
            batch_key=#x1B[33m"#x1B[39;49;00m#x1B[33mdummy_tech#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
            inplace=#x1B[94mFalse#x1B[39;49;00m,#x1B[90m#x1B[39;49;00m
        )#x1B[90m#x1B[39;49;00m
    #x1B[90m#x1B[39;49;00m
        seurat_hvg_info_batch = pd.read_csv(FILE_V3_BATCH)#x1B[90m#x1B[39;49;00m
        seu = pd.Index(seurat_hvg_info_batch[#x1B[33m"#x1B[39;49;00m#x1B[33mx#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m].to_numpy())#x1B[90m#x1B[39;49;00m
    #x1B[90m#x1B[39;49;00m
        gene_intersection_paper = seu.intersection(#x1B[90m#x1B[39;49;00m
            seurat_v3_paper[seurat_v3_paper[#x1B[33m"#x1B[39;49;00m#x1B[33mhighly_variable#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m]].index#x1B[90m#x1B[39;49;00m
        )#x1B[90m#x1B[39;49;00m
        gene_intersection_impl = seu.intersection(#x1B[90m#x1B[39;49;00m
            seurat_v3[seurat_v3[#x1B[33m"#x1B[39;49;00m#x1B[33mhighly_variable#x1B[39;49;00m#x1B[33m"#x1B[39;49;00m]].index#x1B[90m#x1B[39;49;00m
        )#x1B[90m#x1B[39;49;00m
>       #x1B[94massert#x1B[39;49;00m #x1B[96mlen#x1B[39;49;00m(gene_intersection_paper) / #x1B[94m2000#x1B[39;49;00m > #x1B[94m0.95#x1B[39;49;00m#x1B[90m#x1B[39;49;00m
#x1B[1m#x1B[31mE       AssertionError: assert (1867 / 2000) > 0.95#x1B[0m
#x1B[1m#x1B[31mE        +  where 1867 = len(Index(['LYZ', 'GNLY', 'S100A9', 'FTL', 'FTH1', 'S100A8', 'HLA-DRA', 'CST3',\n       'CD74', 'NKG7',\n       ...\n       'CR1', 'SLC20A2', 'HBEGF', 'IL15', 'DNAJC9', 'LRRC59', 'ALG12', 'DPP4',\n       'CLUAP1', 'HCFC2'],\n      dtype='object', length=1867))#x1B[0m

#x1B[1m#x1B[31mtests/test_highly_variable_genes.py#x1B[0m:492: AssertionError

To view more test analytics, go to the Test Analytics Dashboard
📋 Got 3 mins? Take this short survey to help us improve Test Analytics.

@scverse-benchmark
Copy link
Copy Markdown

scverse-benchmark bot commented Mar 26, 2026

Benchmark changes

Change Before [b5efe54] After [fdc5653] Ratio Benchmark (Parameter)
- 450±2μs 388±3μs 0.86 preprocessing_counts.FastSuite.time_log1p('pbmc68k_reduced', 'counts')
! 8.57G failed n/a preprocessing_log.HVGSuite.peakmem_highly_variable_genes('seurat_v3')
! 1.97±0s failed n/a preprocessing_log.HVGSuite.time_highly_variable_genes('seurat_v3')

Warning

Some benchmarks failed

Comparison: https://github.com/scverse/scanpy/compare/b5efe54e062ceb3653f4a5732f19f227057bb916..fdc5653bb78715b8ba085a43c51ef102e9e66a4f
Last changed: Thu, 26 Mar 2026 17:14:34 +0000

More details: https://github.com/scverse/scanpy/pull/4013/checks?check_run_id=68723399697

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant