From 06805e7660df3ac7ab973214b43169bc6d1d2976 Mon Sep 17 00:00:00 2001 From: Liudeng Zhang Date: Wed, 25 Mar 2026 15:00:55 -0500 Subject: [PATCH] Close census connection in extract_czi_markers cellxgene_census.open_soma() was never closed in extract_czi_markers, leaking the SOMA connection. Wrap the census usage in try/finally to ensure census.close() is always called, matching the pattern already used in download_czi_reference. --- spatialagent/tool/databases.py | 168 +++++++++++++++++---------------- 1 file changed, 86 insertions(+), 82 deletions(-) diff --git a/spatialagent/tool/databases.py b/spatialagent/tool/databases.py index 377f835..4b0b3df 100644 --- a/spatialagent/tool/databases.py +++ b/spatialagent/tool/databases.py @@ -447,88 +447,92 @@ def get_cellguide_file(relpth, snapshot=LATEST_SNAPSHOT): # Read from CZI Census census = cellxgene_census.open_soma(census_version="latest") - all_results = [] - - for did in dataset_ids: - # Query dataset - query = f'dataset_id == "{did}"' - adata = cellxgene_census.get_anndata(census, organism, obs_value_filter=query) - - # Get cell types - cell_types = adata.obs["cell_type"].value_counts() - - for cell_type, count in cell_types.items(): - # Get cell_type_id with error handling for empty results - cell_type_mask = adata.obs["cell_type"] == cell_type - cell_type_ids = adata.obs[cell_type_mask]["cell_type_ontology_term_id"].values - - if len(cell_type_ids) == 0: - continue # Skip if no cell_type_ontology_term_id found - - cell_type_id = cell_type_ids[0] - - # Get marker genes from CellGuide - # Note: CellGuide uses underscore format (CL_0000182) not colon format (CL:0000182) - cellguide_id = cell_type_id.replace(":", "_") - - comp_genes = [] - cano_genes = [] - - # Limit marker genes to top N for readability (CellGuide can return 500+ genes) - MAX_MARKER_GENES = 100 - - # Helper to convert gene symbols based on organism - # CellGuide returns mouse-format symbols (title case like 'Grin2b') - # Human genes should be uppercase (GRIN2B), mouse stays title case - def normalize_gene_symbol(gene: str) -> str: - if organism == "Homo sapiens": - return gene.upper() - return gene # Keep mouse format as-is - - try: - comp_markers = get_cellguide_file(f"computational_marker_genes/{cellguide_id}.json") - if comp_markers.status_code == 200 and comp_markers.text: - comp_markers_df = pd.DataFrame.from_records(comp_markers.json()) - # Gene symbol is in 'symbol' column, not 'marker_gene' - if "symbol" in comp_markers_df.columns: - comp_genes = [normalize_gene_symbol(g) for g in comp_markers_df["symbol"].tolist()[:MAX_MARKER_GENES]] - elif "marker_gene" in comp_markers_df.columns: - comp_genes = [normalize_gene_symbol(g) for g in comp_markers_df["marker_gene"].tolist()[:MAX_MARKER_GENES]] - except Exception: - pass # CellGuide may not have data for all cell types - - try: - cano_markers = get_cellguide_file(f"canonical_marker_genes/{cellguide_id}.json") - if cano_markers.status_code == 200 and cano_markers.text: - cano_markers_df = pd.DataFrame.from_records(cano_markers.json()) - if "symbol" in cano_markers_df.columns: - cano_genes = [normalize_gene_symbol(g) for g in cano_markers_df["symbol"].tolist()[:MAX_MARKER_GENES]] - elif "marker_gene" in cano_markers_df.columns: - cano_genes = [normalize_gene_symbol(g) for g in cano_markers_df["marker_gene"].tolist()[:MAX_MARKER_GENES]] - except Exception: - pass - - all_results.append({ - "cell_type": cell_type, - "cell_type_id": cell_type_id, - "n_cells": count, - "marker_genes": comp_genes, - "cano_marker_genes": cano_genes, - }) - - # Save - df_results = pd.DataFrame(all_results) - df_results.to_csv(save_csv, index=False) - - # Count how many cell types have marker genes - n_with_markers = sum(1 for r in all_results if r["marker_genes"] or r["cano_marker_genes"]) - n_without_markers = len(all_results) - n_with_markers - - msg = f"Successfully processed {len(dataset_ids)} CZI dataset(s) with {len(all_results)} cell types. Saved to {save_csv}" - if n_without_markers > 0: - msg += f"\nNote: {n_with_markers} cell types have marker genes from CellGuide, {n_without_markers} do not (may need PanglaoDB/CellMarker2 lookup)." - - return msg + try: + all_results = [] + + for did in dataset_ids: + # Query dataset + query = f'dataset_id == "{did}"' + adata = cellxgene_census.get_anndata(census, organism, obs_value_filter=query) + + # Get cell types + cell_types = adata.obs["cell_type"].value_counts() + + for cell_type, count in cell_types.items(): + # Get cell_type_id with error handling for empty results + cell_type_mask = adata.obs["cell_type"] == cell_type + cell_type_ids = adata.obs[cell_type_mask]["cell_type_ontology_term_id"].values + + if len(cell_type_ids) == 0: + continue # Skip if no cell_type_ontology_term_id found + + cell_type_id = cell_type_ids[0] + + # Get marker genes from CellGuide + # Note: CellGuide uses underscore format (CL_0000182) not colon format (CL:0000182) + cellguide_id = cell_type_id.replace(":", "_") + + comp_genes = [] + cano_genes = [] + + # Limit marker genes to top N for readability (CellGuide can return 500+ genes) + MAX_MARKER_GENES = 100 + + # Helper to convert gene symbols based on organism + # CellGuide returns mouse-format symbols (title case like 'Grin2b') + # Human genes should be uppercase (GRIN2B), mouse stays title case + def normalize_gene_symbol(gene: str) -> str: + if organism == "Homo sapiens": + return gene.upper() + return gene # Keep mouse format as-is + + try: + comp_markers = get_cellguide_file(f"computational_marker_genes/{cellguide_id}.json") + if comp_markers.status_code == 200 and comp_markers.text: + comp_markers_df = pd.DataFrame.from_records(comp_markers.json()) + # Gene symbol is in 'symbol' column, not 'marker_gene' + if "symbol" in comp_markers_df.columns: + comp_genes = [normalize_gene_symbol(g) for g in comp_markers_df["symbol"].tolist()[:MAX_MARKER_GENES]] + elif "marker_gene" in comp_markers_df.columns: + comp_genes = [normalize_gene_symbol(g) for g in comp_markers_df["marker_gene"].tolist()[:MAX_MARKER_GENES]] + except Exception: + pass # CellGuide may not have data for all cell types + + try: + cano_markers = get_cellguide_file(f"canonical_marker_genes/{cellguide_id}.json") + if cano_markers.status_code == 200 and cano_markers.text: + cano_markers_df = pd.DataFrame.from_records(cano_markers.json()) + if "symbol" in cano_markers_df.columns: + cano_genes = [normalize_gene_symbol(g) for g in cano_markers_df["symbol"].tolist()[:MAX_MARKER_GENES]] + elif "marker_gene" in cano_markers_df.columns: + cano_genes = [normalize_gene_symbol(g) for g in cano_markers_df["marker_gene"].tolist()[:MAX_MARKER_GENES]] + except Exception: + pass + + all_results.append({ + "cell_type": cell_type, + "cell_type_id": cell_type_id, + "n_cells": count, + "marker_genes": comp_genes, + "cano_marker_genes": cano_genes, + }) + + # Save + df_results = pd.DataFrame(all_results) + df_results.to_csv(save_csv, index=False) + + # Count how many cell types have marker genes + n_with_markers = sum(1 for r in all_results if r["marker_genes"] or r["cano_marker_genes"]) + n_without_markers = len(all_results) - n_with_markers + + msg = f"Successfully processed {len(dataset_ids)} CZI dataset(s) with {len(all_results)} cell types. Saved to {save_csv}" + if n_without_markers > 0: + msg += f"\nNote: {n_with_markers} cell types have marker genes from CellGuide, {n_without_markers} do not (may need PanglaoDB/CellMarker2 lookup)." + + return msg + + finally: + census.close() # =============================================================================