From 5f16e4c1f90a5cb6de74f596dca73ed8ac531f18 Mon Sep 17 00:00:00 2001 From: Gregory Jefferis Date: Fri, 3 Apr 2026 23:32:00 +0100 Subject: [PATCH 1/3] weighted jaccard functions for coconat --- DESCRIPTION | 4 +- NAMESPACE | 2 + R/RcppExports.R | 35 +++ man/weighted_jaccard_dense_cpp.Rd | 24 ++ man/weighted_jaccard_sparse_fill_cpp.Rd | 26 +++ src/RcppExports.cpp | 27 +++ src/weighted_jaccard.cpp | 291 ++++++++++++++++++++++++ tests/testthat/test-weighted-jaccard.R | 82 +++++++ 8 files changed, 489 insertions(+), 2 deletions(-) create mode 100644 man/weighted_jaccard_dense_cpp.Rd create mode 100644 man/weighted_jaccard_sparse_fill_cpp.Rd create mode 100644 src/weighted_jaccard.cpp create mode 100644 tests/testthat/test-weighted-jaccard.R diff --git a/DESCRIPTION b/DESCRIPTION index 5dff4ca..71d25a7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: natcpp Title: Fast C++ Primitives for the 'NeuroAnatomy Toolbox' -Version: 0.2 +Version: 0.3.0 Authors@R: person(given = "Gregory", family = "Jefferis", @@ -28,4 +28,4 @@ LinkingTo: Config/testthat/edition: 3 Encoding: UTF-8 Language: en-GB -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 diff --git a/NAMESPACE b/NAMESPACE index 504bfff..f614ccb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,5 +10,7 @@ export(c_sub2ind) export(c_topntail) export(c_topntail_list) export(c_total_cable) +export(weighted_jaccard_dense_cpp) +export(weighted_jaccard_sparse_fill_cpp) import(Rcpp) useDynLib(natcpp, .registration = TRUE) diff --git a/R/RcppExports.R b/R/RcppExports.R index c90d42d..90fcc19 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -135,3 +135,38 @@ c_EdgeListFromSegList <- function(L) { .Call(`_natcpp_c_EdgeListFromSegList`, L) } +#' Fill min_sums into a sparse pattern for weighted Jaccard similarity +#' +#' Given a dgCMatrix \code{x} and a sparsity pattern \code{pattern} (from +#' \code{crossprod(binarise(x))}), compute the sum of element-wise minima +#' for each pair of columns (or rows) that co-occur in at least one feature. +#' Intended for internal use by \code{coconat::jaccard_sim}. +#' +#' @param x A dgCMatrix (sparse column-compressed matrix) +#' @param pattern A dgCMatrix giving the output sparsity pattern +#' @param transpose If \code{FALSE}, compare columns; if \code{TRUE}, compare +#' rows +#' @return A numeric vector of min_sums aligned with the pattern's slot +#' structure +#' @export +weighted_jaccard_sparse_fill_cpp <- function(x, pattern, transpose = FALSE) { + .Call(`_natcpp_weighted_jaccard_sparse_fill_cpp`, x, pattern, transpose) +} + +#' Dense weighted Jaccard similarity via C++ +#' +#' Compute the full weighted Jaccard similarity matrix for a dgCMatrix using +#' an adaptive dense accumulation strategy. For small output matrices, uses a +#' feature-oriented loop; for large outputs, switches to a column-oriented +#' loop for better cache performance. Intended for internal use by +#' \code{coconat::jaccard_sim}. +#' +#' @param x A dgCMatrix (sparse column-compressed matrix) +#' @param transpose If \code{FALSE}, compare columns; if \code{TRUE}, compare +#' rows +#' @return A dense numeric similarity matrix +#' @export +weighted_jaccard_dense_cpp <- function(x, transpose = FALSE) { + .Call(`_natcpp_weighted_jaccard_dense_cpp`, x, transpose) +} + diff --git a/man/weighted_jaccard_dense_cpp.Rd b/man/weighted_jaccard_dense_cpp.Rd new file mode 100644 index 0000000..5269d34 --- /dev/null +++ b/man/weighted_jaccard_dense_cpp.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{weighted_jaccard_dense_cpp} +\alias{weighted_jaccard_dense_cpp} +\title{Dense weighted Jaccard similarity via C++} +\usage{ +weighted_jaccard_dense_cpp(x, transpose = FALSE) +} +\arguments{ +\item{x}{A dgCMatrix (sparse column-compressed matrix)} + +\item{transpose}{If \code{FALSE}, compare columns; if \code{TRUE}, compare +rows} +} +\value{ +A dense numeric similarity matrix +} +\description{ +Compute the full weighted Jaccard similarity matrix for a dgCMatrix using +an adaptive dense accumulation strategy. For small output matrices, uses a +feature-oriented loop; for large outputs, switches to a column-oriented +loop for better cache performance. Intended for internal use by +\code{coconat::jaccard_sim}. +} diff --git a/man/weighted_jaccard_sparse_fill_cpp.Rd b/man/weighted_jaccard_sparse_fill_cpp.Rd new file mode 100644 index 0000000..00b597e --- /dev/null +++ b/man/weighted_jaccard_sparse_fill_cpp.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{weighted_jaccard_sparse_fill_cpp} +\alias{weighted_jaccard_sparse_fill_cpp} +\title{Fill min_sums into a sparse pattern for weighted Jaccard similarity} +\usage{ +weighted_jaccard_sparse_fill_cpp(x, pattern, transpose = FALSE) +} +\arguments{ +\item{x}{A dgCMatrix (sparse column-compressed matrix)} + +\item{pattern}{A dgCMatrix giving the output sparsity pattern} + +\item{transpose}{If \code{FALSE}, compare columns; if \code{TRUE}, compare +rows} +} +\value{ +A numeric vector of min_sums aligned with the pattern's slot + structure +} +\description{ +Given a dgCMatrix \code{x} and a sparsity pattern \code{pattern} (from +\code{crossprod(binarise(x))}), compute the sum of element-wise minima +for each pair of columns (or rows) that co-occur in at least one feature. +Intended for internal use by \code{coconat::jaccard_sim}. +} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 3415f85..6db9ee8 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -135,6 +135,31 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// weighted_jaccard_sparse_fill_cpp +NumericVector weighted_jaccard_sparse_fill_cpp(const S4& x, const S4& pattern, bool transpose); +RcppExport SEXP _natcpp_weighted_jaccard_sparse_fill_cpp(SEXP xSEXP, SEXP patternSEXP, SEXP transposeSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const S4& >::type x(xSEXP); + Rcpp::traits::input_parameter< const S4& >::type pattern(patternSEXP); + Rcpp::traits::input_parameter< bool >::type transpose(transposeSEXP); + rcpp_result_gen = Rcpp::wrap(weighted_jaccard_sparse_fill_cpp(x, pattern, transpose)); + return rcpp_result_gen; +END_RCPP +} +// weighted_jaccard_dense_cpp +NumericMatrix weighted_jaccard_dense_cpp(const S4& x, bool transpose); +RcppExport SEXP _natcpp_weighted_jaccard_dense_cpp(SEXP xSEXP, SEXP transposeSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const S4& >::type x(xSEXP); + Rcpp::traits::input_parameter< bool >::type transpose(transposeSEXP); + rcpp_result_gen = Rcpp::wrap(weighted_jaccard_dense_cpp(x, transpose)); + return rcpp_result_gen; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_natcpp_c_seglengths", (DL_FUNC) &_natcpp_c_seglengths, 4}, @@ -147,6 +172,8 @@ static const R_CallMethodDef CallEntries[] = { {"_natcpp_c_topntail", (DL_FUNC) &_natcpp_c_topntail, 1}, {"_natcpp_c_topntail_list", (DL_FUNC) &_natcpp_c_topntail_list, 1}, {"_natcpp_c_EdgeListFromSegList", (DL_FUNC) &_natcpp_c_EdgeListFromSegList, 1}, + {"_natcpp_weighted_jaccard_sparse_fill_cpp", (DL_FUNC) &_natcpp_weighted_jaccard_sparse_fill_cpp, 3}, + {"_natcpp_weighted_jaccard_dense_cpp", (DL_FUNC) &_natcpp_weighted_jaccard_dense_cpp, 2}, {NULL, NULL, 0} }; diff --git a/src/weighted_jaccard.cpp b/src/weighted_jaccard.cpp new file mode 100644 index 0000000..58f4fae --- /dev/null +++ b/src/weighted_jaccard.cpp @@ -0,0 +1,291 @@ +#include +#include +#include + +using namespace Rcpp; + +//' Fill min_sums into a sparse pattern for weighted Jaccard similarity +//' +//' Given a dgCMatrix \code{x} and a sparsity pattern \code{pattern} (from +//' \code{crossprod(binarise(x))}), compute the sum of element-wise minima +//' for each pair of columns (or rows) that co-occur in at least one feature. +//' Intended for internal use by \code{coconat::jaccard_sim}. +//' +//' @param x A dgCMatrix (sparse column-compressed matrix) +//' @param pattern A dgCMatrix giving the output sparsity pattern +//' @param transpose If \code{FALSE}, compare columns; if \code{TRUE}, compare +//' rows +//' @return A numeric vector of min_sums aligned with the pattern's slot +//' structure +//' @export +// [[Rcpp::export]] +NumericVector weighted_jaccard_sparse_fill_cpp( + const S4& x, const S4& pattern, bool transpose = false) { + + // Original matrix slots (dgCMatrix: nr x nc) + IntegerVector dims = x.slot("Dim"); + IntegerVector xi = x.slot("i"); + IntegerVector xp = x.slot("p"); + NumericVector xx = x.slot("x"); + + const int nr = dims[0]; + const int nc = dims[1]; + const int nfeat = transpose ? nc : nr; + + // Pattern matrix slots (dgCMatrix: ncomp x ncomp) + IntegerVector Ai = pattern.slot("i"); + IntegerVector Ap = pattern.slot("p"); + IntegerVector Adims = pattern.slot("Dim"); + const int ncomp = Adims[0]; + const int annz = Ai.size(); + + // Output: min_sums values aligned with pattern's (i, p) structure + NumericVector out(annz, 0.0); + + // Build CSR for feature dimension: for each feature f, the items (columns + // or rows being compared) that are nonzero in that feature, with their values. + std::vector feat_count(nfeat, 0); + if (!transpose) { + for (int col = 0; col < nc; ++col) + for (int idx = xp[col]; idx < xp[col + 1]; ++idx) + feat_count[xi[idx]]++; + } else { + for (int col = 0; col < nc; ++col) + feat_count[col] = xp[col + 1] - xp[col]; + } + + std::vector feat_offsets(nfeat + 1, 0); + for (int f = 0; f < nfeat; ++f) + feat_offsets[f + 1] = feat_offsets[f] + feat_count[f]; + + const int total_nnz = feat_offsets[nfeat]; + std::vector feat_items(total_nnz); // which item (col/row being compared) + std::vector feat_vals(total_nnz); // value of x at that position + std::vector feat_fill(feat_offsets.begin(), feat_offsets.end()); + + if (!transpose) { + for (int col = 0; col < nc; ++col) { + for (int idx = xp[col]; idx < xp[col + 1]; ++idx) { + const int row = xi[idx]; + const int dest = feat_fill[row]++; + feat_items[dest] = col; + feat_vals[dest] = xx[idx]; + } + } + } else { + for (int col = 0; col < nc; ++col) { + for (int idx = xp[col]; idx < xp[col + 1]; ++idx) { + const int dest = feat_fill[col]++; + feat_items[dest] = xi[idx]; + feat_vals[dest] = xx[idx]; + } + } + } + + // Also need item->features CSR: for each compared item, which features + // it participates in and with what value. + std::vector item_count(ncomp, 0); + for (int f = 0; f < nfeat; ++f) { + for (int idx = feat_offsets[f]; idx < feat_offsets[f + 1]; ++idx) + item_count[feat_items[idx]]++; + } + + std::vector item_offsets(ncomp + 1, 0); + for (int c = 0; c < ncomp; ++c) + item_offsets[c + 1] = item_offsets[c] + item_count[c]; + + std::vector item_feats(total_nnz); // which feature + std::vector item_vals(total_nnz); // value + std::vector item_fill(item_offsets.begin(), item_offsets.end()); + + for (int f = 0; f < nfeat; ++f) { + for (int idx = feat_offsets[f]; idx < feat_offsets[f + 1]; ++idx) { + const int item = feat_items[idx]; + const int dest = item_fill[item]++; + item_feats[dest] = f; + item_vals[dest] = feat_vals[idx]; + } + } + + // row_to_pos[row] = position in out[] for entry (row, current_col). + // Populated once per output column, then cleared. + std::vector row_to_pos(ncomp, -1); + + // Iterate over output columns + for (int cb = 0; cb < ncomp; ++cb) { + // Populate row_to_pos for pattern column cb + for (int idx = Ap[cb]; idx < Ap[cb + 1]; ++idx) + row_to_pos[Ai[idx]] = idx; + + // For each feature f where cb is nonzero + for (int fi = item_offsets[cb]; fi < item_offsets[cb + 1]; ++fi) { + const int f = item_feats[fi]; + const double vb = item_vals[fi]; + + // For each other item ca also in this feature, accumulate min into (ca, cb) + for (int idx = feat_offsets[f]; idx < feat_offsets[f + 1]; ++idx) { + const int ca = feat_items[idx]; + const int pos = row_to_pos[ca]; + if (pos >= 0) + out[pos] += std::min(feat_vals[idx], vb); + } + } + + // Clear row_to_pos + for (int idx = Ap[cb]; idx < Ap[cb + 1]; ++idx) + row_to_pos[Ai[idx]] = -1; + } + + return out; +} + +//' Dense weighted Jaccard similarity via C++ +//' +//' Compute the full weighted Jaccard similarity matrix for a dgCMatrix using +//' an adaptive dense accumulation strategy. For small output matrices, uses a +//' feature-oriented loop; for large outputs, switches to a column-oriented +//' loop for better cache performance. Intended for internal use by +//' \code{coconat::jaccard_sim}. +//' +//' @param x A dgCMatrix (sparse column-compressed matrix) +//' @param transpose If \code{FALSE}, compare columns; if \code{TRUE}, compare +//' rows +//' @return A dense numeric similarity matrix +//' @export +// [[Rcpp::export]] +NumericMatrix weighted_jaccard_dense_cpp(const S4& x, bool transpose = false) { + IntegerVector dims = x.slot("Dim"); + IntegerVector xi = x.slot("i"); + IntegerVector xp = x.slot("p"); + NumericVector xx = x.slot("x"); + + const int nr = dims[0]; + const int nc = dims[1]; + const int ncomp = transpose ? nr : nc; + const int nfeat = transpose ? nc : nr; + + // Totals per compared item + std::vector totals(ncomp, 0.0); + + // Build feature CSR: for each feature f, which items are nonzero + std::vector feat_count(nfeat, 0); + if (!transpose) { + for (int col = 0; col < nc; ++col) { + for (int idx = xp[col]; idx < xp[col + 1]; ++idx) { + totals[col] += xx[idx]; + feat_count[xi[idx]]++; + } + } + } else { + for (int col = 0; col < nc; ++col) { + feat_count[col] = xp[col + 1] - xp[col]; + for (int idx = xp[col]; idx < xp[col + 1]; ++idx) { + totals[xi[idx]] += xx[idx]; + } + } + } + + std::vector feat_offsets(nfeat + 1, 0); + for (int f = 0; f < nfeat; ++f) + feat_offsets[f + 1] = feat_offsets[f] + feat_count[f]; + + const int total_nnz = feat_offsets[nfeat]; + std::vector feat_items(total_nnz); + std::vector feat_vals(total_nnz); + std::vector feat_fill(feat_offsets.begin(), feat_offsets.end()); + + if (!transpose) { + for (int col = 0; col < nc; ++col) { + for (int idx = xp[col]; idx < xp[col + 1]; ++idx) { + const int row = xi[idx]; + const int dest = feat_fill[row]++; + feat_items[dest] = col; + feat_vals[dest] = xx[idx]; + } + } + } else { + for (int col = 0; col < nc; ++col) { + for (int idx = xp[col]; idx < xp[col + 1]; ++idx) { + const int dest = feat_fill[col]++; + feat_items[dest] = xi[idx]; + feat_vals[dest] = xx[idx]; + } + } + } + + NumericMatrix out(ncomp, ncomp); + + // Threshold: ncomp^2 * 8 bytes > ~12MB (typical L3 share per core) + const bool use_colwise = (static_cast(ncomp) * ncomp * 8 > 12 * 1024 * 1024); + + if (use_colwise) { + // Column-oriented: build item->features CSR, iterate output columns + std::vector item_count(ncomp, 0); + for (int f = 0; f < nfeat; ++f) + for (int idx = feat_offsets[f]; idx < feat_offsets[f + 1]; ++idx) + item_count[feat_items[idx]]++; + + std::vector item_offsets(ncomp + 1, 0); + for (int c = 0; c < ncomp; ++c) + item_offsets[c + 1] = item_offsets[c] + item_count[c]; + + std::vector item_feats(total_nnz); + std::vector item_vals(total_nnz); + std::vector item_fill(item_offsets.begin(), item_offsets.end()); + + for (int f = 0; f < nfeat; ++f) { + for (int idx = feat_offsets[f]; idx < feat_offsets[f + 1]; ++idx) { + const int item = feat_items[idx]; + const int dest = item_fill[item]++; + item_feats[dest] = f; + item_vals[dest] = feat_vals[idx]; + } + } + + for (int cb = 0; cb < ncomp; ++cb) { + double* col = &out[static_cast(cb) * ncomp]; + for (int fi = item_offsets[cb]; fi < item_offsets[cb + 1]; ++fi) { + const int f = item_feats[fi]; + const double vb = item_vals[fi]; + for (int idx = feat_offsets[f]; idx < feat_offsets[f + 1]; ++idx) { + col[feat_items[idx]] += std::min(feat_vals[idx], vb); + } + } + } + } else { + // Feature-oriented: iterate features, scatter into output + for (int f = 0; f < nfeat; ++f) { + const int start = feat_offsets[f]; + const int end = feat_offsets[f + 1]; + const int k = end - start; + + for (int a = 0; a < k; ++a) { + const int ca = feat_items[start + a]; + const double va = feat_vals[start + a]; + out[static_cast(ca) * ncomp + ca] += va; + for (int b = a + 1; b < k; ++b) { + const int cb = feat_items[start + b]; + const double mv = std::min(va, feat_vals[start + b]); + out[static_cast(ca) * ncomp + cb] += mv; + out[static_cast(cb) * ncomp + ca] += mv; + } + } + } + } + + // Convert min_sums to similarity + for (int cb = 0; cb < ncomp; ++cb) { + for (int ca = 0; ca < ncomp; ++ca) { + if (ca == cb) { + out[static_cast(cb) * ncomp + ca] = 1.0; + } else { + const std::size_t pos = static_cast(cb) * ncomp + ca; + const double ms = out[pos]; + const double denom = totals[ca] + totals[cb] - ms; + out[pos] = (denom > 0.0) ? ms / denom : 0.0; + } + } + } + + return out; +} diff --git a/tests/testthat/test-weighted-jaccard.R b/tests/testthat/test-weighted-jaccard.R new file mode 100644 index 0000000..207ab32 --- /dev/null +++ b/tests/testthat/test-weighted-jaccard.R @@ -0,0 +1,82 @@ +test_that("weighted_jaccard_dense_cpp computes correct similarity", { + skip_if_not_installed("Matrix") + m <- Matrix::sparseMatrix( + i = c(1L, 2L, 1L, 2L, 3L, 3L), + j = c(1L, 1L, 2L, 2L, 2L, 3L), + x = c(4, 2, 1, 3, 3, 1), + dims = c(3L, 3L) + ) + + # Naive reference: sim[a,b] = sum(pmin(col_a, col_b)) / sum(pmax(col_a, col_b)) + dm <- as.matrix(m) + n <- ncol(dm) + ref <- matrix(0, n, n) + for (a in seq_len(n)) { + for (b in seq_len(n)) { + mins <- sum(pmin(dm[, a], dm[, b])) + maxs <- sum(pmax(dm[, a], dm[, b])) + ref[a, b] <- if (maxs > 0) mins / maxs else 0 + } + } + diag(ref) <- 1 + + sim <- weighted_jaccard_dense_cpp(m, transpose = FALSE) + expect_equal(sim, ref, tolerance = 1e-12) +}) + +test_that("weighted_jaccard_dense_cpp transpose works", { + skip_if_not_installed("Matrix") + m <- Matrix::sparseMatrix( + i = c(1L, 2L, 1L, 2L, 3L, 3L), + j = c(1L, 1L, 2L, 2L, 2L, 3L), + x = c(4, 2, 1, 3, 3, 1), + dims = c(3L, 3L) + ) + + dm <- as.matrix(m) + nr <- nrow(dm) + ref_t <- matrix(0, nr, nr) + for (a in seq_len(nr)) { + for (b in seq_len(nr)) { + mins <- sum(pmin(dm[a, ], dm[b, ])) + maxs <- sum(pmax(dm[a, ], dm[b, ])) + ref_t[a, b] <- if (maxs > 0) mins / maxs else 0 + } + } + diag(ref_t) <- 1 + + sim_t <- weighted_jaccard_dense_cpp(m, transpose = TRUE) + expect_equal(sim_t, ref_t, tolerance = 1e-12) +}) + +test_that("weighted_jaccard_sparse_fill_cpp matches dense", { + skip_if_not_installed("Matrix") + m <- Matrix::sparseMatrix( + i = c(1L, 2L, 1L, 2L, 3L, 3L), + j = c(1L, 1L, 2L, 2L, 2L, 3L), + x = c(4, 2, 1, 3, 3, 1), + dims = c(3L, 3L) + ) + + # Dense reference + sim_dense <- weighted_jaccard_dense_cpp(m, transpose = FALSE) + + # Sparse fill: build pattern from binarised crossprod + b <- m + b@x <- rep(1, length(b@x)) + A <- methods::as(Matrix::crossprod(b), "generalMatrix") + totals <- Matrix::colSums(m) + + A@x <- weighted_jaccard_sparse_fill_cpp(m, A, transpose = FALSE) + + # Convert min_sums to similarity + col_idx <- rep(seq_along(diff(A@p)), diff(A@p)) + row_idx <- A@i + 1L + denom <- totals[row_idx] + totals[col_idx] - A@x + nonzero <- denom != 0 + A@x[nonzero] <- A@x[nonzero] / denom[nonzero] + A@x[!nonzero] <- 0 + Matrix::diag(A) <- 1 + + expect_equal(as.matrix(A), sim_dense, tolerance = 1e-12) +}) From cc7e20f21c75dcb1a9dd2025d5a7b867c6851d21 Mon Sep 17 00:00:00 2001 From: Gregory Jefferis Date: Sat, 4 Apr 2026 09:25:36 +0100 Subject: [PATCH 2/3] suggest Matrix package --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 71d25a7..3d39b65 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,6 +21,7 @@ BugReports: https://github.com/natverse/natcpp/issues Imports: Rcpp (>= 1.0.6) Suggests: + Matrix, spelling, testthat (>= 3.0.0) LinkingTo: From 1895e525871b987a35f89a4290e438ba88dcd7db Mon Sep 17 00:00:00 2001 From: Gregory Jefferis Date: Sat, 4 Apr 2026 16:03:30 +0100 Subject: [PATCH 3/3] make c_weighted_jaccard_sparse a full implementation * this mean that it can be tested for equivalence * and rename both functions --- DESCRIPTION | 6 +-- NAMESPACE | 4 +- R/RcppExports.R | 25 +++------ R/weighted_jaccard.R | 51 +++++++++++++++++++ ...nse_cpp.Rd => c_weighted_jaccard_dense.Rd} | 8 +-- man/c_weighted_jaccard_sparse.Rd | 34 +++++++++++++ man/weighted_jaccard_sparse_fill_cpp.Rd | 26 ---------- src/RcppExports.cpp | 20 ++++---- src/weighted_jaccard.cpp | 21 ++------ tests/testthat/test-weighted-jaccard.R | 39 ++++---------- 10 files changed, 125 insertions(+), 109 deletions(-) create mode 100644 R/weighted_jaccard.R rename man/{weighted_jaccard_dense_cpp.Rd => c_weighted_jaccard_dense.Rd} (77%) create mode 100644 man/c_weighted_jaccard_sparse.Rd delete mode 100644 man/weighted_jaccard_sparse_fill_cpp.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 3d39b65..10934b6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,10 +18,10 @@ Description: Fast functions implemented in C++ via 'Rcpp' to support the License: GPL (>= 3) URL: https://github.com/natverse/natcpp, https://natverse.org/natcpp/ BugReports: https://github.com/natverse/natcpp/issues -Imports: - Rcpp (>= 1.0.6) -Suggests: +Imports: Matrix, + Rcpp (>= 1.0.6) +Suggests: spelling, testthat (>= 3.0.0) LinkingTo: diff --git a/NAMESPACE b/NAMESPACE index f614ccb..355a815 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,7 +10,7 @@ export(c_sub2ind) export(c_topntail) export(c_topntail_list) export(c_total_cable) -export(weighted_jaccard_dense_cpp) -export(weighted_jaccard_sparse_fill_cpp) +export(c_weighted_jaccard_dense) +export(c_weighted_jaccard_sparse) import(Rcpp) useDynLib(natcpp, .registration = TRUE) diff --git a/R/RcppExports.R b/R/RcppExports.R index 90fcc19..6474e92 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -135,22 +135,9 @@ c_EdgeListFromSegList <- function(L) { .Call(`_natcpp_c_EdgeListFromSegList`, L) } -#' Fill min_sums into a sparse pattern for weighted Jaccard similarity -#' -#' Given a dgCMatrix \code{x} and a sparsity pattern \code{pattern} (from -#' \code{crossprod(binarise(x))}), compute the sum of element-wise minima -#' for each pair of columns (or rows) that co-occur in at least one feature. -#' Intended for internal use by \code{coconat::jaccard_sim}. -#' -#' @param x A dgCMatrix (sparse column-compressed matrix) -#' @param pattern A dgCMatrix giving the output sparsity pattern -#' @param transpose If \code{FALSE}, compare columns; if \code{TRUE}, compare -#' rows -#' @return A numeric vector of min_sums aligned with the pattern's slot -#' structure -#' @export -weighted_jaccard_sparse_fill_cpp <- function(x, pattern, transpose = FALSE) { - .Call(`_natcpp_weighted_jaccard_sparse_fill_cpp`, x, pattern, transpose) +#' @noRd +weighted_jaccard_sparse_fill <- function(x, pattern, transpose = FALSE) { + .Call(`_natcpp_weighted_jaccard_sparse_fill`, x, pattern, transpose) } #' Dense weighted Jaccard similarity via C++ @@ -159,14 +146,14 @@ weighted_jaccard_sparse_fill_cpp <- function(x, pattern, transpose = FALSE) { #' an adaptive dense accumulation strategy. For small output matrices, uses a #' feature-oriented loop; for large outputs, switches to a column-oriented #' loop for better cache performance. Intended for internal use by -#' \code{coconat::jaccard_sim}. +#' \code{coconat::jaccard_sim}. See also \code{\link{c_weighted_jaccard_sparse}}. #' #' @param x A dgCMatrix (sparse column-compressed matrix) #' @param transpose If \code{FALSE}, compare columns; if \code{TRUE}, compare #' rows #' @return A dense numeric similarity matrix #' @export -weighted_jaccard_dense_cpp <- function(x, transpose = FALSE) { - .Call(`_natcpp_weighted_jaccard_dense_cpp`, x, transpose) +c_weighted_jaccard_dense <- function(x, transpose = FALSE) { + .Call(`_natcpp_c_weighted_jaccard_dense`, x, transpose) } diff --git a/R/weighted_jaccard.R b/R/weighted_jaccard.R new file mode 100644 index 0000000..eb4eb35 --- /dev/null +++ b/R/weighted_jaccard.R @@ -0,0 +1,51 @@ +#' Sparse weighted Jaccard similarity via C++ +#' +#' Compute the weighted Jaccard similarity matrix for a dgCMatrix, returning a +#' sparse result. Uses \code{weighted_jaccard_sparse_fill} to compute +#' min-sums only for column (or row) pairs that share at least one non-zero +#' feature, then normalises to similarity. +#' +#' @param x A dgCMatrix (sparse column-compressed matrix) +#' @param transpose If \code{FALSE} (default), compare columns; if +#' \code{TRUE}, compare rows. +#' @return A sparse dgCMatrix similarity matrix +#' @export +#' @seealso \code{\link{c_weighted_jaccard_dense}} for the dense equivalent +#' @examples +#' \dontrun{ +#' library(Matrix) +#' m <- sparseMatrix(i = c(1,2,1,2,3,3), j = c(1,1,2,2,2,3), +#' x = c(4,2,1,3,3,1), dims = c(3,3)) +#' c_weighted_jaccard_sparse(m) +#' } +c_weighted_jaccard_sparse <- function(x, transpose = FALSE) { + crossfun <- if (transpose) Matrix::tcrossprod else Matrix::crossprod + n <- if (transpose) nrow(x) else ncol(x) + + if (length(x@x) == 0L) { + return(Matrix::sparseMatrix(i = seq_len(n), j = seq_len(n), x = 1, + dims = c(n, n))) + } + + # Sparsity pattern from binarised crossprod + b <- x + b@x <- rep(1, length(b@x)) + A <- as(crossfun(b), "generalMatrix") + + # Column/row totals for denominator + totals <- if (transpose) Matrix::rowSums(x) else Matrix::colSums(x) + + # Fill in min_sums using C++ + A@x <- weighted_jaccard_sparse_fill(x, A, transpose = transpose) + + # Convert min_sums to similarity: sim = ms / (s_i + s_j - ms) + col_idx <- rep(seq_along(diff(A@p)), diff(A@p)) + row_idx <- A@i + 1L + denom <- totals[row_idx] + totals[col_idx] - A@x + nonzero <- denom != 0 + A@x[nonzero] <- A@x[nonzero] / denom[nonzero] + A@x[!nonzero] <- 0 + Matrix::diag(A) <- 1 + + A +} diff --git a/man/weighted_jaccard_dense_cpp.Rd b/man/c_weighted_jaccard_dense.Rd similarity index 77% rename from man/weighted_jaccard_dense_cpp.Rd rename to man/c_weighted_jaccard_dense.Rd index 5269d34..d63a502 100644 --- a/man/weighted_jaccard_dense_cpp.Rd +++ b/man/c_weighted_jaccard_dense.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/RcppExports.R -\name{weighted_jaccard_dense_cpp} -\alias{weighted_jaccard_dense_cpp} +\name{c_weighted_jaccard_dense} +\alias{c_weighted_jaccard_dense} \title{Dense weighted Jaccard similarity via C++} \usage{ -weighted_jaccard_dense_cpp(x, transpose = FALSE) +c_weighted_jaccard_dense(x, transpose = FALSE) } \arguments{ \item{x}{A dgCMatrix (sparse column-compressed matrix)} @@ -20,5 +20,5 @@ Compute the full weighted Jaccard similarity matrix for a dgCMatrix using an adaptive dense accumulation strategy. For small output matrices, uses a feature-oriented loop; for large outputs, switches to a column-oriented loop for better cache performance. Intended for internal use by -\code{coconat::jaccard_sim}. +\code{coconat::jaccard_sim}. See also \code{\link{c_weighted_jaccard_sparse}}. } diff --git a/man/c_weighted_jaccard_sparse.Rd b/man/c_weighted_jaccard_sparse.Rd new file mode 100644 index 0000000..7f41e52 --- /dev/null +++ b/man/c_weighted_jaccard_sparse.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/weighted_jaccard.R +\name{c_weighted_jaccard_sparse} +\alias{c_weighted_jaccard_sparse} +\title{Sparse weighted Jaccard similarity via C++} +\usage{ +c_weighted_jaccard_sparse(x, transpose = FALSE) +} +\arguments{ +\item{x}{A dgCMatrix (sparse column-compressed matrix)} + +\item{transpose}{If \code{FALSE} (default), compare columns; if +\code{TRUE}, compare rows.} +} +\value{ +A sparse dgCMatrix similarity matrix +} +\description{ +Compute the weighted Jaccard similarity matrix for a dgCMatrix, returning a +sparse result. Uses \code{weighted_jaccard_sparse_fill} to compute +min-sums only for column (or row) pairs that share at least one non-zero +feature, then normalises to similarity. +} +\examples{ +\dontrun{ +library(Matrix) +m <- sparseMatrix(i = c(1,2,1,2,3,3), j = c(1,1,2,2,2,3), + x = c(4,2,1,3,3,1), dims = c(3,3)) +c_weighted_jaccard_sparse(m) +} +} +\seealso{ +\code{\link{c_weighted_jaccard_dense}} for the dense equivalent +} diff --git a/man/weighted_jaccard_sparse_fill_cpp.Rd b/man/weighted_jaccard_sparse_fill_cpp.Rd deleted file mode 100644 index 00b597e..0000000 --- a/man/weighted_jaccard_sparse_fill_cpp.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/RcppExports.R -\name{weighted_jaccard_sparse_fill_cpp} -\alias{weighted_jaccard_sparse_fill_cpp} -\title{Fill min_sums into a sparse pattern for weighted Jaccard similarity} -\usage{ -weighted_jaccard_sparse_fill_cpp(x, pattern, transpose = FALSE) -} -\arguments{ -\item{x}{A dgCMatrix (sparse column-compressed matrix)} - -\item{pattern}{A dgCMatrix giving the output sparsity pattern} - -\item{transpose}{If \code{FALSE}, compare columns; if \code{TRUE}, compare -rows} -} -\value{ -A numeric vector of min_sums aligned with the pattern's slot - structure -} -\description{ -Given a dgCMatrix \code{x} and a sparsity pattern \code{pattern} (from -\code{crossprod(binarise(x))}), compute the sum of element-wise minima -for each pair of columns (or rows) that co-occur in at least one feature. -Intended for internal use by \code{coconat::jaccard_sim}. -} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 6db9ee8..a0e9c71 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -135,28 +135,28 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// weighted_jaccard_sparse_fill_cpp -NumericVector weighted_jaccard_sparse_fill_cpp(const S4& x, const S4& pattern, bool transpose); -RcppExport SEXP _natcpp_weighted_jaccard_sparse_fill_cpp(SEXP xSEXP, SEXP patternSEXP, SEXP transposeSEXP) { +// weighted_jaccard_sparse_fill +NumericVector weighted_jaccard_sparse_fill(const S4& x, const S4& pattern, bool transpose); +RcppExport SEXP _natcpp_weighted_jaccard_sparse_fill(SEXP xSEXP, SEXP patternSEXP, SEXP transposeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const S4& >::type x(xSEXP); Rcpp::traits::input_parameter< const S4& >::type pattern(patternSEXP); Rcpp::traits::input_parameter< bool >::type transpose(transposeSEXP); - rcpp_result_gen = Rcpp::wrap(weighted_jaccard_sparse_fill_cpp(x, pattern, transpose)); + rcpp_result_gen = Rcpp::wrap(weighted_jaccard_sparse_fill(x, pattern, transpose)); return rcpp_result_gen; END_RCPP } -// weighted_jaccard_dense_cpp -NumericMatrix weighted_jaccard_dense_cpp(const S4& x, bool transpose); -RcppExport SEXP _natcpp_weighted_jaccard_dense_cpp(SEXP xSEXP, SEXP transposeSEXP) { +// c_weighted_jaccard_dense +NumericMatrix c_weighted_jaccard_dense(const S4& x, bool transpose); +RcppExport SEXP _natcpp_c_weighted_jaccard_dense(SEXP xSEXP, SEXP transposeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const S4& >::type x(xSEXP); Rcpp::traits::input_parameter< bool >::type transpose(transposeSEXP); - rcpp_result_gen = Rcpp::wrap(weighted_jaccard_dense_cpp(x, transpose)); + rcpp_result_gen = Rcpp::wrap(c_weighted_jaccard_dense(x, transpose)); return rcpp_result_gen; END_RCPP } @@ -172,8 +172,8 @@ static const R_CallMethodDef CallEntries[] = { {"_natcpp_c_topntail", (DL_FUNC) &_natcpp_c_topntail, 1}, {"_natcpp_c_topntail_list", (DL_FUNC) &_natcpp_c_topntail_list, 1}, {"_natcpp_c_EdgeListFromSegList", (DL_FUNC) &_natcpp_c_EdgeListFromSegList, 1}, - {"_natcpp_weighted_jaccard_sparse_fill_cpp", (DL_FUNC) &_natcpp_weighted_jaccard_sparse_fill_cpp, 3}, - {"_natcpp_weighted_jaccard_dense_cpp", (DL_FUNC) &_natcpp_weighted_jaccard_dense_cpp, 2}, + {"_natcpp_weighted_jaccard_sparse_fill", (DL_FUNC) &_natcpp_weighted_jaccard_sparse_fill, 3}, + {"_natcpp_c_weighted_jaccard_dense", (DL_FUNC) &_natcpp_c_weighted_jaccard_dense, 2}, {NULL, NULL, 0} }; diff --git a/src/weighted_jaccard.cpp b/src/weighted_jaccard.cpp index 58f4fae..297bd32 100644 --- a/src/weighted_jaccard.cpp +++ b/src/weighted_jaccard.cpp @@ -4,22 +4,9 @@ using namespace Rcpp; -//' Fill min_sums into a sparse pattern for weighted Jaccard similarity -//' -//' Given a dgCMatrix \code{x} and a sparsity pattern \code{pattern} (from -//' \code{crossprod(binarise(x))}), compute the sum of element-wise minima -//' for each pair of columns (or rows) that co-occur in at least one feature. -//' Intended for internal use by \code{coconat::jaccard_sim}. -//' -//' @param x A dgCMatrix (sparse column-compressed matrix) -//' @param pattern A dgCMatrix giving the output sparsity pattern -//' @param transpose If \code{FALSE}, compare columns; if \code{TRUE}, compare -//' rows -//' @return A numeric vector of min_sums aligned with the pattern's slot -//' structure -//' @export +//' @noRd // [[Rcpp::export]] -NumericVector weighted_jaccard_sparse_fill_cpp( +NumericVector weighted_jaccard_sparse_fill( const S4& x, const S4& pattern, bool transpose = false) { // Original matrix slots (dgCMatrix: nr x nc) @@ -145,7 +132,7 @@ NumericVector weighted_jaccard_sparse_fill_cpp( //' an adaptive dense accumulation strategy. For small output matrices, uses a //' feature-oriented loop; for large outputs, switches to a column-oriented //' loop for better cache performance. Intended for internal use by -//' \code{coconat::jaccard_sim}. +//' \code{coconat::jaccard_sim}. See also \code{\link{c_weighted_jaccard_sparse}}. //' //' @param x A dgCMatrix (sparse column-compressed matrix) //' @param transpose If \code{FALSE}, compare columns; if \code{TRUE}, compare @@ -153,7 +140,7 @@ NumericVector weighted_jaccard_sparse_fill_cpp( //' @return A dense numeric similarity matrix //' @export // [[Rcpp::export]] -NumericMatrix weighted_jaccard_dense_cpp(const S4& x, bool transpose = false) { +NumericMatrix c_weighted_jaccard_dense(const S4& x, bool transpose = false) { IntegerVector dims = x.slot("Dim"); IntegerVector xi = x.slot("i"); IntegerVector xp = x.slot("p"); diff --git a/tests/testthat/test-weighted-jaccard.R b/tests/testthat/test-weighted-jaccard.R index 207ab32..e1c1f11 100644 --- a/tests/testthat/test-weighted-jaccard.R +++ b/tests/testthat/test-weighted-jaccard.R @@ -1,5 +1,4 @@ -test_that("weighted_jaccard_dense_cpp computes correct similarity", { - skip_if_not_installed("Matrix") +test_that("c_weighted_jaccard_dense computes correct similarity", { m <- Matrix::sparseMatrix( i = c(1L, 2L, 1L, 2L, 3L, 3L), j = c(1L, 1L, 2L, 2L, 2L, 3L), @@ -20,12 +19,11 @@ test_that("weighted_jaccard_dense_cpp computes correct similarity", { } diag(ref) <- 1 - sim <- weighted_jaccard_dense_cpp(m, transpose = FALSE) + sim <- c_weighted_jaccard_dense(m, transpose = FALSE) expect_equal(sim, ref, tolerance = 1e-12) }) -test_that("weighted_jaccard_dense_cpp transpose works", { - skip_if_not_installed("Matrix") +test_that("c_weighted_jaccard_dense transpose works", { m <- Matrix::sparseMatrix( i = c(1L, 2L, 1L, 2L, 3L, 3L), j = c(1L, 1L, 2L, 2L, 2L, 3L), @@ -45,12 +43,11 @@ test_that("weighted_jaccard_dense_cpp transpose works", { } diag(ref_t) <- 1 - sim_t <- weighted_jaccard_dense_cpp(m, transpose = TRUE) + sim_t <- c_weighted_jaccard_dense(m, transpose = TRUE) expect_equal(sim_t, ref_t, tolerance = 1e-12) }) -test_that("weighted_jaccard_sparse_fill_cpp matches dense", { - skip_if_not_installed("Matrix") +test_that("c_weighted_jaccard_sparse matches dense", { m <- Matrix::sparseMatrix( i = c(1L, 2L, 1L, 2L, 3L, 3L), j = c(1L, 1L, 2L, 2L, 2L, 3L), @@ -58,25 +55,11 @@ test_that("weighted_jaccard_sparse_fill_cpp matches dense", { dims = c(3L, 3L) ) - # Dense reference - sim_dense <- weighted_jaccard_dense_cpp(m, transpose = FALSE) + sim_dense <- c_weighted_jaccard_dense(m, transpose = FALSE) + sim_sparse <- c_weighted_jaccard_sparse(m, transpose = FALSE) + expect_equal(as.matrix(sim_sparse), sim_dense, tolerance = 1e-12) - # Sparse fill: build pattern from binarised crossprod - b <- m - b@x <- rep(1, length(b@x)) - A <- methods::as(Matrix::crossprod(b), "generalMatrix") - totals <- Matrix::colSums(m) - - A@x <- weighted_jaccard_sparse_fill_cpp(m, A, transpose = FALSE) - - # Convert min_sums to similarity - col_idx <- rep(seq_along(diff(A@p)), diff(A@p)) - row_idx <- A@i + 1L - denom <- totals[row_idx] + totals[col_idx] - A@x - nonzero <- denom != 0 - A@x[nonzero] <- A@x[nonzero] / denom[nonzero] - A@x[!nonzero] <- 0 - Matrix::diag(A) <- 1 - - expect_equal(as.matrix(A), sim_dense, tolerance = 1e-12) + sim_dense_t <- c_weighted_jaccard_dense(m, transpose = TRUE) + sim_sparse_t <- c_weighted_jaccard_sparse(m, transpose = TRUE) + expect_equal(as.matrix(sim_sparse_t), sim_dense_t, tolerance = 1e-12) })