Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Instead of grepping 50 files and sending 30,000 tokens to Claude, VecGrep return
## How it works

1. **Chunk** — Parses source files with tree-sitter to extract semantic units (functions, classes, methods)
2. **Embed** — Encodes each chunk locally using [`all-MiniLM-L6-v2-code-search-512`](https://huggingface.co/isuruwijesiri/all-MiniLM-L6-v2-code-search-512) (384-dim, ~80MB one-time download), automatically using Metal (Apple Silicon), CUDA (NVIDIA), or CPU
2. **Embed** — Encodes each chunk locally using [`all-MiniLM-L6-v2-code-search-512`](https://huggingface.co/isuruwijesiri/all-MiniLM-L6-v2-code-search-512) (384-dim, ~80MB one-time download) via the fastembed ONNX backend (~100ms startup) or PyTorch, automatically using Metal (Apple Silicon), CUDA (NVIDIA), or CPU
3. **Store** — Saves embeddings + metadata in LanceDB under `~/.vecgrep/<project_hash>/`
4. **Search** — ANN index (IVF-PQ) for fast approximate search on large codebases

Expand Down Expand Up @@ -106,6 +106,32 @@ Index status for: /path/to/myproject
Index size: 28.4 MB
```

## Configuration

VecGrep can be tuned via environment variables:

| Variable | Default | Description |
|---|---|---|
| `VECGREP_BACKEND` | `onnx` | Embedding backend: `onnx` (fastembed, fast startup) or `torch` (sentence-transformers, any HF model) |
| `VECGREP_MODEL` | `isuruwijesiri/all-MiniLM-L6-v2-code-search-512` | HuggingFace model ID to use for embeddings |

**Backend comparison:**

| Backend | Startup | PyTorch required | Custom HF models |
|---|---|---|---|
| `onnx` (default) | ~100ms | No | ONNX-exported models only |
| `torch` | ~2–3s | Yes | Any HuggingFace model |

**Examples:**

```bash
# Use a different model with the torch backend
VECGREP_BACKEND=torch VECGREP_MODEL=sentence-transformers/all-MiniLM-L6-v2 vecgrep

# Use a custom ONNX model
VECGREP_MODEL=my-org/my-onnx-model vecgrep
```

## Supported languages

Python, JavaScript/TypeScript, Rust, Go, Java, C/C++, Ruby, Swift, Kotlin, C#
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ classifiers = [
requires-python = ">=3.12,<3.13"
dependencies = [
"mcp[cli]>=1.0,<2.0",
"fastembed>=0.4.0",
"sentence-transformers>=3.0,<4.0",
"tree-sitter-languages>=1.10,<2.0",
"numpy>=1.26",
Expand Down
121 changes: 89 additions & 32 deletions src/vecgrep/embedder.py
Original file line number Diff line number Diff line change
@@ -1,65 +1,122 @@
"""Local embedding using sentence-transformers."""
"""Local embedding with auto-detected backend (fastembed ONNX or sentence-transformers)."""

from __future__ import annotations

import numpy as np
from sentence_transformers import SentenceTransformer # type: ignore
import os

try:
import torch
import numpy as np

HAS_TORCH = True
except ImportError:
HAS_TORCH = False
# ---------------------------------------------------------------------------
# Configuration — override via environment variables
# ---------------------------------------------------------------------------

# Embedding model: fine-tuned for semantic code search by @isuruwijesiri
# Citation:
# Model to use. Default is the fine-tuned code search model.
# Set VECGREP_MODEL=sentence-transformers/all-MiniLM-L6-v2 (or any HF model)
# to use a different model.
#
# Citation for default model:
# isuruwijesiri. (2026). all-MiniLM-L6-v2-code-search-512 [Model].
# Hugging Face. https://huggingface.co/isuruwijesiri/all-MiniLM-L6-v2-code-search-512
MODEL_NAME = "isuruwijesiri/all-MiniLM-L6-v2-code-search-512"
DEFAULT_MODEL = "isuruwijesiri/all-MiniLM-L6-v2-code-search-512"
MODEL_NAME = os.environ.get("VECGREP_MODEL", DEFAULT_MODEL)

# Batch sizes tuned per device — GPU/MPS can saturate with larger batches
_BATCH_SIZE: dict[str, int] = {
# Backend to use for embedding.
# - "onnx" (default) — fastembed + ONNX Runtime, ~100ms startup, no PyTorch needed
# - "torch" — sentence-transformers + PyTorch, ~2-3s startup, supports any HF model
BACKEND = os.environ.get("VECGREP_BACKEND", "onnx").lower()

# Batch sizes tuned per device for the torch backend
_TORCH_BATCH_SIZE: dict[str, int] = {
"cuda": 256,
"mps": 256,
"cpu": 64,
}

_model = None
_device: str | None = None
# ---------------------------------------------------------------------------
# ONNX backend (fastembed) — default
# ---------------------------------------------------------------------------

_onnx_model = None


def _get_onnx_model():
    """Lazily construct and cache the fastembed ``TextEmbedding`` instance."""
    global _onnx_model
    if _onnx_model is not None:
        return _onnx_model

    # Imported here so the torch backend never pays the fastembed import cost.
    from fastembed import TextEmbedding  # type: ignore
    from fastembed.common.model_description import ModelSource, PoolingType  # type: ignore

    # fastembed only knows its built-in model catalogue, so the default model
    # is registered as a custom entry pointing at its ONNX export on
    # HuggingFace before the embedding object is created.
    if MODEL_NAME == DEFAULT_MODEL:
        TextEmbedding.add_custom_model(
            model=DEFAULT_MODEL,
            pooling=PoolingType.MEAN,
            normalization=True,
            sources=ModelSource(hf=DEFAULT_MODEL),
            dim=384,
            model_file="onnx/model.onnx",
            description="Fine-tuned MiniLM for semantic code search",
        )

    _onnx_model = TextEmbedding(MODEL_NAME)
    return _onnx_model


# ---------------------------------------------------------------------------
# Torch backend (sentence-transformers) — opt-in via VECGREP_BACKEND=torch
# ---------------------------------------------------------------------------

_torch_model = None
_torch_device: str | None = None


def _detect_device() -> str:
"""Return the best available compute device: cuda > mps > cpu."""
if HAS_TORCH:
try:
import torch

if torch.cuda.is_available():
return "cuda"
if torch.backends.mps.is_available():
return "mps"
except ImportError:
pass
return "cpu"


def _get_model():
global _model, _device
if _model is None:
_device = _detect_device()
_model = SentenceTransformer(MODEL_NAME, device=_device)
return _model
def _get_torch_model():
    """Lazily create and cache the sentence-transformers model on the best device."""
    global _torch_model, _torch_device
    if _torch_model is not None:
        return _torch_model

    # Imported here so the default ONNX path never pulls in PyTorch.
    from sentence_transformers import SentenceTransformer  # type: ignore

    _torch_device = _detect_device()
    _torch_model = SentenceTransformer(MODEL_NAME, device=_torch_device)
    return _torch_model


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def embed(texts: list[str]) -> np.ndarray:
    """Embed a list of texts, returning a normalised float32 array of shape (N, 384).

    Dispatches on the module-level ``BACKEND`` setting:

    - ``"torch"``  — sentence-transformers with a device-tuned batch size.
    - any other value (default ``"onnx"``) — fastembed, which handles its
      own batching internally.

    All vectors are L2-normalised so cosine similarity reduces to a plain
    dot product downstream.
    """
    if not texts:
        # Preserve the (N, 384) contract even for empty input.
        return np.empty((0, 384), dtype=np.float32)

    if BACKEND == "torch":
        model = _get_torch_model()
        batch_size = _TORCH_BATCH_SIZE.get(_torch_device or "cpu", 64)
        vecs = model.encode(
            texts,
            batch_size=batch_size,
            show_progress_bar=False,
            convert_to_numpy=True,
        )
    else:
        model = _get_onnx_model()
        # fastembed yields vectors lazily; materialise into a 2-D array.
        vecs = np.array(list(model.embed(texts)))

    # Normalise for cosine similarity via dot product; guard zero-norm rows
    # so we never divide by zero.
    norms = np.linalg.norm(vecs, axis=1, keepdims=True)
    norms = np.where(norms == 0, 1.0, norms)
    return (vecs / norms).astype(np.float32)
31 changes: 21 additions & 10 deletions tests/test_embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,21 +43,32 @@ def test_same_input_produces_same_output(self):
np.testing.assert_array_equal(v1, v2)


class TestTorchBackend:
    # Exercises the opt-in sentence-transformers path by forcing the
    # module-level BACKEND setting to "torch"; requires torch and
    # sentence-transformers to be installed.

    def test_torch_backend_produces_correct_shape(self):
        with patch("vecgrep.embedder.BACKEND", "torch"):
            vecs = embed(["def foo(): pass"])
            # embed() contract: float32 array of shape (N, 384).
            assert vecs.shape == (1, 384)
            assert vecs.dtype == np.float32

    def test_torch_backend_vectors_are_unit_norm(self):
        with patch("vecgrep.embedder.BACKEND", "torch"):
            vecs = embed(["alpha", "beta"])
            # embed() L2-normalises its output, so every row has norm 1.
            norms = np.linalg.norm(vecs, axis=1)
            np.testing.assert_allclose(norms, 1.0, atol=1e-5)


class TestDetectDevice:
    # _detect_device() imports torch lazily inside the function, so the
    # availability probes are patched on the torch module itself rather
    # than on attributes of vecgrep.embedder.

    def test_returns_cuda_when_available(self):
        with patch("torch.cuda.is_available", return_value=True), \
             patch("torch.backends.mps.is_available", return_value=False):
            assert _detect_device() == "cuda"

    def test_returns_mps_when_cuda_unavailable(self):
        # CUDA takes priority, so MPS is only reported when CUDA is absent.
        with patch("torch.cuda.is_available", return_value=False), \
             patch("torch.backends.mps.is_available", return_value=True):
            assert _detect_device() == "mps"

    def test_returns_cpu_when_both_unavailable(self):
        with patch("torch.cuda.is_available", return_value=False), \
             patch("torch.backends.mps.is_available", return_value=False):
            assert _detect_device() == "cpu"
Loading