Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Instead of grepping 50 files and sending 30,000 tokens to Claude, VecGrep return
## How it works

1. **Chunk** — Parses source files with tree-sitter to extract semantic units (functions, classes, methods)
2. **Embed** — Encodes each chunk locally using [`all-MiniLM-L6-v2-code-search-512`](https://huggingface.co/isuruwijesiri/all-MiniLM-L6-v2-code-search-512) (384-dim, ~80MB one-time download), automatically using Metal (Apple Silicon), CUDA (NVIDIA), or CPU
2. **Embed** — Encodes each chunk locally using [`all-MiniLM-L6-v2-code-search-512`](https://huggingface.co/isuruwijesiri/all-MiniLM-L6-v2-code-search-512) (384-dim, ~80MB one-time download) via the fastembed ONNX backend (~100ms startup) or PyTorch, automatically using Metal (Apple Silicon), CUDA (NVIDIA), or CPU
3. **Store** — Saves embeddings + metadata in LanceDB under `~/.vecgrep/<project_hash>/`
4. **Search** — ANN index (IVF-PQ) for fast approximate search on large codebases

Expand Down Expand Up @@ -106,6 +106,32 @@ Index status for: /path/to/myproject
Index size: 28.4 MB
```

## Configuration

VecGrep can be tuned via environment variables:

| Variable | Default | Description |
|---|---|---|
| `VECGREP_BACKEND` | `onnx` | Embedding backend: `onnx` (fastembed, fast startup) or `torch` (sentence-transformers, any HF model) |
| `VECGREP_MODEL` | `isuruwijesiri/all-MiniLM-L6-v2-code-search-512` | HuggingFace model ID to use for embeddings |

**Backend comparison:**

| Backend | Startup | PyTorch required | Custom HF models |
|---|---|---|---|
| `onnx` (default) | ~100ms | No | ONNX-exported models only |
| `torch` | ~2–3s | Yes | Any HuggingFace model |

**Examples:**

```bash
# Use a different model with the torch backend
VECGREP_BACKEND=torch VECGREP_MODEL=sentence-transformers/all-MiniLM-L6-v2 vecgrep

# Use a custom ONNX model
VECGREP_MODEL=my-org/my-onnx-model vecgrep
```

## Supported languages

Python, JavaScript/TypeScript, Rust, Go, Java, C/C++, Ruby, Swift, Kotlin, C#
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ classifiers = [
requires-python = ">=3.12,<3.13"
dependencies = [
"mcp[cli]>=1.0,<2.0",
"fastembed>=0.4.0",
"sentence-transformers>=3.0,<4.0",
"tree-sitter-languages>=1.10,<2.0",
"numpy>=1.26",
Expand Down
121 changes: 89 additions & 32 deletions src/vecgrep/embedder.py
Original file line number Diff line number Diff line change
@@ -1,65 +1,122 @@
"""Local embedding using sentence-transformers."""
"""Local embedding with auto-detected backend (fastembed ONNX or sentence-transformers)."""

from __future__ import annotations

import numpy as np
from sentence_transformers import SentenceTransformer # type: ignore
import os

try:
import torch
import numpy as np

HAS_TORCH = True
except ImportError:
HAS_TORCH = False
# ---------------------------------------------------------------------------
# Configuration — override via environment variables
# ---------------------------------------------------------------------------

# Embedding model: fine-tuned for semantic code search by @isuruwijesiri
# Citation:
# Model to use. Default is the fine-tuned code search model.
# Set VECGREP_MODEL=sentence-transformers/all-MiniLM-L6-v2 (or any HF model)
# to use a different model.
#
# Citation for default model:
# isuruwijesiri. (2026). all-MiniLM-L6-v2-code-search-512 [Model].
# Hugging Face. https://huggingface.co/isuruwijesiri/all-MiniLM-L6-v2-code-search-512
MODEL_NAME = "isuruwijesiri/all-MiniLM-L6-v2-code-search-512"
DEFAULT_MODEL = "isuruwijesiri/all-MiniLM-L6-v2-code-search-512"
MODEL_NAME = os.environ.get("VECGREP_MODEL", DEFAULT_MODEL)

# Batch sizes tuned per device — GPU/MPS can saturate with larger batches
_BATCH_SIZE: dict[str, int] = {
# Backend to use for embedding.
# - "onnx" (default) — fastembed + ONNX Runtime, ~100ms startup, no PyTorch needed
# - "torch" — sentence-transformers + PyTorch, ~2-3s startup, supports any HF model
BACKEND = os.environ.get("VECGREP_BACKEND", "onnx").lower()

# Batch sizes tuned per device for the torch backend
_TORCH_BATCH_SIZE: dict[str, int] = {
"cuda": 256,
"mps": 256,
"cpu": 64,
}

_model = None
_device: str | None = None
# ---------------------------------------------------------------------------
# ONNX backend (fastembed) — default
# ---------------------------------------------------------------------------

_onnx_model = None


def _get_onnx_model():
    """Lazily construct and cache the fastembed ``TextEmbedding`` instance."""
    global _onnx_model
    if _onnx_model is not None:
        return _onnx_model

    # Imported here so the torch backend never pays the fastembed import cost.
    from fastembed import TextEmbedding  # type: ignore
    from fastembed.common.model_description import ModelSource, PoolingType  # type: ignore

    # fastembed only knows its built-in model catalogue, so the default model
    # is registered as a custom entry pointing at its ONNX export on
    # HuggingFace before the embedding object is created.
    if MODEL_NAME == DEFAULT_MODEL:
        TextEmbedding.add_custom_model(
            model=DEFAULT_MODEL,
            pooling=PoolingType.MEAN,
            normalization=True,
            sources=ModelSource(hf=DEFAULT_MODEL),
            dim=384,
            model_file="onnx/model.onnx",
            description="Fine-tuned MiniLM for semantic code search",
        )

    _onnx_model = TextEmbedding(MODEL_NAME)
    return _onnx_model


# ---------------------------------------------------------------------------
# Torch backend (sentence-transformers) — opt-in via VECGREP_BACKEND=torch
# ---------------------------------------------------------------------------

_torch_model = None
_torch_device: str | None = None


def _detect_device() -> str:
"""Return the best available compute device: cuda > mps > cpu."""
if HAS_TORCH:
try:
import torch

if torch.cuda.is_available():
return "cuda"
if torch.backends.mps.is_available():
return "mps"
except ImportError:
pass
return "cpu"


def _get_model():
global _model, _device
if _model is None:
_device = _detect_device()
_model = SentenceTransformer(MODEL_NAME, device=_device)
return _model
def _get_torch_model():
    """Lazily create and cache the sentence-transformers model on the best device."""
    global _torch_model, _torch_device
    if _torch_model is not None:
        return _torch_model

    # Imported here so the default ONNX path never pulls in PyTorch.
    from sentence_transformers import SentenceTransformer  # type: ignore

    _torch_device = _detect_device()
    _torch_model = SentenceTransformer(MODEL_NAME, device=_torch_device)
    return _torch_model


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def embed(texts: list[str]) -> np.ndarray:
    """Embed a list of texts, returning a normalised float32 array of shape (N, 384).

    Dispatches on the module-level ``BACKEND`` setting:

    - ``"torch"``  — sentence-transformers with a device-tuned batch size.
    - any other value (default ``"onnx"``) — fastembed, which handles its
      own batching internally.

    All vectors are L2-normalised so cosine similarity reduces to a plain
    dot product downstream.
    """
    if not texts:
        # Preserve the (N, 384) contract even for empty input.
        return np.empty((0, 384), dtype=np.float32)

    if BACKEND == "torch":
        model = _get_torch_model()
        batch_size = _TORCH_BATCH_SIZE.get(_torch_device or "cpu", 64)
        vecs = model.encode(
            texts,
            batch_size=batch_size,
            show_progress_bar=False,
            convert_to_numpy=True,
        )
    else:
        model = _get_onnx_model()
        # fastembed yields vectors lazily; materialise into a 2-D array.
        vecs = np.array(list(model.embed(texts)))

    # Normalise for cosine similarity via dot product; guard zero-norm rows
    # so we never divide by zero.
    norms = np.linalg.norm(vecs, axis=1, keepdims=True)
    norms = np.where(norms == 0, 1.0, norms)
    return (vecs / norms).astype(np.float32)
31 changes: 21 additions & 10 deletions tests/test_embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,21 +43,32 @@ def test_same_input_produces_same_output(self):
np.testing.assert_array_equal(v1, v2)


class TestTorchBackend:
    # Exercises the opt-in sentence-transformers path by forcing the
    # module-level BACKEND setting to "torch"; requires torch and
    # sentence-transformers to be installed.

    def test_torch_backend_produces_correct_shape(self):
        with patch("vecgrep.embedder.BACKEND", "torch"):
            vecs = embed(["def foo(): pass"])
            # embed() contract: float32 array of shape (N, 384).
            assert vecs.shape == (1, 384)
            assert vecs.dtype == np.float32

    def test_torch_backend_vectors_are_unit_norm(self):
        with patch("vecgrep.embedder.BACKEND", "torch"):
            vecs = embed(["alpha", "beta"])
            # embed() L2-normalises its output, so every row has norm 1.
            norms = np.linalg.norm(vecs, axis=1)
            np.testing.assert_allclose(norms, 1.0, atol=1e-5)


class TestDetectDevice:
    # _detect_device() imports torch lazily inside the function, so the
    # availability probes are patched on the torch module itself rather
    # than on attributes of vecgrep.embedder.

    def test_returns_cuda_when_available(self):
        with patch("torch.cuda.is_available", return_value=True), \
             patch("torch.backends.mps.is_available", return_value=False):
            assert _detect_device() == "cuda"

    def test_returns_mps_when_cuda_unavailable(self):
        # CUDA takes priority, so MPS is only reported when CUDA is absent.
        with patch("torch.cuda.is_available", return_value=False), \
             patch("torch.backends.mps.is_available", return_value=True):
            assert _detect_device() == "mps"

    def test_returns_cpu_when_both_unavailable(self):
        with patch("torch.cuda.is_available", return_value=False), \
             patch("torch.backends.mps.is_available", return_value=False):
            assert _detect_device() == "cpu"
Loading