From b53a07ff423e216936e5359d0479c7803417b227 Mon Sep 17 00:00:00 2001 From: ArezouAmini Date: Wed, 11 Jun 2025 00:48:36 -0400 Subject: [PATCH 1/4] [code_review] Integrate ML filter for the generated comments --- bugbug/ml_filter_finetune_tool.py | 98 +++++++++++++++++++++++++++++++ bugbug/ml_filter_tool.py | 17 ++++++ bugbug/ml_filter_trainer_tool.py | 45 ++++++++++++++ bugbug/tools/code_review.py | 15 ++++- 4 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 bugbug/ml_filter_finetune_tool.py create mode 100644 bugbug/ml_filter_tool.py create mode 100644 bugbug/ml_filter_trainer_tool.py diff --git a/bugbug/ml_filter_finetune_tool.py b/bugbug/ml_filter_finetune_tool.py new file mode 100644 index 0000000000..cec021a621 --- /dev/null +++ b/bugbug/ml_filter_finetune_tool.py @@ -0,0 +1,98 @@ +from abc import ABC, abstractmethod +from pathlib import Path + +import torch + +from datasets import Dataset +from torch.nn.functional import softmax +from transformers import ( + AutoTokenizer, + ModernBertForSequenceClassification, + set_seed, + Trainer, + TrainingArguments, +) + + +class FineTuneMLClassifer(ABC): + def __init__(self, model_path, seed=42): + self.model = ModernBertForSequenceClassification.from_pretrained( + model_path, device_map=self.device, attn_implementation="sdpa" + ) + self.tokenizer = AutoTokenizer.from_pretrained( + model_path, device_map=self.device + ) + self.seed = seed + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + def _tokenize(self, batch): + return self.tokenizer( + batch["comment"], + padding=True, + truncation=True, + return_tensors="pt", + ) + + def fit(self, inputs, labels): + set_seed(self.seed) + + train_dataset = Dataset.from_dict( + { + "comment": inputs, + "label": labels, + } + ) + + train_dataset = train_dataset.map( + self._tokenize, batched=True, remove_columns=["comment"] + ) + + training_args = TrainingArguments( + # Required parameter: + output_dir=None, + # Optional training parameters: + num_train_epochs=30, + per_device_train_batch_size=128, + warmup_steps=500, + learning_rate=5e-5, + optim="adamw_torch", + # lr_scheduler_type="constant", + # warmup_ratio=0.1, + bf16=True, + eval_steps=0, + save_strategy="no", + save_steps=100, + save_total_limit=2, + logging_steps=10, + logging_strategy="epoch", + report_to="none", + seed=self.seed, + use_cpu=True if self.device == "cpu" else False, + ) + trainer = Trainer( + model=self.model, + args=training_args, + tokenizer=self.tokenizer, + train_dataset=train_dataset, + eval_dataset=None, + ) + + trainer.train() + self.model.save_pretrained(save_directory=self.tmpdir) + self.tokenizer.save_pretrained(save_directory=self.tmpdir) + + def predict(self, inputs): + self.model.to(self.device).eval() + + inpt = self.tokenizer( + inputs, padding=True, truncation=True, return_tensors="pt" + ).to(self.device) + + with torch.no_grad(): + logits = self.model(**inpt).logits + probs = softmax(logits, dim=1)[:, 0] + probs = probs.detach().cpu().numpy() + return probs + + @abstractmethod + def save(self, tmpdir: Path): ... diff --git a/bugbug/ml_filter_tool.py b/bugbug/ml_filter_tool.py new file mode 100644 index 0000000000..6e02a521bc --- /dev/null +++ b/bugbug/ml_filter_tool.py @@ -0,0 +1,17 @@ +from abc import ABC, abstractmethod +from typing import Any + + +class MLCommentFilter(ABC): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + @abstractmethod + def query_ml_filter(self, comments, *args, **kwargs) -> Any: ... + + +ml_comment_filters = {} + + +def register_ml_comment_filters(name, cls): + ml_comment_filters[name] = cls diff --git a/bugbug/ml_filter_trainer_tool.py b/bugbug/ml_filter_trainer_tool.py new file mode 100644 index 0000000000..278201ec29 --- /dev/null +++ b/bugbug/ml_filter_trainer_tool.py @@ -0,0 +1,45 @@ +from abc import ABC, abstractmethod + +import numpy as np + +from sklearn.metrics import recall_score + + +class Trainer(ABC): + def __init__(self, min_recall: float = 0.9, thr_metric: str = "acceptance_rate"): + self.min_recall = min_recall + self.thr_metric = thr_metric + + @abstractmethod + def train_test_split(self, data, test_size=0.5, random_split=True): ... + + def fit(self, model): + model.fit(self.train_inputs, self.train_labels) + return model.predict(self.val_inputs) + + def train(self, model): + probs = self.fit(model) + thresholds_results = {} + for thr in np.arange(0, 1.01, 0.01): + preds = np.where(probs >= thr, 0, 1) + recalls = recall_score(self.val_labels, preds, average=None) + acceptance_rate = sum( + [1 for pred, label in zip(preds, self.val_labels) if pred and label] + ) / sum(preds) + thresholds_results[thr] = { + "recall_accept": recalls[1], + "gmean": np.sqrt(recalls[0] * recalls[1]), + "acceptance_rate": acceptance_rate, + } + # Select threshold based on minimum accept recall and max acceptance_rate/gmean + thresholds_results = { + thr: metrics + for thr, metrics in thresholds_results.items() + if metrics["recall_accept"] >= self.min_recall + } + thresholds_results = sorted( + thresholds_results.items(), + key=lambda x: x[1][f"{self.thr_metric}"], + reverse=True, + ) + return thresholds_results[0][0] diff --git a/bugbug/tools/code_review.py b/bugbug/tools/code_review.py index f40a7c8316..643fd3511c 100644 --- a/bugbug/tools/code_review.py +++ b/bugbug/tools/code_review.py @@ -30,6 +30,7 @@ from bugbug import db, phabricator, utils from bugbug.code_search.function_search import FunctionSearch from bugbug.generative_model_tool import GenerativeModelTool, get_tokenizer +from bugbug.ml_filter_tool import MLCommentFilter from bugbug.utils import get_secret from bugbug.vectordb import PayloadScore, QueryFilter, VectorDB, VectorPoint @@ -1125,6 +1126,7 @@ def __init__( verbose: bool = True, suggestions_feedback_db: Optional["SuggestionsFeedbackDB"] = None, target_software: Optional[str] = None, + ml_comment_filter: Optional[MLCommentFilter] = None, ) -> None: super().__init__() @@ -1199,6 +1201,8 @@ def __init__( self.suggestions_feedback_db = suggestions_feedback_db + self.ml_comment_filter = ml_comment_filter + def count_tokens(self, text): return len(self._tokenizer.encode(text)) @@ -1366,7 +1370,16 @@ def run(self, patch: Patch) -> list[InlineComment] | None: if self.verbose: GenerativeModelTool._print_answer(raw_output) - return list(generate_processed_output(raw_output, patch.patch_set)) + generated_inline_comments = list( + generate_processed_output(raw_output, patch.patch_set) + ) + + if self.ml_comment_filter: + generated_inline_comments = self.ml_comment_filter.query_ml_filter( + generated_inline_comments + ) + + return generated_inline_comments def _get_generated_examples(self, patch, created_before: datetime | None = None): """Get examples of comments that were generated by an LLM. From 71d4c52346c579c718d90d7c006c3bbc678bce22 Mon Sep 17 00:00:00 2001 From: ArezouAmini Date: Wed, 11 Jun 2025 01:11:27 -0400 Subject: [PATCH 2/4] Lint changes --- bugbug/ml_filter_finetune_tool.py | 2 +- bugbug/ml_filter_trainer_tool.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/bugbug/ml_filter_finetune_tool.py b/bugbug/ml_filter_finetune_tool.py index cec021a621..652490f311 100644 --- a/bugbug/ml_filter_finetune_tool.py +++ b/bugbug/ml_filter_finetune_tool.py @@ -8,9 +8,9 @@ from transformers import ( AutoTokenizer, ModernBertForSequenceClassification, - set_seed, Trainer, TrainingArguments, + set_seed, ) diff --git a/bugbug/ml_filter_trainer_tool.py b/bugbug/ml_filter_trainer_tool.py index 278201ec29..2e4ae820e1 100644 --- a/bugbug/ml_filter_trainer_tool.py +++ b/bugbug/ml_filter_trainer_tool.py @@ -1,7 +1,6 @@ from abc import ABC, abstractmethod import numpy as np - from sklearn.metrics import recall_score From d6bb64daa10d9e39a18a77f3e832befa75c8b9e3 Mon Sep 17 00:00:00 2001 From: ArezouAmini Date: Wed, 11 Jun 2025 01:23:45 -0400 Subject: [PATCH 3/4] Fix lint errors --- bugbug/ml_filter_finetune_tool.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bugbug/ml_filter_finetune_tool.py b/bugbug/ml_filter_finetune_tool.py index 652490f311..4381aeba02 100644 --- a/bugbug/ml_filter_finetune_tool.py +++ b/bugbug/ml_filter_finetune_tool.py @@ -2,7 +2,6 @@ from pathlib import Path import torch - from datasets import Dataset from torch.nn.functional import softmax from transformers import ( @@ -84,12 +83,12 @@ def fit(self, inputs, labels): def predict(self, inputs): self.model.to(self.device).eval() - inpt = self.tokenizer( + input = self.tokenizer( inputs, padding=True, truncation=True, return_tensors="pt" ).to(self.device) with torch.no_grad(): - logits = self.model(**inpt).logits + logits = self.model(**input).logits probs = softmax(logits, dim=1)[:, 0] probs = probs.detach().cpu().numpy() return probs From 3d8a24f10b1056e7481b5956d2ba0f0b3d9b3bc0 Mon Sep 17 00:00:00 2001 From: ArezouAmini Date: Wed, 11 Jun 2025 13:21:41 -0400 Subject: [PATCH 4/4] Modify model saving --- bugbug/ml_filter_finetune_tool.py | 6 +++--- bugbug/ml_filter_trainer_tool.py | 15 +++++++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/bugbug/ml_filter_finetune_tool.py b/bugbug/ml_filter_finetune_tool.py index 4381aeba02..9341a6241b 100644 --- a/bugbug/ml_filter_finetune_tool.py +++ b/bugbug/ml_filter_finetune_tool.py @@ -32,7 +32,7 @@ def _tokenize(self, batch): return_tensors="pt", ) - def fit(self, inputs, labels): + def fit(self, inputs, labels, tmpdir): set_seed(self.seed) train_dataset = Dataset.from_dict( @@ -77,8 +77,8 @@ def fit(self, inputs, labels): ) trainer.train() - self.model.save_pretrained(save_directory=self.tmpdir) - self.tokenizer.save_pretrained(save_directory=self.tmpdir) + self.model.save_pretrained(save_directory=tmpdir) + self.tokenizer.save_pretrained(save_directory=tmpdir) def predict(self, inputs): self.model.to(self.device).eval() diff --git a/bugbug/ml_filter_trainer_tool.py b/bugbug/ml_filter_trainer_tool.py index 2e4ae820e1..64ae84349a 100644 --- a/bugbug/ml_filter_trainer_tool.py +++ b/bugbug/ml_filter_trainer_tool.py @@ -1,23 +1,30 @@ from abc import ABC, abstractmethod +from pathlib import Path import numpy as np from sklearn.metrics import recall_score class Trainer(ABC): - def __init__(self, min_recall: float = 0.9, thr_metric: str = "acceptance_rate"): + def __init__( + self, + min_recall: float = 0.9, + thr_metric: str = "acceptance_rate", + tmpdir: Path = Path(""), + ): self.min_recall = min_recall self.thr_metric = thr_metric + self.tmpdir = tmpdir @abstractmethod def train_test_split(self, data, test_size=0.5, random_split=True): ... - def fit(self, model): - model.fit(self.train_inputs, self.train_labels) + def _fit(self, model): + model.fit(self.train_inputs, self.train_labels, self.tmpdir) return model.predict(self.val_inputs) def train(self, model): - probs = self.fit(model) + probs = self._fit(model) thresholds_results = {} for thr in np.arange(0, 1.01, 0.01): preds = np.where(probs >= thr, 0, 1)