From e9137cc75ca8df29b1dd99cb599cd6bfc394cf0a Mon Sep 17 00:00:00 2001 From: leusonmario Date: Tue, 8 Jul 2025 23:28:54 -0400 Subject: [PATCH 1/3] Generating comments from past bugs --- requirements.txt | 1 + scripts/comment_generation_bug_reports.py | 705 ++++++++++++++++++++++ 2 files changed, 706 insertions(+) create mode 100644 scripts/comment_generation_bug_reports.py diff --git a/requirements.txt b/requirements.txt index de5bfffb2f..08ebd530b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -38,6 +38,7 @@ shap[plots]==0.48.0 tabulate==0.9.0 taskcluster==86.0.1 tenacity==9.1.2 +tiktoken~=0.9.0 tqdm==4.67.1 unidiff==0.7.5 xgboost==2.1.4 diff --git a/scripts/comment_generation_bug_reports.py b/scripts/comment_generation_bug_reports.py new file mode 100644 index 0000000000..6260021d92 --- /dev/null +++ b/scripts/comment_generation_bug_reports.py @@ -0,0 +1,705 @@ +import csv +import json +import os +import subprocess +from datetime import datetime, timedelta + +import tiktoken +from dateutil import parser, tz +from langchain.chains.conversation.base import ConversationChain +from langchain.chains.llm import LLMChain +from langchain.memory import ConversationBufferMemory +from langchain_core.prompts import PromptTemplate +from langchain_openai import ChatOpenAI +from unidiff import PatchSet + +import bugbug.bugzilla as bugzilla + +csv.field_size_limit(10**8) + +### VARIABLES + +# The input file is the list of reported bugs available here: https://github.com/mozilla/regressors-regressions-dataset +# Clone that repository locally and set INPUT_FILE to the path of its dataset.csv file +INPUT_FILE = "" + +# Because we need additional information about the commits listed in INPUT_FILE, a local clone of the mozilla-central repository is required.
+# To do so, clone the repository locally and set its path below: https://hg-edge.mozilla.org/mozilla-central +LOCAL_MERCURIAL_PATH = "" +REPORT_DIRECTORY = "" +REPORT_FILENAME_GPT = "filtered_comment_gpt.csv" +REPORT_FILENAME_DEEPSEEK = "filtered_comment_deepseek.csv" + +OPEN_API_KEY = "" +OPEN_AI_MODEL = "gpt-4o-mini" +DEEPSEEK_API = "" +DEEPSEEK_MODEL_NAME = "deepseek-chat" +TEMPERATURE = 0.2 + +### PROMPTS + +CODE_SUMMARIZATION_DIFF = """ + You are an expert reviewer for source code with extensive experience in analyzing and summarizing code changes. + + The bug associated with patch_bug was introduced and later fixed. Below, you can find further information about the fix. + Fix title: {fix_title} + Fix description: {fix_description} + + Your task: + Analyze the provided code and generate a concise summary focusing on the exact changes in patch_bug that introduced the issue and how patch_fix resolved it. Ignore any modifications unrelated to the bug fix. + + You must report: + 1. The root cause of the issue in `patch_bug`: Identify the specific code lines in patch_bug responsible for the bug. Report the exact affected line and explain why they led to the issue. One single line number for change. + 2. The specific changes in `patch_fix` that correct the issue: Explain how the bug was resolved, but keep the focus on mapping fixes back to the faulty lines in `patch_bug`. + + Output Format: + Provide a structured response that explicitly maps faulty lines in `patch_bug` to the fix in `patch_fix`, like this: + + {{ + "root_cause": {{ + "filename": "", + "line": [], + "explanation": "" + }}, + "fix": {{ + "filename": "", + "line": [], + "explanation": "" + }} + }} + + Bug commit message: {bug_commit_message} + {patch_bug} + + Fix commit message: {fix_commit_message} + {patch_fix} + """ + +FILTERING_COMMENTS = """ + You are an expert reviewer with extensive experience in source code reviews. 
+ + Please analyze the comments below and filter out any comments that are not related to the changes applied in the commit diff. + + Apply the following filters: + 1. Remove comments that focus on documentation, comments, error handling, or requests for tests. + 2. Remove comments that suggest developers to double-check or ensure their implementations (e.g., verifying the existence, initialization, or creation of objects, methods, or files) without providing actionable feedback. + 3. Remove comments that are purely descriptive and do not suggest improvements or highlight problems. + 4. Remove comments that are solely praising (e.g., "This is a good addition to the code."). + 5. Consolidate duplicate comments that address the same issue into a single, comprehensive comment. + 6. Do not change the contents of the comments. + + Output: + Return a single JSON file containing the valid comments, and no additional content. + Ensure the output format matches the example below: + + Example: + ```json + [ + {{ + "filename": "netwerk/streamconv/converters/mozTXTToHTMLConv.cpp", + "start_line": 1211, + "content": "Ensure that the size of `tempString` does not exceed 256 characters. Using `nsAutoStringN<256>` is efficient for small strings, but exceeding the size can lead to buffer issues.", + "label": "code validation", + "label_justification": "Functional - Validation" + }} + ] + ``` + Below, you can find the comments: + {comments} + + And now, you can find the commit diff: + {bug_summarization} + """ + +CODE_GEN_BUG_FIX = """ + Now, you're asked to generate code review comments for `patch_bug`, aiming to avoid the occurrence of the reported bug. + + ### Guidelines: + 1. **Objective**: Identify changes in `patch_bug` that introduced the bug and provide actionable feedback to prevent it. + 2. **Reference**: Use `bug_summarization` to understand the bug’s cause, but ensure that all comments apply strictly to `patch_bug`. + 3. 
**Exclusions**: + - Do **not** comment on changes that appear only in `bug_summarization` but were not present in `patch_bug`. + - Do **not** suggest fixes based on changes made in `bug_summarization`. The goal is to improve `patch_bug` to prevent the issue from occurring. + 4. **Context**: Align your review with the issues raised in `bug_summarization` and Mozilla's source code guidelines. + 5. **Format**: Write comments in the following JSON format, considering the `patch_bug` information: + + ```json + [ + {{ + \"filename\": \"\", + \"start_line\": , + \"content\": \"\", + \"label\": \"