diff --git a/docs/data.md b/docs/data.md index ad3e3b4649..65651a43b1 100644 --- a/docs/data.md +++ b/docs/data.md @@ -39,7 +39,7 @@ for revision in phabricator.get_revisions(): ```py from bugbug import repository, db -db.download(bugzilla.COMMITS_DB) +db.download(repository.COMMITS_DB) for commit in repository.get_commits(): print(commit["node"]) diff --git a/requirements.txt b/requirements.txt index de5bfffb2f..08ebd530b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -38,6 +38,7 @@ shap[plots]==0.48.0 tabulate==0.9.0 taskcluster==86.0.1 tenacity==9.1.2 +tiktoken~=0.9.0 tqdm==4.67.1 unidiff==0.7.5 xgboost==2.1.4 diff --git a/scripts/comment_generation_bug_reports.py b/scripts/comment_generation_bug_reports.py new file mode 100644 index 0000000000..9831d67802 --- /dev/null +++ b/scripts/comment_generation_bug_reports.py @@ -0,0 +1,704 @@ +import csv +import json +import os +import subprocess +from datetime import datetime, timedelta + +import tiktoken +from dateutil import parser, tz +from langchain.chains.conversation.base import ConversationChain +from langchain.chains.llm import LLMChain +from langchain.memory import ConversationBufferMemory +from langchain_core.prompts import PromptTemplate +from langchain_openai import ChatOpenAI +from unidiff import PatchSet + +import bugbug.bugzilla as bugzilla + +csv.field_size_limit(10**8) + +### VARIABLES + +# The input file is the list of reported bugs available here: https://github.com/mozilla/regressors-regressions-dataset +# Clone that repository locally and set INPUT_FILE to the path of its dataset.csv file. +INPUT_FILE = "" + +# Accessing additional information about the commits reported in INPUT_FILE requires a local copy of the central repository.
+# To do so, clone the repository (https://hg-edge.mozilla.org/mozilla-central) locally and set its path below. +LOCAL_MERCURIAL_PATH = "" +REPORT_DIRECTORY = "" +REPORT_FILENAME_GPT = "filtered_comment_gpt.csv" +REPORT_FILENAME_DEEPSEEK = "filtered_comment_deepseek.csv" + +OPEN_API_KEY = "" +OPEN_AI_MODEL = "gpt-4o-mini" +DEEPSEEK_API = "" +DEEPSEEK_MODEL_NAME = "deepseek-chat" +TEMPERATURE = 0.2 + +### PROMPTS + +CODE_SUMMARIZATION_DIFF = """ +You are an expert reviewer for source code with extensive experience in analyzing and summarizing code changes. + +The bug associated with patch_bug was introduced and later fixed. Below, you can find further information about the fix. +Fix title: {fix_title} +Fix description: {fix_description} + +Your task: +Analyze the provided code and generate a concise summary focusing on the exact changes in patch_bug that introduced the issue and how patch_fix resolved it. Ignore any modifications unrelated to the bug fix. + +You must report: +1. The root cause of the issue in `patch_bug`: Identify the specific code lines in patch_bug responsible for the bug. Report the exact affected line and explain why they led to the issue. One single line number for change. +2. The specific changes in `patch_fix` that correct the issue: Explain how the bug was resolved, but keep the focus on mapping fixes back to the faulty lines in `patch_bug`. + +Output Format: +Provide a structured response that explicitly maps faulty lines in `patch_bug` to the fix in `patch_fix`, like this: + +{{ + "root_cause": {{ + "filename": "", + "line": [], + "explanation": "" + }}, + "fix": {{ + "filename": "", + "line": [], + "explanation": "" + }} +}} + +Bug commit message: {bug_commit_message} +{patch_bug} + +Fix commit message: {fix_commit_message} +{patch_fix} + """ + +FILTERING_COMMENTS = """ +You are an expert reviewer with extensive experience in source code reviews.
+ +Please analyze the comments below and filter out any comments that are not related to the changes applied in the commit diff. + +Apply the following filters: +1. Remove comments that focus on documentation, comments, error handling, or requests for tests. +2. Remove comments that suggest developers to double-check or ensure their implementations (e.g., verifying the existence, initialization, or creation of objects, methods, or files) without providing actionable feedback. +3. Remove comments that are purely descriptive and do not suggest improvements or highlight problems. +4. Remove comments that are solely praising (e.g., "This is a good addition to the code."). +5. Consolidate duplicate comments that address the same issue into a single, comprehensive comment. +6. Do not change the contents of the comments. + +Output: +Return a single JSON file containing the valid comments, and no additional content. +Ensure the output format matches the example below: + +Example: +```json +[ + {{ + "filename": "netwerk/streamconv/converters/mozTXTToHTMLConv.cpp", + "start_line": 1211, + "content": "Ensure that the size of `tempString` does not exceed 256 characters. Using `nsAutoStringN<256>` is efficient for small strings, but exceeding the size can lead to buffer issues.", + "label": "code validation" + "label_justification": "Functional - Validation" + }} +] + +Below, you can find the comments: +{comments} + +And now, you can find the commit diff: +{bug_summarization} +""" + +CODE_GEN_BUG_FIX = """ +Now, you're asked to generate code review comments for `patch_bug`, aiming to avoid the occurrence of the reported bug. + +### Guidelines: +1. **Objective**: Identify changes in `patch_bug` that introduced the bug and provide actionable feedback to prevent it. +2. **Reference**: Use `bug_summarization` to understand the bug’s cause, but ensure that all comments apply strictly to `patch_bug`. +3. 
**Exclusions**: + - Do **not** comment on changes that appear only in `bug_summarization` but were not present in `patch_bug`. + - Do **not** suggest fixes based on changes made in `bug_summarization`. The goal is to improve `patch_bug` to prevent the issue from occurring. +4. **Context**: Align your review with the issues raised in `bug_summarization` and Mozilla's source code guidelines. +5. **Format**: Write comments in the following JSON format, considering the `patch_bug` information: + + ```json + [ + {{ + \"filename\": \"\", + \"start_line\": , + \"content\": \"\", + \"label\": \"