diff --git a/bugbug/repository.py b/bugbug/repository.py
index 341115b1c7..07d58c0031 100644
--- a/bugbug/repository.py
+++ b/bugbug/repository.py
@@ -1543,6 +1543,76 @@ def trigger_pull() -> None:
     trigger_pull()
 
 
+def get_diff(repo_path, original_hash, fix_hash) -> bytes:
+    client = hglib.open(repo_path)
+
+    # identify output ends with a newline (and a "+" when the working
+    # directory is dirty); strip both before reusing the hash.
+    current_rev = client.identify(id=True).strip(b"+\n")
+
+    try:
+        client.rawcommand([b"shelve"])
+    except hglib.error.CommandError as e:
+        if b"nothing changed" in e.out:
+            logger.info(f"Nothing to shelve: {e}")
+        else:
+            raise RuntimeError("Error occurred while shelving") from e
+
+    parents = client.parents(rev=fix_hash)
+    parent_of_fix = parents[0][1]
+    client.update(rev=parent_of_fix, clean=True)
+
+    graft_result = graft(
+        client, revs=[original_hash], no_commit=True, force=True, tool=":merge"
+    )
+
+    if not graft_result:
+        # Restore the working directory before bailing out.
+        client.update(rev=current_rev, clean=True)
+        return b""
+
+    final_diff = client.diff(
+        revs=[fix_hash], ignoreallspace=True, ignorespacechange=True, reverse=True
+    )
+
+    client.update(rev=current_rev, clean=True)
+
+    return final_diff
+
+
+def graft(client, revs, no_commit=False, force=False, tool=":merge") -> bool:
+    """Graft changesets specified by revs onto the current repository state.
+
+    Args:
+        client: The hglib client.
+        revs: A list of hashes of the commits to be applied to the current repository state.
+        no_commit: If True, does not commit and just applies the changes in the working directory.
+        force: If True, forces the grafts even if the revs are ancestors of the current repository state.
+        tool: A string representing a merge tool (see `hg help merge-tools`).
+
+    Returns:
+        Boolean result of the graft operation (True for success, False for failure).
+    """
+    args = hglib.util.cmdbuilder(
+        b"graft", r=revs, no_commit=no_commit, f=force, tool=tool
+    )
+
+    eh = hglib.util.reterrorhandler(args)
+
+    client.rawcommand(args, eh=eh, prompt=auto_resolve_conflict_prompt)
+
+    # reterrorhandler is truthy only when the command exited with 0; a graft
+    # that stopped on unresolved conflicts exits with 1.
+    return bool(eh)
+
+
+def auto_resolve_conflict_prompt(max_bytes, current_output):
+    """Answer Mercurial's interactive prompts during a graft."""
+    if b"was deleted in" in current_output:
+        return b"c\n"  # Answer 'c' to keep the changed version.
+    return b"\n"  # Otherwise accept the default and proceed.
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("repository_dir", help="Path to the repository", action="store")
diff --git a/scripts/backout_data_collection.py b/scripts/backout_data_collection.py
new file mode 100644
index 0000000000..3acd43106a
--- /dev/null
+++ b/scripts/backout_data_collection.py
@@ -0,0 +1,231 @@
+"""Collect backout/fix datapoints from the commits and bugs databases.
+
+For every commit that was backed out and whose bug ended up FIXED, find the
+follow-up commit that fixed the issue and record the diff between the grafted
+original commit and that fix (see bugbug.repository.get_diff).
+"""
+
+import json
+import logging
+import os
+from collections.abc import Generator
+from datetime import datetime, timedelta
+
+from tqdm import tqdm
+
+from bugbug import bugzilla, db, repository
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def download_databases() -> None:
+    logger.info("Cloning Mercurial repository...")
+    repository.clone(repo_dir="hg_dir")
+
+    logger.info("Downloading bugs database...")
+    assert db.download(bugzilla.BUGS_DB)
+
+    logger.info("Downloading commits database...")
+    assert db.download(repository.COMMITS_DB, support_files_too=True)
+
+
+def preprocess_commits_and_bugs() -> tuple[dict, dict]:
+    logger.info("Preprocessing commits and bugs...")
+    bug_to_commit_dict: dict[int, list] = {}
+
+    for commit in repository.get_commits(
+        include_no_bug=True, include_backouts=True, include_ignored=True
+    ):
+        commit_data = {
+            key: commit[key]
+            for key in ["node", "bug_id", "pushdate", "backedoutby", "backsout"]
+        }
+
+        bug_to_commit_dict.setdefault(commit["bug_id"], []).append(commit_data)
+
+    # We only require the bug's resolution (to check if it is 'FIXED').
+    bug_resolution_map = {
+        bug["id"]: bug["resolution"] for bug in bugzilla.get_bugs(include_invalid=True)
+    }
+
+    return bug_to_commit_dict, bug_resolution_map
+
+
+def has_conflicts(diff: str) -> bool:
+    """Return True if the diff contains conflict markers left by an unresolved merge."""
+    conflict_markers = ["<<<<<<<", "=======", ">>>>>>>"]
+    return any(marker in diff for marker in conflict_markers)
+
+
+def generate_datapoints(
+    commit_limit: int,
+    bug_to_commit_dict: dict,
+    bug_resolution_map: dict,
+    repo_dir: str,
+) -> Generator[dict, None, None]:
+    counter = 0
+    # Cap the limit at the total number of commits in the commits database.
+    commit_limit = min(commit_limit, 709458)
+
+    logger.info("Generating datapoints...")
+
+    for commit in tqdm(
+        repository.get_commits(
+            include_no_bug=True, include_backouts=True, include_ignored=True
+        )
+    ):
+        counter += 1
+        if counter > commit_limit:
+            break
+
+        bug_resolution = bug_resolution_map.get(commit["bug_id"])
+
+        pushdate = datetime.strptime(commit["pushdate"], "%Y-%m-%d %H:%M:%S")
+
+        if (datetime.now() - pushdate) > timedelta(days=730):
+            continue
+
+        # We only add the commit if it has been backed out and the bug it is for is FIXED.
+        if not commit["backedoutby"] or bug_resolution != "FIXED":
+            continue
+
+        fixing_commit, non_backed_out_commits = find_next_commit(
+            commit["bug_id"],
+            bug_to_commit_dict,
+            commit["node"],
+            commit["backedoutby"],
+        )
+
+        if not fixing_commit or non_backed_out_commits > 1:
+            continue
+
+        commit_diff = repository.get_diff(
+            repo_dir, commit["node"], fixing_commit["node"]
+        )
+
+        if not commit_diff:
+            continue
+
+        commit_diff_text = commit_diff.decode("utf-8")
+
+        if has_conflicts(commit_diff_text):
+            continue
+
+        yield {
+            "non_backed_out_commits": non_backed_out_commits,
+            "fix_found": True,
+            "bug_id": commit["bug_id"],
+            "inducing_commit": commit["node"],
+            "backout_commit": commit["backedoutby"],
+            "fixing_commit": fixing_commit["node"],
+            "commit_diff": commit_diff_text,
+        }
+
+
+def find_next_commit(
+    bug_id: int, bug_to_commit_dict: dict, inducing_node: str, backout_node: str
+) -> tuple[dict, int]:
+    backout_commit_found = False
+    fixing_commit = None
+
+    non_backed_out_counter = 0
+
+    for commit in bug_to_commit_dict[bug_id]:
+        # Once the backout commit has been found in the bug's commit history,
+        # find the next commit that has not been backed out and does not back
+        # out other commits.
+        if backout_commit_found:
+            if (
+                not commit["backedoutby"]
+                and not fixing_commit
+                and not commit["backsout"]
+            ):
+                fixing_commit = commit
+                non_backed_out_counter += 1
+            elif not commit["backedoutby"]:
+                non_backed_out_counter += 1
+
+        if commit["node"] == backout_node:
+            backout_commit_found = True
+
+    if (
+        not fixing_commit
+        or fixing_commit["node"] == inducing_node
+        or fixing_commit["node"] == backout_node
+    ):
+        return {}, non_backed_out_counter
+
+    return fixing_commit, non_backed_out_counter
+
+
+def save_datasets(
+    directory_path: str, dataset_filename: str, data_generator, batch_size: int = 10
+) -> None:
+    os.makedirs(directory_path, exist_ok=True)
+    logger.info(f"Directory {directory_path} is ready")
+
+    dataset_filepath = os.path.join(directory_path, dataset_filename)
+
+    fix_found_counter = 0
+    fix_batch = []
+
+    with open(dataset_filepath, "w") as file:
+        file.write("[\n")
+        first = True
+
+        logger.info("Populating dataset...")
+        for item in data_generator:
+            item.pop("fix_found", None)
+            fix_batch.append(item)
+            fix_found_counter += 1
+
+            if len(fix_batch) >= batch_size:
+                if not first:
+                    file.write(",\n")
+                else:
+                    first = False
+
+                json_data = ",\n".join(json.dumps(i, indent=4) for i in fix_batch)
+                file.write(json_data)
+                file.flush()
+                os.fsync(file.fileno())
+                fix_batch = []
+
+        # Write any remaining items that did not fill a whole batch.
+        if fix_batch:
+            if not first:
+                file.write(",\n")
+            json_data = ",\n".join(json.dumps(i, indent=4) for i in fix_batch)
+            file.write(json_data)
+            file.flush()
+            os.fsync(file.fileno())
+
+        file.write("\n]")
+
+    logger.info(f"Dataset successfully saved to {dataset_filepath}")
+    logger.info(f"Number of commits with fix found saved: {fix_found_counter}")
+
+
+def main() -> None:
+    download_databases()
+
+    bug_to_commit_dict, bug_resolution_map = preprocess_commits_and_bugs()
+
+    data_generator = generate_datapoints(
+        commit_limit=1000000,
+        bug_to_commit_dict=bug_to_commit_dict,
+        bug_resolution_map=bug_resolution_map,
+        repo_dir="hg_dir",
+    )
+
+    save_datasets(
+        directory_path="dataset",
+        dataset_filename="backout_dataset.json",
+        data_generator=data_generator,
+        batch_size=1,
+    )
+
+
+if __name__ == "__main__":
+    main()