Dataset creation for backout commits #4159
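This pull request adds a single new 172-line script that builds a dataset of backout commits. The script downloads the bugs and commits databases, groups commits by bug ID, and, for each commit that was backed out on a bug ultimately resolved as FIXED, records the bug-inducing commit, the backout commit, and the subsequent fixing commit, then streams the results to a JSON file.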
```python
import json
import logging
import os

from tqdm import tqdm

from bugbug import bugzilla, db, repository

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def download_databases() -> None:
    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)


def preprocess_commits_and_bugs() -> tuple[dict, dict, dict]:
    logger.info("Preprocessing commits and bugs...")
    commit_dict = {}
    bug_to_commit_dict = {}

    # Store commits keyed by their hashes, and group them by bug ID.
    for commit in tqdm(
        repository.get_commits(
            include_no_bug=True, include_backouts=True, include_ignored=True
        ),
        desc="Processing commits",
    ):
        commit_dict[commit["node"]] = {
            "node": commit["node"],
            "bug_id": commit["bug_id"],
            "desc": commit["desc"],
            "pushdate": commit["pushdate"],
            "backedoutby": commit["backedoutby"],
            "backsout": commit["backsout"],
        }

        if commit["bug_id"] not in bug_to_commit_dict:
            bug_to_commit_dict[commit["bug_id"]] = [commit_dict[commit["node"]]]
        else:
            bug_to_commit_dict[commit["bug_id"]].append(commit_dict[commit["node"]])

    logger.info("Preprocessing bugs...")
    bug_dict = {}

    # Count the lines first so tqdm can show progress over the bugs pass.
    with open(bugzilla.BUGS_DB, "r") as f:
        num_lines = sum(1 for _ in f)

    # Store bug resolutions keyed by bug ID.
    with open(bugzilla.BUGS_DB, "r") as f:
        for line in tqdm(f, total=num_lines, desc="Processing bugs"):
            bug = json.loads(line)
            bug_dict[bug["id"]] = bug["resolution"]

    return commit_dict, bug_to_commit_dict, bug_dict


def filter_commits(
    commit_limit: int,
    commit_dict: dict,
    bug_to_commit_dict: dict,
    bug_dict: dict,
):
    counter = 0
    # Cap the limit (709458 appears to be the total number of commits in the
    # database at the time of writing).
    commit_limit = min(commit_limit, 709458)
    pbar = tqdm(total=commit_limit, desc="Filtering commits")

    for commit in repository.get_commits(
        include_no_bug=True, include_backouts=True, include_ignored=True
    ):
        bug_info = bug_dict.get(commit["bug_id"])

        counter += 1
        pbar.update(1)

        # Only keep commits that were backed out and whose bug was fixed.
        if commit["backedoutby"] and bug_info == "FIXED":
            fixing_commit = find_next_commit(
                commit["bug_id"], bug_to_commit_dict, commit["node"]
            )

            # If the fixing commit could not be found, or is itself another
            # backout commit, do not add it to the dataset.
            if (
                fixing_commit["node"] == commit["backedoutby"]
                or fixing_commit["backsout"]
            ):
                continue

            # Emit the hashes of the bug-inducing commit, the backout commit,
            # and the fixing commit, with push date and description for
            # further context.
            yield {
                "bug_id": commit["bug_id"],
                "inducing_commit": {
                    "node": commit["node"],
                    "pushdate": commit["pushdate"],
                    "desc": commit["desc"],
                },
                "backout_commit": {
                    "node": commit["backedoutby"],
                    "pushdate": commit_dict[commit["backedoutby"]]["pushdate"],
                    "desc": commit_dict[commit["backedoutby"]]["desc"],
                },
                "fixing_commit": {
                    "node": fixing_commit["node"],
                    "pushdate": fixing_commit["pushdate"],
                    "desc": fixing_commit["desc"],
                },
            }

        if counter >= commit_limit:
            break

    pbar.close()


def find_next_commit(bug_id: int, bug_to_commit_dict: dict, inducing_node: str) -> dict:
    inducing_commit_found = False
    for commit in bug_to_commit_dict[bug_id]:
        # Once the inducing commit has been found, return the next commit on
        # the same bug that has not itself been backed out.
        if inducing_commit_found and not commit["backedoutby"]:
            return commit

        if commit["node"] == inducing_node:
            inducing_commit_found = True

    # Fall back to the last commit on the bug if no better candidate exists.
    return commit


def save_dataset(directory_path: str, filename: str, data_generator) -> None:
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)
        logger.info(f"Directory {directory_path} created")

    # Stream the records to disk as a JSON array, without materializing the
    # whole dataset in memory.
    file_path = os.path.join(directory_path, filename)
    with open(file_path, "w") as file:
        file.write("[\n")
        first = True
        for item in data_generator:
            if not first:
                file.write(",\n")
            file.write(json.dumps(item, indent=4))
            first = False
        file.write("\n]")

    logger.info(f"Data successfully saved to {file_path}")


def main():
    download_databases()

    commit_dict, bug_to_commit_dict, bug_dict = preprocess_commits_and_bugs()

    data_generator = filter_commits(
        commit_limit=1000000,
        commit_dict=commit_dict,
        bug_to_commit_dict=bug_to_commit_dict,
        bug_dict=bug_dict,
    )

    save_dataset(
        directory_path="dataset",
        filename="backout_dataset.json",
        data_generator=data_generator,
    )


if __name__ == "__main__":
    main()
```
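For reference, a minimal sketch of consuming the generated file, assuming the script has been run with the defaults above so that `dataset/backout_dataset.json` exists. The field names match those emitted by `filter_commits`:

```python
import json

# Load the dataset written by save_dataset (a JSON array of records).
with open("dataset/backout_dataset.json", "r") as f:
    records = json.load(f)

# Each record links a bug to its inducing, backout, and fixing commits.
for record in records[:5]:
    print(
        record["bug_id"],
        record["inducing_commit"]["node"],
        record["backout_commit"]["node"],
        record["fixing_commit"]["node"],
    )
```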