|
| 1 | +""" |
| 2 | +Generate a report of actual revert commits in pytorch/pytorch over a given period |
| 3 | +and annotate whether each has a matching non-dry-run autorevert decision recorded |
| 4 | +by misc.autorevert_events_v2. |
| 5 | +
|
| 6 | +Columns: |
| 7 | +- revert_time (UTC) |
| 8 | +- original_sha (the commit being reverted) |
| 9 | +- category (from -c flag in the bot command comment, else from message, else 'uncategorized') |
| 10 | +- reason (short reason parsed from the revert commit message, if present) |
| 11 | +- author (GitHub login from the bot command 'on behalf of' attribution in the message) |
| 12 | +- comment_url (link to the bot command comment if present) |
| 13 | +- has_autorevert (yes/no) — whether misc.autorevert_events_v2 recorded a revert for original_sha |
| 14 | +
|
| 15 | +Usage examples: |
| 16 | +- python -m pytorch_auto_revert.testers.actual_reverts_report --start "2025-09-16 22:18:51" --end "2025-09-24 00:00:00" |
| 17 | +- python -m pytorch_auto_revert.testers.actual_reverts_report --start "2025-09-16 22:18:51" --format csv > reverts.csv |
| 18 | +
|
| 19 | +This script uses the ClickHouse client configuration from environment variables |
| 20 | +as done by the project entrypoint (CLICKHOUSE_HOST, CLICKHOUSE_PORT, CLICKHOUSE_USERNAME, |
| 21 | +CLICKHOUSE_PASSWORD, CLICKHOUSE_DATABASE). You may also use a .env file. |
| 22 | +""" |
| 23 | + |
| 24 | +from __future__ import annotations |
| 25 | + |
| 26 | +import argparse |
| 27 | +import csv |
| 28 | +import os |
| 29 | +from datetime import datetime, timezone |
| 30 | +from typing import Any, Iterable, List, Tuple |
| 31 | + |
| 32 | +from dotenv import load_dotenv |
| 33 | + |
| 34 | +from ..clickhouse_client_helper import CHCliFactory, ensure_utc_datetime |
| 35 | + |
| 36 | + |
def parse_utc(s: str) -> datetime:
    """Parse a timestamp string and return it normalized by ``ensure_utc_datetime``.

    Accepted inputs are anything ``datetime.fromisoformat`` understands
    (including "YYYY-MM-DD HH:MM:SS"), with or without a UTC offset. A trailing
    "Z"/"z" UTC designator is also accepted on every Python version.
    Surrounding whitespace is ignored.

    Args:
        s: Timestamp text supplied by the user.

    Returns:
        The parsed datetime after being passed through ``ensure_utc_datetime``
        (project helper; presumably treats naive values as UTC -- confirm there).

    Raises:
        ValueError: If the text matches none of the supported formats.
    """
    text = s.strip()
    # datetime.fromisoformat only accepts the "Z" suffix from Python 3.11 on;
    # rewriting it as an explicit offset keeps older interpreters working.
    iso_text = text[:-1] + "+00:00" if text[-1:] in ("Z", "z") else text
    try:
        dt = datetime.fromisoformat(iso_text)
    except ValueError:
        # Fallback kept from the original implementation for the plain
        # "YYYY-MM-DD HH:MM:SS" form.
        dt = datetime.strptime(text, "%Y-%m-%d %H:%M:%S")
    return ensure_utc_datetime(dt)
| 45 | + |
| 46 | + |
def setup_ch_from_env() -> None:
    """Read the CLICKHOUSE_* environment variables and hand them to CHCliFactory.

    Defaults when a variable is unset: empty host, username and password,
    port 8443, and database "default". The port must be an integer string.
    """
    env = os.environ
    CHCliFactory.setup_client(
        env.get("CLICKHOUSE_HOST", ""),
        int(env.get("CLICKHOUSE_PORT", "8443")),
        env.get("CLICKHOUSE_USERNAME", ""),
        env.get("CLICKHOUSE_PASSWORD", ""),
        env.get("CLICKHOUSE_DATABASE", "default"),
    )
| 54 | + |
| 55 | + |
def run_query(
    start: datetime, end: datetime
) -> Tuple[List[str], List[Tuple[Any, ...]]]:
    """Query ClickHouse for reverts in ``[start, end)`` and return (headers, rows).

    The query works in four steps, mirrored by the numbered SQL comments:
    it extracts bot-driven revert commits from ``default.push``, joins the
    referenced issue comment to read its ``-c`` category, keeps the earliest
    revert per original sha, and finally flags whether
    ``misc.autorevert_events_v2`` holds a non-dry-run revert for that sha.

    Args:
        start: Inclusive lower bound on the revert commit timestamp.
        end: Exclusive upper bound on the revert commit timestamp.

    Returns:
        A pair of the fixed column names and the result rows as tuples, in the
        same column order, sorted by revert time.
    """
    sql = """
    WITH
        toDateTime64({start:DateTime64(9)}, 9) AS start_ts,
        toDateTime64({end:DateTime64(9)}, 9) AS end_ts

    -- 1) Per-revert-commit rows (only bot-driven commits with author + comment id)
    , revert_by_sha AS (
        SELECT
            commit.id AS revert_sha,
            min(commit.timestamp) AS revert_time,
            anyHeavy(commit.message) AS message,
            regexpExtract(message, '(?s)This reverts commit ([0-9a-fA-F]{40})', 1) AS original_sha,
            regexpExtract(message, '(?s)on behalf of https://github.com/([A-Za-z0-9-]+)', 1) AS command_author,
            nullIf(trim(BOTH ' ' FROM regexpExtract(message, '(?s)due to (.*?)(?: \\([[]comment|$)', 1)), '') AS reason,
            regexpExtract(message,
                '(?s)\\[comment\\]\\((https://github.com/pytorch/pytorch/pull/\\d+#issuecomment-\\d+)\\)', 1
            ) AS comment_url,
            toInt64OrNull(regexpExtract(message, '#issuecomment-(\\d+)', 1)) AS comment_id,
            regexpExtract(message, '-c\\s+(\\w+)', 1) AS category_hint
        FROM default.push
        ARRAY JOIN commits AS commit
        WHERE tupleElement(repository, 'full_name') = 'pytorch/pytorch'
          AND commit.timestamp >= start_ts AND commit.timestamp < end_ts
          AND match(commit.message, '(?s)This reverts commit [0-9a-fA-F]{40}')
        GROUP BY commit.id
        HAVING comment_id IS NOT NULL AND command_author != ''
    )

    -- 2) Join comment to get authoritative category from -c flag
    , revert_enriched AS (
        SELECT
            r.revert_sha,
            r.revert_time,
            r.original_sha,
            r.command_author,
            r.reason,
            r.comment_url,
            lowerUTF8(nullIf(ic.ic_body_category, '')) AS comment_category,
            lowerUTF8(nullIf(r.category_hint, '')) AS message_category
        FROM revert_by_sha AS r
        LEFT JOIN (
            SELECT id, regexpExtract(body, '-c\\s+(\\w+)', 1) AS ic_body_category
            FROM default.issue_comment
        ) AS ic
        ON ic.id = r.comment_id
    )

    -- 3) Aggregate to one row per original_sha (earliest revert attempt)
    , per_original AS (
        SELECT
            original_sha,
            argMin(tuple(
                revert_time,
                reason,
                command_author,
                comment_url,
                if(comment_category IN ('nosignal','ignoredsignal','landrace','weird','ghfirst'), comment_category,
                   if(message_category IN ('nosignal','ignoredsignal','landrace','weird','ghfirst'), message_category,
                      'uncategorized'))
            ), revert_time) AS fields
        FROM revert_enriched
        GROUP BY original_sha
    )

    -- 4) Autorevert decisions (non-dry-run) keyed by original sha
    , auto AS (
        SELECT commit_sha
        FROM misc.autorevert_events_v2
        WHERE dry_run = 0 AND action = 'revert'
        GROUP BY commit_sha
    )

    SELECT
        tupleElement(fields, 1) AS revert_time,
        original_sha,
        tupleElement(fields, 5) AS category,
        tupleElement(fields, 2) AS reason,
        tupleElement(fields, 3) AS author,
        tupleElement(fields, 4) AS comment_url,
        if(auto.commit_sha != '', 'yes', 'no') AS has_autorevert
    FROM per_original
    LEFT JOIN auto ON auto.commit_sha = per_original.original_sha
    ORDER BY revert_time
    """

    # Column names in the order of the final SELECT list above.
    column_names = [
        "revert_time",
        "original_sha",
        "category",
        "reason",
        "author",
        "comment_url",
        "has_autorevert",
    ]

    bound_params = {"start": start, "end": end}
    result = CHCliFactory().client.query(sql, parameters=bound_params)
    return column_names, list(map(tuple, result.result_rows))
| 159 | + |
| 160 | + |
def print_table(headers: List[str], rows: List[Tuple[Any, ...]]) -> None:
    """Print ``rows`` under ``headers`` as a fixed-width, pipe-separated table.

    Each column is padded to its widest displayed cell. ``None`` is shown as
    an empty cell. Cells in the "reason" and "comment_url" columns are cut to
    100 and 120 characters respectively, ending in an ellipsis when cut.
    Header lists that lack either column are rendered without that cap.

    Args:
        headers: Column titles, one per column.
        rows: Row tuples; each row is expected to have no more cells than
            there are headers.
    """
    # Caps are looked up by header name, so a header list without these
    # columns still renders instead of raising ValueError from list.index.
    cap_by_name = {"reason": 100, "comment_url": 120}
    caps = {
        headers.index(name): cap
        for name, cap in cap_by_name.items()
        if name in headers
    }

    def render(index: int, value: Any) -> str:
        """Return the display text for one cell, truncated to its column cap."""
        text = "" if value is None else str(value)
        cap = caps.get(index)
        if cap is not None and len(text) > cap:
            text = text[: cap - 1] + "…"
        return text

    # Render every cell exactly once; the same text drives both the width
    # calculation and the final output.
    rendered = [[render(i, v) for i, v in enumerate(row)] for row in rows]

    widths = [len(h) for h in headers]
    for cells in rendered:
        for i, text in enumerate(cells):
            widths[i] = max(widths[i], len(text))

    def fmt_row(cells: Iterable[str]) -> str:
        """Pad each cell to its column width and join with ' | '."""
        return " | ".join(text.ljust(widths[i]) for i, text in enumerate(cells))

    print(fmt_row(render(i, h) for i, h in enumerate(headers)))
    print("-+-".join("-" * w for w in widths))
    for cells in rendered:
        print(fmt_row(cells))
| 186 | + |
| 187 | + |
def write_csv(headers: List[str], rows: List[Tuple[Any, ...]], fp) -> None:
    """Write ``headers`` as the first CSV record, then each row, to ``fp``.

    Args:
        headers: Column titles emitted as the first record.
        rows: Data records, written in order.
        fp: Writable text file-like object passed to ``csv.writer``.
    """
    out = csv.writer(fp)
    out.writerow(headers)
    out.writerows(rows)
| 193 | + |
| 194 | + |
def main() -> None:
    """Command-line entry point: parse arguments, run the query, print the report.

    Loads a ``.env`` file if present, reads ``--start`` (required), ``--end``
    (defaults to the current UTC time) and ``--format`` ("table" or "csv"),
    configures the ClickHouse client from the environment, and writes the
    result to standard output in the chosen format.
    """
    # Function-scope import: use the documented sys.stdout rather than
    # os.sys, which only exists because the os module happens to import sys.
    import sys

    load_dotenv()

    ap = argparse.ArgumentParser(
        description="Build actual reverts table for a date range (UTC)"
    )
    ap.add_argument(
        "--start", required=True, help="Start time UTC (e.g. '2025-09-16 22:18:51')"
    )
    ap.add_argument("--end", default=None, help="End time UTC (default: now)")
    ap.add_argument(
        "--format", choices=["table", "csv"], default="table", help="Output format"
    )
    args = ap.parse_args()

    start = parse_utc(args.start)
    end = parse_utc(args.end) if args.end else datetime.now(timezone.utc)

    setup_ch_from_env()
    headers, rows = run_query(start, end)

    if args.format == "csv":
        write_csv(headers, rows, fp=sys.stdout)
    else:
        print_table(headers, rows)
| 220 | + |
| 221 | + |
# Run the report only when this module is executed as a script (e.g. via
# ``python -m``), not when it is imported.
if __name__ == "__main__":
    main()
0 commit comments