21 changes: 21 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,21 @@
+name: Tests and Linting
+
+on:
+  pull_request:
+    branches: [main]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.14.2"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install -e ".[dev]"
+      - name: Run all tests
+        run: pytest
Comment on lines +9 to +21

Check warning — Code scanning / CodeQL: Workflow does not contain permissions (Medium, job: test)

Actions job or workflow does not limit the permissions of the GITHUB_TOKEN. Consider setting an explicit permissions block, using the following as a minimal starting point: {contents: read}
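As the warning notes, the minimal fix is an explicit top-level `permissions` block. A sketch of what that could look like for this workflow, using the scanner's own suggested starting point (whether broader scopes are needed depends on the rest of the pipeline):

```yaml
# Minimal starting point from the CodeQL warning: this job only checks
# out code and runs pytest, so a read-only token should suffice.
permissions:
  contents: read
```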
48 changes: 28 additions & 20 deletions main.py
@@ -9,16 +9,16 @@
 import logging
 import os
 import re
-import requests
 import sys
 import time
 from datetime import datetime, timezone
 from typing import Iterator, Optional
 from urllib.parse import parse_qs, urlparse
-from google.cloud import bigquery

+import requests
+from google.api_core.client_options import ClientOptions
+from google.auth.credentials import AnonymousCredentials
+
+from google.cloud import bigquery
+
 BUG_RE = re.compile(r"\b(?:bug|b=)\s*#?(\d+)\b", re.I)
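For readers skimming the diff, the `BUG_RE` pattern in the retained context pulls bug IDs out of free text such as PR titles. A quick illustration with hypothetical inputs:

```python
import re

BUG_RE = re.compile(r"\b(?:bug|b=)\s*#?(\d+)\b", re.I)

# Hypothetical titles: the pattern accepts "bug 123", "Bug #123", and "b=123".
for title in ["Fix bug #1234 in parser", "b=987 follow-up", "Refactor only"]:
    match = BUG_RE.search(title)
    print(match.group(1) if match else None)  # -> 1234, 987, None
```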
@@ -29,6 +29,7 @@ def setup_logging() -> None:
         level=logging.INFO,
         format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
         handlers=[logging.StreamHandler(sys.stdout)],
+        force=True,
     )


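Some context on the `force=True` addition (stdlib behavior, not part of the diff): `logging.basicConfig` is silently a no-op when the root logger already has handlers, so without `force=True` a second configuration attempt, e.g. from a test harness, changes nothing:

```python
import logging
import sys

logging.basicConfig(level=logging.WARNING)  # attaches a root handler

# Without force=True this call would be ignored because a handler exists;
# force=True (Python 3.8+) removes and closes existing root handlers first.
logging.basicConfig(
    level=logging.INFO,
    handlers=[logging.StreamHandler(sys.stdout)],
    force=True,
)
logging.getLogger(__name__).info("now visible on stdout at INFO level")
```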
@@ -58,7 +59,7 @@ def extract_pull_requests(
     # Support custom API URL for mocking/testing
     api_base = github_api_url or "https://api.github.com"
     base_url = f"{api_base}/repos/{repo}/pulls"
-    params = {
+    params: dict = {
         "state": "all",
         "per_page": chunk_size,
         "sort": "created",
@@ -90,7 +91,7 @@
f"Extracted page {pages} with {len(batch)} PRs (total: {total})"
)

for idx, pr in enumerate(batch):
for _idx, pr in enumerate(batch):
pr_number = pr.get("number")
if not pr_number:
continue
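The loop around this hunk is largely collapsed in the diff; it walks GitHub's paginated pull-request listing. A minimal sketch of that general pattern, with hypothetical names, in case the collapsed code is unclear:

```python
import requests

def iter_pr_pages(base_url: str, params: dict, token: str | None = None):
    """Yield successive batches of PRs until the API returns an empty page."""
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    page = 1
    while True:
        resp = requests.get(
            base_url, params={**params, "page": page}, headers=headers, timeout=30
        )
        resp.raise_for_status()
        batch = resp.json()
        if not batch:  # an empty page marks the end of the listing
            return
        yield batch
        page += 1
```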
@@ -272,7 +273,7 @@ def extract_comments(
     return comments


-def sleep_for_rate_limit(resp):
+def sleep_for_rate_limit(resp: requests.Response) -> None:
     """Sleep until rate limit resets."""
     remaining = int(resp.headers.get("X-RateLimit-Remaining", 1))
     reset = int(resp.headers.get("X-RateLimit-Reset", 0))
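The rest of `sleep_for_rate_limit` is collapsed in the diff. A plausible completion of the pattern the two header reads set up — an assumption, not necessarily the PR's exact body:

```python
import time

import requests

def sleep_for_rate_limit(resp: requests.Response) -> None:
    """Sleep until the GitHub rate limit resets, if it is exhausted."""
    remaining = int(resp.headers.get("X-RateLimit-Remaining", 1))
    reset = int(resp.headers.get("X-RateLimit-Reset", 0))
    if remaining == 0:
        # X-RateLimit-Reset is a Unix timestamp; clamp to avoid negative
        # sleeps and add a small buffer so the retry lands after the reset.
        time.sleep(max(reset - time.time(), 0) + 1)
```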
@@ -297,7 +298,7 @@ def transform_data(raw_data: list[dict], repo: str) -> dict:
     logger = logging.getLogger(__name__)
     logger.info(f"Starting data transformation for {len(raw_data)} PRs")

-    transformed_data = {
+    transformed_data: dict = {
         "pull_requests": [],
         "commits": [],
         "reviewers": [],
@@ -324,9 +325,11 @@ def transform_data(raw_data: list[dict], repo: str) -> dict:
"bug_id": bug_id,
"date_landed": pr.get("merged_at"),
"date_approved": None, # This will be filled later
"labels": [label.get("name") for label in pr.get("labels", [])]
if pr.get("labels")
else [],
"labels": (
[label.get("name") for label in pr.get("labels", [])]
if pr.get("labels")
else []
),
}

# Extract and flatten commit data
@@ -368,7 +371,8 @@ def transform_data(raw_data: list[dict], repo: str) -> dict:
             }
             transformed_data["reviewers"].append(transformed_reviewer)

-            # If the request is approved then store the date in the date_approved for the pull request
+            # If the request is approved then store the date in the
+            # date_approved for the pull request
             if review.get("state") == "APPROVED":
                 approved_date = review.get("submitted_at")
                 if transformed_pr.get(
@@ -386,9 +390,9 @@ def transform_data(raw_data: list[dict], repo: str) -> dict:
"date_created": comment.get("created_at"),
"author_email": None, # TODO Placeholder for reviewer email extraction logic
"author_username": comment.get("user", {}).get("login"),
"character_count": len(comment.get("body", ""))
if comment.get("body")
else 0,
"character_count": (
len(comment.get("body", "")) if comment.get("body") else 0
),
"status": None, # TODO
}

@@ -419,7 +423,8 @@ def load_data(
     Args:
         client: BigQuery client instance
        dataset_id: BigQuery dataset ID
-        transformed_data: Dictionary containing tables ('pull_requests', 'commits', 'reviewers', 'comments') mapped to lists of row dictionaries
+        transformed_data: Dictionary containing tables ('pull_requests',
+            'commits', 'reviewers', 'comments') mapped to lists of row dictionaries
     """
     logger = logging.getLogger(__name__)

@@ -454,7 +459,8 @@ def load_data(
            raise Exception(error_msg)

        logger.info(
-            f"Data loading completed successfully for table {table} with {len(load_table_data)} rows"
+            f"Data loading completed successfully for table {table} "
+            + f"with {len(load_table_data)} rows"
        )
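The load call itself sits in the collapsed region above this logging change. One common way to implement it, shown purely as an assumed sketch (`insert_rows_json` and the table naming are guesses, not the PR's confirmed code):

```python
from google.cloud import bigquery

def load_table(
    client: bigquery.Client, dataset_id: str, table: str, rows: list[dict]
) -> None:
    # Hypothetical helper; assumes the destination table already exists.
    table_id = f"{client.project}.{dataset_id}.{table}"
    errors = client.insert_rows_json(table_id, rows)
    if errors:
        raise Exception(f"Failed to load rows into {table_id}: {errors}")
```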


@@ -476,7 +482,8 @@ def main() -> int:
     github_token = os.environ.get("GITHUB_TOKEN")
     if not github_token:
         logger.warning(
-            "Warning: No token provided. You will hit very low rate limits and private repos won't work."
+            "Warning: No token provided. You will hit very low rate "
+            + "limits and private repos won't work."
         )

     # Read BigQuery configuration
@@ -519,9 +526,10 @@ def main() -> int:
     bigquery_client = bigquery.Client(project=bigquery_project)

     # Read GitHub repository configuration
-    github_repos = os.getenv("GITHUB_REPOS")
-    if github_repos:
-        github_repos = github_repos.split(",")
+    github_repos = []
+    github_repos_str = os.getenv("GITHUB_REPOS")
+    if github_repos_str:
+        github_repos = github_repos_str.split(",")
     else:
         raise SystemExit(
             "Environment variable GITHUB_REPOS is required (format: 'owner/repo,owner/repo')"