Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions mirror/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
"""
__author__ = "Bugout"
__maintainer__ = __author__
__description__ = "Tools for software project analysis"
__description__ = "Tools for GitHub software project analysis"

__email__ = "engineering@bugout.dev"
__license__ = "MIT"
__version__ = "0.2.6"
__version__ = "0.2.7"

__all__ = (
"__author__",
Expand Down
3 changes: 1 addition & 2 deletions mirror/cli.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import click

from . import __version__
from .github.allrepos import crawl_handler as crawl_populator
from .github.allrepos import nextid_handler as nextid_populator
Expand All @@ -8,8 +9,6 @@
from .github.search import popular_repos
from .github.clone_repos import clone_repos
from .github.generate_snippets import generate_datasets
from .github.sync import handler as sync_populator
from .github.licenses import licenses_handler as licenses_populator


@click.group()
Expand Down
20 changes: 20 additions & 0 deletions mirror/populate.py → mirror/cli_argp.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
import argparse
from typing import Callable, Dict

from .github import forks
from . import __version__


def populate_cli(
parser: argparse.ArgumentParser,
Expand All @@ -30,3 +33,20 @@ def populate_cli(
for subcommand, populator in subcommand_populators.items():
subparser = subcommand_parsers.add_parser(subcommand)
populator(subparser)


def main() -> None:
    """
    Entry point for the argparse-based Mirror command line interface.

    Builds the top-level parser, lets the ``forks`` module register its
    subcommand, parses the command line and dispatches to the handler that the
    selected subcommand stored under ``func``.

    When no subcommand is given, the help text is printed and the process exits
    with status 2 (usage error).
    """
    parser = argparse.ArgumentParser(
        description="Mirror: Tools for GitHub software project analysis",
        epilog=f"Version {__version__}",
    )
    subcommand = parser.add_subparsers(description="Mirror commands")

    forks.mutate_argparser(subcommand)

    args = parser.parse_args()

    # Subparsers are optional by default, so invoking the tool without a
    # subcommand produces a namespace that has no ``func`` attribute. Show the
    # usage instead of crashing with AttributeError.
    handler = getattr(args, "func", None)
    if handler is None:
        parser.print_help()
        parser.exit(status=2)

    handler(args)


if __name__ == "__main__":
    main()
Empty file added mirror/db/__init__.py
Empty file.
File renamed without changes.
20 changes: 7 additions & 13 deletions mirror/github/allrepos.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,24 @@

Support checkpointing against a small state object - the integer ID of the last repository seen.
"""

import argparse
import csv
import json
import glob
import multiprocessing
import os
import random
import sys
import time
from typing import Any, Callable, Dict, Iterator, List, Optional, TextIO, Tuple
from typing import Any, Dict, Iterator, List, Tuple

import click

import requests
from tqdm import tqdm # type: ignore

from ..populate import populate_cli
from ..settings import GITHUB_TOKEN
from ..settings import GITHUB_API_URL, GITHUB_TOKEN, REMAINING_RATELIMIT_HEADER

subcommand = "allrepos"

REPOSITORIES_URL = "https://api.github.com/repositories"
REMAINING_RATELIMIT_HEADER = "X-RateLimit-Remaining"


def crawl(
start_id: int, max_id: int, interval: float, min_rate_limit: int
Expand Down Expand Up @@ -60,15 +53,16 @@ def crawl(
"Accept": "application/vnd.github.v3+json",
"User-Agent": "simiotics mirror",
}
github_token = GITHUB_TOKEN
if github_token is not None and github_token != "":
headers["Authorization"] = f"token {github_token}"
if GITHUB_TOKEN is not None and GITHUB_TOKEN != "":
headers["Authorization"] = f"token {GITHUB_TOKEN}"

since = start_id
curr_rate_limit = min_rate_limit + 10
while since is not None and since < max_id and curr_rate_limit > min_rate_limit:
time.sleep(interval)
r = requests.get(REPOSITORIES_URL, params={"since": since}, headers=headers)
r = requests.get(
f"{GITHUB_API_URL}/repositories", params={"since": since}, headers=headers
)
response_body = r.json()
if not response_body:
break
Expand Down
46 changes: 46 additions & 0 deletions mirror/github/calls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""
Processing requests to GitHub API.
"""
import logging
from typing import Any, Dict, List, Union

import requests

from ..settings import GITHUB_API_URL, GITHUB_API_REQUEST_TIMEOUT

logger = logging.getLogger(__name__)


class GitHubApiCallFailed(Exception):
    """
    Signals that a call to the GitHub API did not complete successfully.
    """


def fetch_repository_forks(
    owner: str,
    repo: str,
    sort: str = "newest",
    per_page: int = 100,
    page: int = 1,
) -> List[Dict[str, Any]]:
    """
    Fetch one page of forks for the provided repository from GitHub.

    Sends a GET request to ``{GITHUB_API_URL}/repos/{owner}/{repo}/forks`` with
    only an ``Accept`` header (no ``Authorization`` header is attached here) and
    a timeout of ``GITHUB_API_REQUEST_TIMEOUT``.

    Args:
        owner: Owner segment of the repository path.
        repo: Repository name segment of the repository path.
        sort: Value sent as the ``sort`` query parameter.
        per_page: Value sent as the ``per_page`` query parameter.
        page: Value sent as the ``page`` query parameter.

    Returns:
        The decoded JSON body of the response.

    Raises:
        GitHubApiCallFailed: If the request fails, the response has an error
            status, or the body cannot be decoded as JSON. The underlying
            exception is attached as ``__cause__``.
    """
    url = f"{GITHUB_API_URL}/repos/{owner}/{repo}/forks"
    headers = {"Accept": "application/vnd.github.v3+json"}
    params: Dict[str, Union[str, int]] = {
        "sort": sort,
        "per_page": per_page,
        "page": page,
    }
    try:
        r = requests.get(
            url, headers=headers, params=params, timeout=GITHUB_API_REQUEST_TIMEOUT
        )
        r.raise_for_status()
        response = r.json()
    except Exception as e:
        logger.error(repr(e))
        # Chain explicitly so callers can inspect the original failure.
        raise GitHubApiCallFailed(
            "An error occurred due fetching forks via GitHub API"
        ) from e
    return response
11 changes: 5 additions & 6 deletions mirror/github/clone_repos.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
"""
Cloning repository workflow.
"""
import os
import json
import time
import traceback
import subprocess
from typing import Optional

import click
import requests

from ..settings import module_version
from .. import __version__
from .utils import get_nearest_value, read_command_type, forward_languages_config

DATETIME_HEADER = "Date"


class CommandNotExistError(Exception):
"""Raised when coomand is not exist."""
Expand Down Expand Up @@ -59,7 +58,7 @@ def create_dir_meta_if_not_exists(lang_path: str, meta_file: str, lang: str):
"language": lang,
"repos": [],
"crawled_at": None,
"mirror version": module_version,
"mirror version": __version__,
},
meta,
)
Expand Down
23 changes: 7 additions & 16 deletions mirror/github/commits.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,20 @@
import re
"""
Process commits for repository.
"""
import os
import csv
import sys
import json
import time
import glob
import zipfile
import string
import traceback
from pathlib import Path
from typing import Optional

from .utils import flatten_json, get_nearest_value

import requests
import click

from ..settings import GITHUB_TOKEN
from .. import settings
from .utils import write_with_size, read_command_type, request_with_limit
from .data import CommitPublic


DATETIME_HEADER = "Date"


validate_models = {"CommitPublic": CommitPublic}


Expand Down Expand Up @@ -204,13 +195,13 @@ def commits(
os.makedirs(crawldir)

if not token:
token = GITHUB_TOKEN
token = settings.GITHUB_TOKEN

headers = {
"accept": "application/vnd.github.v3+json",
}

if GITHUB_TOKEN is not None:
if settings.GITHUB_TOKEN is not None:
headers["Authorization"] = f"token {token}"
else:
click.echo(f"start with low rate limit")
Expand Down Expand Up @@ -264,7 +255,7 @@ def commits(
license = repo["license"]

# date of creating that commits file
date = commits_responce.headers.get(DATETIME_HEADER)
date = commits_responce.headers.get(settings.DATETIME_HEADER)

# Indexing
writer.writerow(
Expand Down
20 changes: 17 additions & 3 deletions mirror/github/data.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
# pylint: disable=no-name-in-module
# pylint: disable=no-self-argument

from datetime import datetime
from typing import Any, Dict, List, Optional
from typing import List, Optional

from pydantic import BaseModel, Field

Expand All @@ -19,3 +17,19 @@ class CommitPublic(MyBaseModel):
html_url: Optional[str] = None
author_html_url: Optional[str] = None
committer_html_url: Optional[str] = None


class RepositoryFork(BaseModel):
    """
    Summary of a single repository fork.

    Every field is optional and defaults to ``None``.
    """

    # Short name of the fork.
    name: Optional[str] = None
    # Full name of the fork.
    full_name: Optional[str] = None
    # Owner of the fork, stored as a plain string rather than a nested object.
    owner: Optional[str] = None
    # URL of the fork's HTML page.
    html_url: Optional[str] = None
    # NOTE(review): declared as a string although the name suggests a numeric
    # count; confirm whether Optional[int] was intended.
    forks_count: Optional[str] = None
    # Creation and last-update timestamps, kept as strings (not parsed here).
    created_at: Optional[str] = None
    updated_at: Optional[str] = None


class RepositoryForksList(BaseModel):
    """
    Forks collected for one repository, identified by owner and name.
    """

    # Owner of the repository whose forks are listed.
    owner: str
    # Name of the repository whose forks are listed.
    repo: str
    # Defaults to a fresh empty list per instance so that the default value
    # agrees with the declared List type.
    forks: List[RepositoryFork] = Field(default_factory=list)
71 changes: 71 additions & 0 deletions mirror/github/forks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import argparse
import logging
import time
from typing import List

from . import calls
from .data import RepositoryFork, RepositoryForksList

logger = logging.getLogger(__name__)


def get_repository_forks(
    owner: str, repo: str, sleep_interval: int = 1
) -> RepositoryForksList:
    """
    Collect the forks of a repository page by page and return them as pydantic data.

    Pages of 100 forks are requested through ``calls.fetch_repository_forks``
    starting at page 1 until an empty page is returned. The function sleeps
    ``sleep_interval`` seconds before every request, including the first one.

    This is a best-effort crawl: if fetching or parsing a page raises, the
    error is logged with its traceback, paging stops, and the forks collected
    so far are returned.

    Args:
        owner: Owner of the repository to inspect.
        repo: Name of the repository to inspect.
        sleep_interval: Seconds to wait before each request.

    Returns:
        A ``RepositoryForksList`` holding ``owner``, ``repo`` and every fork
        gathered before paging ended.
    """
    forks: List[RepositoryFork] = []

    page = 1
    while True:
        try:
            time.sleep(sleep_interval)
            forks_raw = calls.fetch_repository_forks(
                owner=owner, repo=repo, per_page=100, page=page
            )
            # Append one by one so that forks parsed before a failure are kept.
            for fork_raw in forks_raw:
                forks.append(_fork_from_raw(fork_raw))
        except Exception:
            # logger.exception records the traceback, so the reason for the
            # truncated result can be diagnosed from the logs.
            logger.exception("Unexpected error occurred due parsing repository forks")
            break

        if not forks_raw:
            logger.info(
                "Parsing of repository forks finished, total number of forks: %s",
                len(forks),
            )
            break
        page += 1

    return RepositoryForksList(owner=owner, repo=repo, forks=forks)


def _fork_from_raw(fork_raw: dict) -> RepositoryFork:
    """Convert one raw fork object from the API response into a RepositoryFork."""
    owner_dict = fork_raw.get("owner")
    return RepositoryFork(
        name=fork_raw.get("name"),
        full_name=fork_raw.get("full_name"),
        # Only the owner's login is kept; the owner object may be absent.
        owner=owner_dict.get("login") if owner_dict is not None else None,
        html_url=fork_raw.get("html_url"),
        forks_count=fork_raw.get("forks_count"),
        created_at=fork_raw.get("created_at"),
        updated_at=fork_raw.get("updated_at"),
    )


def cli_forks_handler(args: argparse.Namespace) -> None:
    """
    Handle the "forks" subcommand.

    Gathers the forks for ``args.owner`` / ``args.repo`` and prints the result
    serialized as JSON to standard output.
    """
    repository_forks = get_repository_forks(args.owner, args.repo)
    serialized = repository_forks.json()
    print(serialized)


def mutate_argparser(subcommand) -> None:
    """
    Register the "forks" subcommand on the provided subparsers object.

    The new subparser requires ``-o/--owner`` and ``-r/--repo`` and stores
    ``cli_forks_handler`` as its ``func`` default so the caller can dispatch to it.
    """
    forks_parser = subcommand.add_parser("forks", description="Mirror forks")

    # (short flag, long flag, help text) for each mandatory option.
    required_options = (
        ("-o", "--owner", "GitHub username or organization name"),
        ("-r", "--repo", "GitHub repository name"),
    )
    for short_flag, long_flag, help_text in required_options:
        forks_parser.add_argument(
            short_flag, long_flag, required=True, help=help_text
        )

    forks_parser.set_defaults(func=cli_forks_handler)
Loading