diff --git a/mirror/__init__.py b/mirror/__init__.py index ae889c6..6f2c04a 100644 --- a/mirror/__init__.py +++ b/mirror/__init__.py @@ -3,11 +3,11 @@ """ __author__ = "Bugout" __maintainer__ = __author__ -__description__ = "Tools for software project analysis" +__description__ = "Tools for GitHub software project analysis" __email__ = "engineering@bugout.dev" __license__ = "MIT" -__version__ = "0.2.6" +__version__ = "0.2.7" __all__ = ( "__author__", diff --git a/mirror/cli.py b/mirror/cli.py index 45922d2..611fa36 100644 --- a/mirror/cli.py +++ b/mirror/cli.py @@ -1,4 +1,5 @@ import click + from . import __version__ from .github.allrepos import crawl_handler as crawl_populator from .github.allrepos import nextid_handler as nextid_populator @@ -8,8 +9,6 @@ from .github.search import popular_repos from .github.clone_repos import clone_repos from .github.generate_snippets import generate_datasets -from .github.sync import handler as sync_populator -from .github.licenses import licenses_handler as licenses_populator @click.group() diff --git a/mirror/populate.py b/mirror/cli_argp.py similarity index 69% rename from mirror/populate.py rename to mirror/cli_argp.py index 8b80ac3..1567939 100644 --- a/mirror/populate.py +++ b/mirror/cli_argp.py @@ -6,6 +6,9 @@ import argparse from typing import Callable, Dict +from .github import forks +from . import __version__ + def populate_cli( parser: argparse.ArgumentParser, @@ -30,3 +33,20 @@ def populate_cli( for subcommand, populator in subcommand_populators.items(): subparser = subcommand_parsers.add_parser(subcommand) populator(subparser) + + +def main(): + parser = argparse.ArgumentParser( + description="Mirror: Tools for GitHub software project analysis", + epilog=f"Version {__version__}", + ) + subcommand = parser.add_subparsers(description="Mirror commands") + + forks.mutate_argparser(subcommand) + + args = parser.parse_args() + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/mirror/db/__init__.py b/mirror/db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mirror/github/db_tool.py b/mirror/db/db.py similarity index 100% rename from mirror/github/db_tool.py rename to mirror/db/db.py diff --git a/mirror/github/allrepos.py b/mirror/github/allrepos.py index 336272f..407f478 100644 --- a/mirror/github/allrepos.py +++ b/mirror/github/allrepos.py @@ -3,9 +3,6 @@ Support checkpointing against a small state object - the integer ID of the last repository seen. """ - -import argparse -import csv import json import glob import multiprocessing @@ -13,21 +10,17 @@ import random import sys import time -from typing import Any, Callable, Dict, Iterator, List, Optional, TextIO, Tuple +from typing import Any, Dict, Iterator, List, Tuple import click import requests from tqdm import tqdm # type: ignore -from ..populate import populate_cli -from ..settings import GITHUB_TOKEN +from ..settings import GITHUB_API_URL, GITHUB_TOKEN, REMAINING_RATELIMIT_HEADER subcommand = "allrepos" -REPOSITORIES_URL = "https://api.github.com/repositories" -REMAINING_RATELIMIT_HEADER = "X-RateLimit-Remaining" - def crawl( start_id: int, max_id: int, interval: float, min_rate_limit: int @@ -60,15 +53,16 @@ def crawl( "Accept": "application/vnd.github.v3+json", "User-Agent": "simiotics mirror", } - github_token = GITHUB_TOKEN - if github_token is not None and github_token != "": - headers["Authorization"] = f"token {github_token}" + if GITHUB_TOKEN is not None and GITHUB_TOKEN != "": + headers["Authorization"] = f"token {GITHUB_TOKEN}" since = start_id curr_rate_limit = min_rate_limit + 10 while since is not None and since < max_id and curr_rate_limit > min_rate_limit: time.sleep(interval) - r = requests.get(REPOSITORIES_URL, params={"since": since}, headers=headers) + r = requests.get( + f"{GITHUB_API_URL}/repositories", params={"since": since}, headers=headers + ) response_body = r.json() if not response_body: break diff --git a/mirror/github/calls.py b/mirror/github/calls.py new file mode 100644 index 0000000..2fd085f --- /dev/null +++ b/mirror/github/calls.py @@ -0,0 +1,46 @@ +""" +Processing requests to GitHub API. +""" +import logging +from typing import Any, Dict, List, Union + +import requests + +from ..settings import GITHUB_API_URL, GITHUB_API_REQUEST_TIMEOUT + +logger = logging.getLogger(__name__) + + +class GitHubApiCallFailed(Exception): + """ + Raised on actions that involve calls to GitHub API which are failed. + """ + + +def fetch_repository_forks( + owner: str, + repo: str, + sort: str = "newest", + per_page: int = 100, + page: int = 1, +) -> List[Dict[str, Any]]: + """ + Fetch forks for provided repository from GitHub. + """ + url = f"{GITHUB_API_URL}/repos/{owner}/{repo}/forks" + headers = {"Accept": "application/vnd.github.v3+json"} + params: Dict[str, Union[str, int]] = { + "sort": sort, + "per_page": per_page, + "page": page, + } + try: + r = requests.get( + url, headers=headers, params=params, timeout=GITHUB_API_REQUEST_TIMEOUT + ) + r.raise_for_status() + response = r.json() + except Exception as e: + logger.error(repr(e)) + raise GitHubApiCallFailed("An error occurred due fetching forks via GitHub API") + return response diff --git a/mirror/github/clone_repos.py b/mirror/github/clone_repos.py index d8c1588..67ead8c 100644 --- a/mirror/github/clone_repos.py +++ b/mirror/github/clone_repos.py @@ -1,18 +1,17 @@ +""" +Cloning repository workflow. +""" import os import json -import time import traceback import subprocess from typing import Optional import click -import requests -from ..settings import module_version +from .. import __version__ from .utils import get_nearest_value, read_command_type, forward_languages_config -DATETIME_HEADER = "Date" - class CommandNotExistError(Exception): """Raised when coomand is not exist.""" @@ -59,7 +58,7 @@ def create_dir_meta_if_not_exists(lang_path: str, meta_file: str, lang: str): "language": lang, "repos": [], "crawled_at": None, - "mirror version": module_version, + "mirror version": __version__, }, meta, ) diff --git a/mirror/github/commits.py b/mirror/github/commits.py index 973e0f0..2e626d1 100644 --- a/mirror/github/commits.py +++ b/mirror/github/commits.py @@ -1,29 +1,20 @@ -import re +""" +Process commits for repository. +""" import os import csv -import sys import json -import time -import glob import zipfile -import string -import traceback -from pathlib import Path from typing import Optional from .utils import flatten_json, get_nearest_value -import requests import click -from ..settings import GITHUB_TOKEN +from .. import settings from .utils import write_with_size, read_command_type, request_with_limit from .data import CommitPublic - -DATETIME_HEADER = "Date" - - validate_models = {"CommitPublic": CommitPublic} @@ -204,13 +195,13 @@ def commits( os.makedirs(crawldir) if not token: - token = GITHUB_TOKEN + token = settings.GITHUB_TOKEN headers = { "accept": "application/vnd.github.v3+json", } - if GITHUB_TOKEN is not None: + if settings.GITHUB_TOKEN is not None: headers["Authorization"] = f"token {token}" else: click.echo(f"start with low rate limit") @@ -264,7 +255,7 @@ def commits( license = repo["license"] # date of creating that commits file - date = commits_responce.headers.get(DATETIME_HEADER) + date = commits_responce.headers.get(settings.DATETIME_HEADER) # Indexing writer.writerow( diff --git a/mirror/github/data.py b/mirror/github/data.py index c97a537..d76c4ce 100644 --- a/mirror/github/data.py +++ b/mirror/github/data.py @@ -1,8 +1,6 @@ # pylint: disable=no-name-in-module # pylint: disable=no-self-argument - -from datetime import datetime -from typing import Any, Dict, List, Optional +from typing import List, Optional from pydantic import BaseModel, Field @@ -19,3 +17,19 @@ class CommitPublic(MyBaseModel): html_url: Optional[str] = None author_html_url: Optional[str] = None committer_html_url: Optional[str] = None + + +class RepositoryFork(BaseModel): + name: Optional[str] = None + full_name: Optional[str] = None + owner: Optional[str] = None + html_url: Optional[str] = None + forks_count: Optional[str] = None + created_at: Optional[str] = None + updated_at: Optional[str] = None + + +class RepositoryForksList(BaseModel): + owner: str + repo: str + forks: List[RepositoryFork] = Field(default_factory=set) diff --git a/mirror/github/forks.py b/mirror/github/forks.py new file mode 100644 index 0000000..47322ca --- /dev/null +++ b/mirror/github/forks.py @@ -0,0 +1,71 @@ +import argparse +import logging +import time +from typing import List + +from . import calls +from .data import RepositoryFork, RepositoryForksList + +logger = logging.getLogger(__name__) + + +def get_repository_forks( + owner: str, repo: str, sleep_interval: int = 1 +) -> RepositoryForksList: + """ + Parse repository forks and return organized pydantic data. + """ + forks: List[RepositoryFork] = [] + + page = 1 + while True: + try: + time.sleep(sleep_interval) + forks_raw = calls.fetch_repository_forks( + owner=owner, repo=repo, per_page=100, page=page + ) + for fork_raw in forks_raw: + owner_dict = fork_raw.get("owner") + forks.append( + RepositoryFork( + name=fork_raw.get("name"), + full_name=fork_raw.get("full_name"), + owner=owner_dict.get("login") + if owner_dict is not None + else None, + html_url=fork_raw.get("html_url"), + forks_count=fork_raw.get("forks_count"), + created_at=fork_raw.get("created_at"), + updated_at=fork_raw.get("updated_at"), + ) + ) + if len(forks_raw) == 0: + logger.info( + f"Parsing of repository forks finished, total number of forks: {len(forks)}" + ) + break + except Exception: + logger.error(f"Unexpected error occurred due parsing repository forks") + break + page += 1 + + return RepositoryForksList(owner=owner, repo=repo, forks=forks) + + +def cli_forks_handler(args: argparse.Namespace) -> None: + forks = get_repository_forks(args.owner, args.repo) + print(forks.json()) + + +def mutate_argparser(subcommand) -> None: + """ + Mutates the provided parser with GitHub Forks functionality. + """ + parser_forks = subcommand.add_parser("forks", description="Mirror forks") + parser_forks.add_argument( + "-o", "--owner", required=True, help="GitHub username or organization name" + ) + parser_forks.add_argument( + "-r", "--repo", required=True, help="GitHub repository name" + ) + parser_forks.set_defaults(func=cli_forks_handler) diff --git a/mirror/github/generate_snippets.py b/mirror/github/generate_snippets.py index 93e651c..afdd9de 100644 --- a/mirror/github/generate_snippets.py +++ b/mirror/github/generate_snippets.py @@ -1,3 +1,6 @@ +""" +Snippets generator. +""" import base64 from collections import defaultdict from datetime import datetime @@ -10,7 +13,8 @@ import click -from . import db_tool +from ..db import db +from .. import __version__ from .. import settings @@ -222,9 +226,9 @@ def generate_datasets( rows_step = chunksize if not clone_dir: - clone_dir = os.environ.get("CLONE_DIR") - if not clone_dir: - raise ReadReposDirectoryError("CLONE_DIR not set.") + clone_dir = settings.CLONE_DIR + if clone_dir is None: + raise ReadReposDirectoryError("CLONE_DIR environment variable must be set") # Read languages config file try: @@ -257,8 +261,8 @@ def generate_datasets( os.makedirs(snippets_dir) # Create connection - conn = db_tool.create_connection(os.path.join(snippets_dir, "snippets.db")) - db_tool.create_snippets_table(conn) + conn = db.create_connection(os.path.join(snippets_dir, "snippets.db")) + db.create_snippets_table(conn) crawled_repos: Dict[str, Dict[str, Union[str, None]]] = {} @@ -317,7 +321,7 @@ def generate_datasets( for chunk_data in chunks ] - db_tool.write_snippet_to_db(conn, batch) + db.write_snippet_to_db(conn, batch) if not chunks: break @@ -335,7 +339,7 @@ def generate_datasets( json.dump( { - "mirror version": settings.module_version, + "mirror version": __version__, "date": f"{datetime.now()}", "languages init config": language_to_extensions, "chunksize": chunksize, diff --git a/mirror/github/licenses.py b/mirror/github/licenses.py index 521887c..ccd10d9 100644 --- a/mirror/github/licenses.py +++ b/mirror/github/licenses.py @@ -1,8 +1,6 @@ """ -Collect license information for a repository or a list of repositories +Collect license information for a repository or a list of repositories. """ - -import argparse import json import os import sys @@ -11,7 +9,8 @@ from typing import Any, Dict, List import requests -from tqdm import tqdm # type: ignore + +from .. import settings subcommand = "licenses" @@ -33,7 +32,7 @@ def get_license(repo_api_url: str) -> Dict[str, Any]: "Accept": "application/vnd.github.v3+json", "User-Agent": "simiotics mirror", } - github_token = os.environ.get("GITHUB_TOKEN") + github_token = settings.GITHUB_TOKEN if github_token is not None and github_token != "": headers["Authorization"] = f"token {github_token}" diff --git a/mirror/github/search.py b/mirror/github/search.py index b67625d..e9762a4 100644 --- a/mirror/github/search.py +++ b/mirror/github/search.py @@ -1,24 +1,19 @@ +""" +Popular repositories search engine. +""" import os -import csv import json -import time import string import traceback import urllib.parse -from pathlib import Path -from typing import Optional, Tuple - +from typing import Optional import click -import requests -from ..settings import * +from .. import settings from .utils import forward_languages_config, request_with_limit -DATETIME_HEADER = "Date" - - class Error(Exception): """Base class for exceptions in this module.""" @@ -128,13 +123,13 @@ def popular_repos( """ if not token: - token = GITHUB_TOKEN + token = settings.GITHUB_TOKEN headers = { "accept": "application/vnd.github.v3+json", } - if GITHUB_TOKEN is not None: + if settings.GITHUB_TOKEN is not None: headers["Authorization"] = f"token {token}" else: click.echo(f"start with low rate limit") @@ -207,7 +202,7 @@ def popular_repos( write_repos( data, alredy_parsed, - search_response.headers.get(DATETIME_HEADER), + search_response.headers.get(settings.DATETIME_HEADER), files_counter, crawldir, language, diff --git a/mirror/github/sync.py b/mirror/github/sync.py index ceef18d..74cae82 100644 --- a/mirror/github/sync.py +++ b/mirror/github/sync.py @@ -1,16 +1,15 @@ """ -Synchronize repository metadata into a SQLite database +Synchronize repository metadata with local SQLite database. """ - -import argparse from datetime import datetime, timezone import json import sqlite3 import sys from typing import Any, Dict, Iterator, List, Optional, Tuple -from tqdm import tqdm # type: ignore import click +from tqdm import tqdm # type: ignore + from .allrepos import ordered_crawl diff --git a/mirror/github/utils.py b/mirror/github/utils.py index 89e0560..54434cb 100644 --- a/mirror/github/utils.py +++ b/mirror/github/utils.py @@ -7,8 +7,7 @@ import click import requests -REMAINING_RATELIMIT_HEADER = "X-RateLimit-Remaining" -X_RATELIMIT_RESET = "X-RateLimit-Reset" +from .. import settings def request_with_limit(url, headers, min_rate_limit): @@ -17,12 +16,12 @@ def request_with_limit(url, headers, min_rate_limit): response = requests.get(url, headers=headers) - rate_limit_raw = response.headers.get(REMAINING_RATELIMIT_HEADER) + rate_limit_raw = response.headers.get(settings.REMAINING_RATELIMIT_HEADER) if rate_limit_raw is not None: current_rate_limit = int(rate_limit_raw) if current_rate_limit <= min_rate_limit: - reset_time = response.headers.get(X_RATELIMIT_RESET) + reset_time = response.headers.get(settings.RESET_RATELIMIT_HEADER) time.sleep(abs(int(reset_time) - int(time.time())) + 1) else: break diff --git a/mirror/settings.py b/mirror/settings.py index 48ae83c..9c748ab 100644 --- a/mirror/settings.py +++ b/mirror/settings.py @@ -1,15 +1,15 @@ import os -import uuid -from typing import Optional -from . import __version__ +DATETIME_HEADER = "Date" -MODULE_NAME = "mirror" - -module_version = __version__ +GITHUB_API_URL = "https://api.github.com" +REMAINING_RATELIMIT_HEADER = "X-RateLimit-Remaining" +RESET_RATELIMIT_HEADER = "X-RateLimit-Reset" +GITHUB_API_REQUEST_TIMEOUT = 10 GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN") CLONE_DIR = os.environ.get("CLONE_DIR") + MIRROR_CRAWL_INTERVAL_SECONDS = os.environ.get("MIRROR_CRAWL_INTERVAL_SECONDS") MIRROR_CRAWL_MIN_RATE_LIMIT = os.environ.get("MIRROR_CRAWL_MIN_RATE_LIMIT") MIRROR_CRAWL_BATCH_SIZE = os.environ.get("MIRROR_CRAWL_BATCH_SIZE") diff --git a/setup.py b/setup.py index 4fe1256..9c8ede5 100644 --- a/setup.py +++ b/setup.py @@ -45,8 +45,11 @@ "requests", "tqdm", ], - extras_require={ - "dev": ["black", "mypy", "jupyter"] + extras_require={"dev": ["black", "mypy", "jupyter"]}, + entry_points={ + "console_scripts": [ + "{0} = {0}.cli:cli".format(MODULE_NAME), + "{0}-cli = {0}.cli_argp:main".format(MODULE_NAME) + ] }, - entry_points={"console_scripts": ["{0} = {0}.cli:cli".format(MODULE_NAME)]}, )