Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .github/workflows/black.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Lint workflow: runs the psf/black action on every pull request.
name: Lint

on: [pull_request]

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
      - uses: psf/black@stable
47 changes: 28 additions & 19 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,30 +6,39 @@ on:
branches:
- master
workflow_dispatch:



jobs:
index:
name: Embed PEP descriptions
runs-on: ubuntu-18.04

update_stats:
runs-on: ubuntu-latest
steps:
- name: Setup Python
uses: actions/setup-python@v4.5.0
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.10

python-version: "3.13"
- name: Install uv
run: pip install uv

- name: Install pepembed
run: |
pip install .
uv pip install pepembed --system

- name: Run
run: |
pepembed \
--postgres-host ${{ secrets.POSTGRES_HOST }} \
--postgres-user ${{ secrets.POSTGRES_USER }} \
--postgres-password ${{ secrets.POSTGRES_PASSWORD }} \
--postgres-db ${{ secrets.POSTGRES_DB }}




pepembed
env:
POSTGRES_HOST: ${{ secrets.POSTGRES_HOST }}
POSTGRES_PASSWORD: ${{ secrets.POSTGRES_PASSWORD }}
POSTGRES_USER: ${{ secrets.POSTGRES_USER }}
POSTGRES_DB: ${{ secrets.POSTGRES_DB }}
QDRANT_HOST: ${{ secrets.QDRANT_HOST }}
QDRANT_API_KEY: ${{ secrets.QDRANT_API_KEY }}
HF_TOKEN: ${{ secrets.HF_TOKEN }}

- name: Commit report
run: |
git config --global user.name 'Oleksandr Khoroshevskyi'
git config --global user.email 'khoroshevskyi@users.noreply.github.com'
git add -A
git commit -m "Automated update of PEP embeddings"
git push
39 changes: 39 additions & 0 deletions .github/workflows/run-pytest.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
## We can't run the tests here, but let's at least install all dependencies and the package.
name: Installation test

on:
  push:
    branches: [dev]
  pull_request:
    branches: [master, dev]

jobs:
  pytest:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        python-version: ["3.10", "3.13"]
        os: [ubuntu-latest]

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install uv
        run: pip install uv

      # Dev requirements are optional: the step is a no-op when the file is absent.
      - name: Install dev dependencies
        run: if [ -f requirements/requirements-dev.txt ]; then uv pip install -r requirements/requirements-dev.txt --system; fi

      - name: Install package
        run: uv pip install . --system

      # Smoke test: the console script must at least start and print its help.
      - name: Run help
        run: pepembed --help

      # - name: Run pytest tests
      #   run: pytest tests -x -vv
6 changes: 2 additions & 4 deletions pepembed/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
""" Package-level data """
from ._version import __version__
import logmuse
"""Package-level data"""

logmuse.init_logger("geofetch")
from ._version import __version__
35 changes: 35 additions & 0 deletions pepembed/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import logging
import sys
import coloredlogs

from .argparser import app
from .const import PKG_NAME

# Package-level logger. Propagation is switched off so records are not also
# passed to handlers of ancestor loggers; coloredlogs attaches the handler
# that formats them.
_LOGGER = logging.getLogger(PKG_NAME)
_LOGGER.propagate = False
coloredlogs.install(
    fmt="[%(levelname)s] [%(asctime)s] [PEPEMBED] %(message)s",
    datefmt="%H:%M:%S",
    logger=_LOGGER,
)


# Add console handler to output logs
# handler = logging.StreamHandler()
# handler.setLevel(logging.INFO)
# formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# handler.setFormatter(formatter)
# _LOGGER.addHandler(handler)


def main() -> None:
    """Invoke the pepembed Typer application under the package's program name."""
    app(prog_name=PKG_NAME)


if __name__ == "__main__":
    # An interrupt (Ctrl-C) ends the run with a short message and exit
    # status 1 rather than a traceback.
    try:
        main()
    except KeyboardInterrupt:
        print("Pipeline aborted.")
        raise SystemExit(1)
2 changes: 1 addition & 1 deletion pepembed/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.0.1"
__version__ = "0.1.0"
218 changes: 92 additions & 126 deletions pepembed/argparser.py
Original file line number Diff line number Diff line change
@@ -1,132 +1,98 @@
from ubiquerg import VersionInHelpParser
import logging
import os
from typing import Optional

from . import __version__
from .const import *
from ._version import __version__ as pepembed_version


def build_argparser():
banner = "%(prog)s - Run embedding on PEPs"
additional_description = "pephub.databio.org"

parser = VersionInHelpParser(
prog=PKG_NAME,
description=banner,
epilog=additional_description,
version=pepembed_version,
)

parser.add_argument(
"--verbosity",
dest="verbosity",
type=int,
choices=range(len(LEVEL_BY_VERBOSITY)),
help="Choose level of verbosity (default: %(default)s)",
)

parser.add_argument(
"--dbg",
dest="dbg",
action="store_true",
help="Enable debug mode (default: %(default)s)",
)

parser.add_argument(
"-m",
"--hf-model",
dest="hf_model",
default="sentence-transformers/all-MiniLM-L12-v2",
help="Huggingface model registry (default: %(default)s)",
)

parser.add_argument(
"--keywords-file",
dest="keywords_file",
default=None,
help="File containing keywords to search for (default: %(default)s)",
)

parser.add_argument(
"--postgres-host",
dest="postgres_host",
default=None,
help="Postgres host (default: %(default)s)",
)

parser.add_argument(
"--postgres-port",
dest="postgres_port",
default=5432,
help="Postgres port (default: %(default)s)",
)
import typer
from dotenv import load_dotenv

parser.add_argument(
"--postgres-user",
dest="postgres_user",
default=None,
help="Postgres user (default: %(default)s)",
)

parser.add_argument(
"--postgres-password",
dest="postgres_password",
default=None,
help="Postgres password (default: %(default)s)",
)

parser.add_argument(
"--postgres-db",
dest="postgres_db",
default=None,
help="Postgres database (default: %(default)s)",
)

parser.add_argument(
"--qdrant-host",
dest="qdrant_host",
default=None,
help="Qdrant host (default: %(default)s)",
)

parser.add_argument(
"--qdrant-port",
dest="qdrant_port",
default=None,
help="Qdrant port (default: %(default)s)",
)

parser.add_argument(
"--qdrant-collection",
dest="qdrant_collection",
default=None,
help="Qdrant collection name (default: %(default)s)",
)
parser.add_argument(
"--qdrant-api-key",
dest="qdrant_api_key",
default=None,
help="Qdrant API key (default: %(default)s)",
)
from ._version import __version__ as pepembed_version
from .const import (
DEFAULT_BATCH_SIZE,
DENSE_ENCODER_MODEL,
PKG_NAME,
QDRANT_DEFAULT_COLLECTION,
SPARSE_ENCODER_MODEL,
)

parser.add_argument(
"--recreate-collection",
dest="recreate_collection",
action="store_true",
help="Recreate collection if it exists (default: %(default)s)",
)
_LOGGER = logging.getLogger(PKG_NAME)

parser.add_argument(
"--batch-size",
dest="batch_size",
default=100,
help="Batch size for embedding (default: %(default)s)",
)
# The Typer application object that the CLI command in this module is
# registered on; shell-completion options are not added.
app = typer.Typer(
    add_completion=False,
    name=PKG_NAME,
    help="Run embedding on PEPs",
    epilog="pephub.databio.org",
)

parser.add_argument(
"--upsert-batch-size",
dest="upsert_batch_size",
default=1000,
help="Batch size for upserting embeddings into qdrant (default: %(default)s)",
)

return parser
def build_argparser():
    """
    Return the module-level Typer application used for CLI parsing.

    The function name is retained from the earlier argparse-based
    implementation so existing callers of ``build_argparser`` keep working;
    the object returned is the Typer ``app`` defined in this module.
    """
    cli_app = app
    return cli_app


def version_callback(value: bool):
    """
    Print the pepembed version and stop the CLI when the version flag is set.

    :param value: parsed value of the version option; falsy values are ignored
    :raises typer.Exit: after the version string has been echoed
    """
    if not value:
        return
    typer.echo(f"pepembed version: {pepembed_version}")
    raise typer.Exit()


# CLI command. For the collection name and both encoder models the value is
# resolved in this order: explicit CLI option, then the matching environment
# variable (QDRANT_COLLECTION / HF_MODEL_DENSE / HF_MODEL_SPARSE), then the
# package default imported from .const.
# NOTE: the function docstring is kept short and unchanged on purpose, since
# Typer uses it as help text.
@app.command()
def main(
    qdrant_collection: Optional[str] = typer.Option(
        None,
        help="Qdrant collection name",
    ),
    recreate_collection: bool = typer.Option(
        True,
        help="Recreate collection if it exists",
    ),
    batch_size: int = typer.Option(
        DEFAULT_BATCH_SIZE,
        help="Batch size for embedding",
    ),
    dense_model: Optional[str] = typer.Option(
        None,
        help="HuggingFace dense encoder model",
    ),
    sparse_model: Optional[str] = typer.Option(
        None,
        help="HuggingFace sparse encoder model",
    ),
    env_var: Optional[str] = typer.Option(
        None,
        help="Path to .env file, if not set, will not load any .env file",
    ),
    version: Optional[bool] = typer.Option(
        None,
        "--version",
        "-v",
        callback=version_callback,
        # Eager so the version flag is handled before the other parameters
        # are processed.
        is_eager=True,
        help="App version",
    ),
):
    """Run embedding on PEPs"""
    # Import here to avoid circular imports
    from .pepembed import pepembed

    # Load the dotenv file first so the os.environ lookups below can see the
    # values it defines.
    if env_var:
        load_dotenv(dotenv_path=env_var)

    # `or` means an empty CLI value also falls through to the environment
    # variable / package default.
    collection_name = qdrant_collection or os.environ.get(
        "QDRANT_COLLECTION", QDRANT_DEFAULT_COLLECTION
    )
    hf_model_dense = dense_model or os.environ.get(
        "HF_MODEL_DENSE", DENSE_ENCODER_MODEL
    )
    hf_model_sparse = sparse_model or os.environ.get(
        "HF_MODEL_SPARSE", SPARSE_ENCODER_MODEL
    )

    pepembed(
        batch_size=batch_size,
        recreate_collection=recreate_collection,
        collection_name=collection_name,
        hf_model_dense=hf_model_dense,
        hf_model_sparse=hf_model_sparse,
    )


if __name__ == "__main__":
    # Use the package name as the program name, matching how
    # pepembed/__main__.py invokes the app, so usage/help output is the same
    # regardless of which module is executed.
    app(prog_name=PKG_NAME)
Loading
Loading