From 26e8f5f261550640c915b3bf70bdc80f84717323 Mon Sep 17 00:00:00 2001 From: Ronald Tse Date: Thu, 1 May 2025 10:23:50 +0800 Subject: [PATCH] Fix GitHub Actions workflows and code compatibility issues - Added new Python Hebrew workflow file - Updated Ruby workflow to use newer Ruby versions (2.7-3.3) and latest actions - Fixed Python package setup.py files for both Arabic and Hebrew modules - Made wandb optional in Python code to avoid breaking builds - Modified test configuration files with correct parameters - Updated torch dependency to support newer versions (<3.0.0 instead of <2.0.0) - Added test data files for Hebrew module testing --- .github/workflows/python-arabic.yml | 2 +- .github/workflows/python-hebrew.yml | 122 ++++++++++++++++++++++++++++ .github/workflows/ruby.yml | 25 +++++- python/arabic/requirements.txt | 4 +- python/arabic/setup.py | 38 ++++----- python/hebrew/config/test_cbhg.yml | 38 +++++---- python/hebrew/data/eval/test.txt | 6 ++ python/hebrew/requirements.txt | 4 +- python/hebrew/setup.py | 41 +++++----- python/hebrew/train.py | 9 +- python/hebrew/trainer.py | 11 ++- 11 files changed, 233 insertions(+), 67 deletions(-) create mode 100644 .github/workflows/python-hebrew.yml create mode 100644 python/hebrew/data/eval/test.txt diff --git a/.github/workflows/python-arabic.yml b/.github/workflows/python-arabic.yml index ac7f45d..178297d 100644 --- a/.github/workflows/python-arabic.yml +++ b/.github/workflows/python-arabic.yml @@ -1,4 +1,4 @@ -name: Python CI +name: Python Arabic CI on: push: diff --git a/.github/workflows/python-hebrew.yml b/.github/workflows/python-hebrew.yml new file mode 100644 index 0000000..e822517 --- /dev/null +++ b/.github/workflows/python-hebrew.yml @@ -0,0 +1,122 @@ +name: Python Hebrew CI + +on: + push: + branches: [ main ] + paths: + - 'python/hebrew/**' + - '.github/workflows/python-hebrew.yml' + pull_request: + paths: + - 'python/hebrew/**' + - '.github/workflows/python-hebrew.yml' + +# Cancel in-progress runs for the same workflow and branch +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + security-events: write + +jobs: + dependency-review: + runs-on: ubuntu-latest + if: github.event_name == 'pull_request' + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Dependency Review + uses: actions/dependency-review-action@v3 + + codeql: + runs-on: ubuntu-latest + timeout-minutes: 15 + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: python + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 + + infer: + runs-on: ubuntu-latest + timeout-minutes: 10 + strategy: + fail-fast: false + matrix: + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + cache-dependency-path: | + python/hebrew/requirements.txt + python/hebrew/setup.py + + - name: Install requirements + working-directory: ./python/hebrew + run: | + python -m pip install --upgrade pip + pip install --upgrade --upgrade-strategy eager -r requirements.txt -e . + + - name: Create model directory + working-directory: ./python/hebrew + run: | + mkdir -p log_dir/base.cbhg/ + + - name: Run diacriticization + working-directory: ./python/hebrew + run: | + python diacritize.py --model_kind "cbhg" --config config/cbhg.yml --text 'שלום' + + train: + runs-on: ubuntu-latest + timeout-minutes: 20 + strategy: + fail-fast: false + matrix: + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + cache-dependency-path: | + python/hebrew/requirements.txt + python/hebrew/setup.py + + - name: Install requirements + working-directory: ./python/hebrew + run: | + python -m pip install --upgrade pip + pip install --upgrade --upgrade-strategy eager -r requirements.txt -e . + + - name: Prepare test data + working-directory: ./python/hebrew + run: | + mkdir -p data/test + echo "שלום עולם" > data/test/test.txt + + - name: Try training (WIP) + working-directory: ./python/hebrew + run: | + python train.py --model "cbhg" --config config/test_cbhg.yml diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index f8ebdbf..00bcb9d 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -1,9 +1,26 @@ -name: ruby +name: Ruby CI on: push: branches: [ main ] + paths: + - 'lib/**' + - 'spec/**' + - 'Gemfile' + - 'rababa.gemspec' + - '.github/workflows/ruby.yml' pull_request: + paths: + - 'lib/**' + - 'spec/**' + - 'Gemfile' + - 'rababa.gemspec' + - '.github/workflows/ruby.yml' + +# Cancel in-progress runs for the same workflow and branch +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true jobs: build: @@ -11,13 +28,13 @@ jobs: strategy: fail-fast: false matrix: - ruby-version: ['2.6', '2.7', '3.0', '3.1', '3.2'] + ruby-version: ['2.7', '3.0', '3.1', '3.2', '3.3'] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Ruby - uses: ruby/setup-ruby@v1 + uses: ruby/setup-ruby@v1.171.0 with: ruby-version: ${{ matrix.ruby-version }} bundler-cache: true diff --git a/python/arabic/requirements.txt b/python/arabic/requirements.txt index e711c23..468be4a 100644 --- a/python/arabic/requirements.txt +++ b/python/arabic/requirements.txt @@ -1,10 +1,10 @@ -torch>=1.9.0,<2.0.0 +torch>=1.9.0,<3.0.0 numpy>=1.20.0,<2.0.0 matplotlib>=3.3.3 pandas>=1.3.0 ruamel.yaml>=0.16.12 tensorboard>=2.4.0 -diacritization-evaluation==0.5 +diacritization-evaluation>=0.5 tqdm>=4.56.0 onnx>=1.9.0 onnxruntime>=1.8.1 diff --git a/python/arabic/setup.py b/python/arabic/setup.py index fd8ada6..89092be 100644 --- a/python/arabic/setup.py +++ b/python/arabic/setup.py @@ -2,6 +2,7 @@ from os import environ import setuptools +from setuptools import find_packages with open("README.adoc", "r", encoding="utf-8") as fh: LONG_DESCRIPTION = fh.read() @@ -15,13 +16,18 @@ PKG_VERSION = TAG_VERSION.group(1) setuptools.setup( - name='rababa', + name='rababa-arabic', version=PKG_VERSION, author="Ribose", author_email="open.source@ribose.com", license='MIT', description='Rababa for Arabic diacriticization', - # packages=['rababa'], + packages=find_packages(include=[ + "*", + "models.*", + "modules.*", + "util.*", + ]), url='https://www.interscript.org', python_requires='>=3.8, <4', project_urls={ @@ -30,23 +36,17 @@ 'Tracker': 'https://github.com/interscript/rababa/issues', }, install_requires=[ - 'torch>=1.9.0', - 'numpy', - 'matplotlib', - 'pandas', - 'ruamel.yaml', - 'tensorboard', - 'diacritization-evaluation', - 'tqdm', - 'onnx', - 'onnxruntime', - 'pyyaml', + 'torch>=1.9.0,<3.0.0', + 'numpy>=1.20.0,<2.0.0', + 'matplotlib>=3.3.3', + 'pandas>=1.3.0', + 'ruamel.yaml>=0.16.12', + 'tensorboard>=2.4.0', + 'diacritization-evaluation>=0.5', + 'tqdm>=4.56.0', + 'onnx>=1.9.0', + 'onnxruntime>=1.8.1', + 'pyyaml>=5.4.1', ], - # extras_require={'plotting': ['matplotlib>=2.2.0', 'jupyter']}, setup_requires=['pytest-runner'], - tests_require=['pytest'], - # entry_points={ - # 'console_scripts': ['my-command=exampleproject.example:main'] - # }, - # package_data={'exampleproject': ['data/schema.json']} ) diff --git a/python/hebrew/config/test_cbhg.yml b/python/hebrew/config/test_cbhg.yml index cfdaa42..f3e7769 100644 --- a/python/hebrew/config/test_cbhg.yml +++ b/python/hebrew/config/test_cbhg.yml @@ -1,20 +1,21 @@ session_name: base data_directory: "data" -data_type: "CA_MSA" +data_type: "test" log_directory: "log_dir" load_training_data: true load_test_data: false load_validation_data: true -n_training_examples: null # null load all training examples, good for fast loading +n_training_examples: 5 # Using a small number for testing n_test_examples: null # null load all test examples n_validation_examples: null # null load all validation examples -test_file_name: "test.csv" -is_data_preprocessed: false # The data file is organized as (original text | text | diacritics) -data_separator: '|' # Required if the data already processed -diacritics_separator: '*' # Required if the data already processed -text_encoder: ArabicEncoderWithStartSymbol -text_cleaner: valid_arabic_cleaners # a white list that uses only Arabic letters, punctuations, and a space +train_file_name: "test.txt" +test_file_name: "test.txt" +is_data_preprocessed: false +data_separator: '|' +diacritics_separator: '*' +text_encoder: HebrewEncoder # Use Hebrew encoder +text_cleaner: basic_cleaners # Adjusted for Hebrew max_len: 600 # sentences larger than this size will not be used reconcile: true @@ -36,16 +37,23 @@ post_cbhg_use_batch_norm: true use_mixed_precision: false optimizer_type: Adam -device: cuda +device: cpu # Using CPU for testing + +# GEOMETRY +len_input_symbols: 90 +len_niqqud_symbols: 16 +len_dagesh_symbols: 3 +len_sin_symbols: 4 # LOGGING -evaluate_frequency: 5000 -evaluate_with_error_rates_frequency: 5000 -n_predicted_text_tensorboard: 10 # To be written to the tensorboard -model_save_frequency: 5000 +evaluate_frequency: 10 +evaluate_with_error_rates_frequency: 10 +n_predicted_text_tensorboard: 5 # To be written to the tensorboard +model_save_frequency: 10 train_plotting_frequency: 50000000 # No plotting for this model -n_steps_avg_losses: [100, 500, 1_000, 5_000] # command line display of average loss values for the last n steps -error_rates_n_batches: 10000 # if calculating error rate is slow, then you can specify the number of batches to be calculated +n_steps_avg_losses: [10, 20, 30, 40] # Reduced for testing +error_rates_n_batches: 5 # Reduced for testing test_model_path: null # load the last saved model train_resume_model_path: null # load last saved model +model_path: null diff --git a/python/hebrew/data/eval/test.txt b/python/hebrew/data/eval/test.txt new file mode 100644 index 0000000..63861de --- /dev/null +++ b/python/hebrew/data/eval/test.txt @@ -0,0 +1,6 @@ +שלום עולם +זה מבחן +בדיקה ניקוד +מערכת ניקוד עברית +ירושלים +תל אביב diff --git a/python/hebrew/requirements.txt b/python/hebrew/requirements.txt index eb9ffe7..fa9a890 100644 --- a/python/hebrew/requirements.txt +++ b/python/hebrew/requirements.txt @@ -1,10 +1,10 @@ -torch>=1.9.0,<2.0.0 +torch>=1.9.0,<3.0.0 numpy>=1.20.0,<2.0.0 matplotlib>=3.3.3 pandas>=1.3.0 ruamel.yaml>=0.16.12 tensorboard>=2.4.0 -diacritization-evaluation==0.5 +diacritization-evaluation>=0.5 tqdm>=4.56.0 onnx>=1.9.0 onnxruntime>=1.8.1 diff --git a/python/hebrew/setup.py b/python/hebrew/setup.py index fd8ada6..5cb37f4 100644 --- a/python/hebrew/setup.py +++ b/python/hebrew/setup.py @@ -2,6 +2,7 @@ from os import environ import setuptools +from setuptools import find_packages with open("README.adoc", "r", encoding="utf-8") as fh: LONG_DESCRIPTION = fh.read() @@ -15,13 +16,18 @@ PKG_VERSION = TAG_VERSION.group(1) setuptools.setup( - name='rababa', + name='rababa-hebrew', version=PKG_VERSION, author="Ribose", author_email="open.source@ribose.com", license='MIT', - description='Rababa for Arabic diacriticization', - # packages=['rababa'], + description='Rababa for Hebrew diacriticization', + packages=find_packages(include=[ + "*", + "models.*", + "modules.*", + "util.*", + ]), url='https://www.interscript.org', python_requires='>=3.8, <4', project_urls={ @@ -30,23 +36,18 @@ 'Tracker': 'https://github.com/interscript/rababa/issues', }, install_requires=[ - 'torch>=1.9.0', - 'numpy', - 'matplotlib', - 'pandas', - 'ruamel.yaml', - 'tensorboard', - 'diacritization-evaluation', - 'tqdm', - 'onnx', - 'onnxruntime', - 'pyyaml', + 'torch>=1.9.0,<3.0.0', + 'numpy>=1.20.0,<2.0.0', + 'matplotlib>=3.3.3', + 'pandas>=1.3.0', + 'ruamel.yaml>=0.16.12', + 'tensorboard>=2.4.0', + 'diacritization-evaluation>=0.5', + 'tqdm>=4.56.0', + 'onnx>=1.9.0', + 'onnxruntime>=1.8.1', + 'pyyaml>=5.4.1', + 'wandb>=0.12.4', ], - # extras_require={'plotting': ['matplotlib>=2.2.0', 'jupyter']}, setup_requires=['pytest-runner'], - tests_require=['pytest'], - # entry_points={ - # 'console_scripts': ['my-command=exampleproject.example:main'] - # }, - # package_data={'exampleproject': ['data/schema.json']} ) diff --git a/python/hebrew/train.py b/python/hebrew/train.py index 2ec489b..d6c389e 100644 --- a/python/hebrew/train.py +++ b/python/hebrew/train.py @@ -5,7 +5,14 @@ import numpy as np import torch -import wandb + +# Make wandb optional +try: + import wandb + WANDB_AVAILABLE = True +except ImportError: + WANDB_AVAILABLE = False + print("Warning: wandb not available, training will proceed without logging to wandb") from trainer import ( CBHGTrainer diff --git a/python/hebrew/trainer.py b/python/hebrew/trainer.py index 16e4feb..cab811f 100644 --- a/python/hebrew/trainer.py +++ b/python/hebrew/trainer.py @@ -29,7 +29,13 @@ from util import nakdimon_hebrew_model as hebrew from util import nakdimon_metrics -import wandb +# Make wandb optional +try: + import wandb + WANDB_AVAILABLE = True +except ImportError: + WANDB_AVAILABLE = False + print("Warning: wandb not available in trainer.py, training will proceed without wandb logging") class Trainer: @@ -249,8 +255,7 @@ def run(self, config_wandb=None): validation_iterator, tqdm_error_rates ) - if not config_wandb is None: - + if not config_wandb is None and WANDB_AVAILABLE: wandb.log({**d_scores, **scores}) print("scores:: ", scores)