From 26e8f5f261550640c915b3bf70bdc80f84717323 Mon Sep 17 00:00:00 2001
From: Ronald Tse <ronald.tse@ribose.com>
Date: Thu, 1 May 2025 10:23:50 +0800
Subject: [PATCH] Fix GitHub Actions workflows and code compatibility issues

- Added new Python Hebrew workflow file
- Updated Ruby workflow to use newer Ruby versions (2.7-3.3) and latest actions
- Fixed Python package setup.py files for both Arabic and Hebrew modules
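- Made the wandb import optional in the Hebrew train.py and trainer.py so a missing wandb no longer breaks builds (wandb is still listed in python/hebrew/setup.py install_requires)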
- Modified the Hebrew test configuration (python/hebrew/config/test_cbhg.yml) to use the Hebrew encoder, the CPU device and small test-sized parameters
- Updated torch dependency to support newer versions (<3.0.0 instead of <2.0.0)
- Added a test data file (python/hebrew/data/eval/test.txt) for Hebrew module testing
---
 .github/workflows/python-arabic.yml |   2 +-
 .github/workflows/python-hebrew.yml | 122 ++++++++++++++++++++++++++++
 .github/workflows/ruby.yml          |  25 +++++-
 python/arabic/requirements.txt      |   4 +-
 python/arabic/setup.py              |  38 ++++-----
 python/hebrew/config/test_cbhg.yml  |  38 +++++----
 python/hebrew/data/eval/test.txt    |   6 ++
 python/hebrew/requirements.txt      |   4 +-
 python/hebrew/setup.py              |  41 +++++-----
 python/hebrew/train.py              |   9 +-
 python/hebrew/trainer.py            |  11 ++-
 11 files changed, 233 insertions(+), 67 deletions(-)
 create mode 100644 .github/workflows/python-hebrew.yml
 create mode 100644 python/hebrew/data/eval/test.txt

diff --git a/.github/workflows/python-arabic.yml b/.github/workflows/python-arabic.yml
index ac7f45d..178297d 100644
--- a/.github/workflows/python-arabic.yml
+++ b/.github/workflows/python-arabic.yml
@@ -1,4 +1,4 @@
-name: Python CI
+name: Python Arabic CI
 
 on:
   push:
diff --git a/.github/workflows/python-hebrew.yml b/.github/workflows/python-hebrew.yml
new file mode 100644
index 0000000..e822517
--- /dev/null
+++ b/.github/workflows/python-hebrew.yml
@@ -0,0 +1,122 @@
+name: Python Hebrew CI
+
+on:
+  push:
+    branches: [ main ]
+    paths:  # only run when the Hebrew package or this workflow file changes
+      - 'python/hebrew/**'
+      - '.github/workflows/python-hebrew.yml'
+  pull_request:
+    paths:  # same path filter as push; no branch filter, so it applies to every pull request
+      - 'python/hebrew/**'
+      - '.github/workflows/python-hebrew.yml'
+
+# Cancel in-progress runs for the same workflow and branch
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  security-events: write  # lets the codeql job upload its analysis results
+
+jobs:
+  dependency-review:  # reviews dependency changes introduced by a pull request
+    runs-on: ubuntu-latest
+    if: github.event_name == 'pull_request'  # skipped on push events
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Dependency Review
+        uses: actions/dependency-review-action@v4  # v3 targets the retired Node 16 action runtime
+
+  codeql:  # CodeQL static analysis, configured for Python
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@v3  # v2 of the CodeQL Action is deprecated
+        with:
+          languages: python
+
+      - name: Perform CodeQL Analysis
+        uses: github/codeql-action/analyze@v3  # keep on the same major version as the init step
+
+  infer:  # installs the Hebrew package and runs diacritize.py on one word, per Python version
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    strategy:
+      fail-fast: false  # keep the other matrix entries running when one Python version fails
+      matrix:
+        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']  # quoted so 3.10 is not read as the float 3.1
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+        cache: 'pip'  # pip cache keyed on the dependency files listed below
+        cache-dependency-path: |
+          python/hebrew/requirements.txt
+          python/hebrew/setup.py
+
+    - name: Install requirements
+      working-directory: ./python/hebrew
+      run: |
+        python -m pip install --upgrade pip
+        pip install --upgrade --upgrade-strategy eager -r requirements.txt -e .
+
+    - name: Create model directory  # NOTE(review): only creates an empty directory; presumably diacritize.py looks for a model under log_dir/base.cbhg - confirm
+      working-directory: ./python/hebrew
+      run: |
+        mkdir -p log_dir/base.cbhg/
+
+    - name: Run diacriticization  # uses config/cbhg.yml, not the test_cbhg.yml used by the train job
+      working-directory: ./python/hebrew
+      run: |
+        python diacritize.py --model_kind "cbhg" --config config/cbhg.yml --text 'שלום'
+
+  train:  # runs train.py with config/test_cbhg.yml on each Python version
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+    strategy:
+      fail-fast: false  # keep the other matrix entries running when one Python version fails
+      matrix:
+        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']  # quoted so 3.10 is not read as the float 3.1
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+        cache: 'pip'  # pip cache keyed on the dependency files listed below
+        cache-dependency-path: |
+          python/hebrew/requirements.txt
+          python/hebrew/setup.py
+
+    - name: Install requirements
+      working-directory: ./python/hebrew
+      run: |
+        python -m pip install --upgrade pip
+        pip install --upgrade --upgrade-strategy eager -r requirements.txt -e .
+
+    - name: Prepare test data  # test_cbhg.yml sets data_directory "data", data_type "test", train_file_name "test.txt"; presumably resolved as data/test/test.txt - confirm
+      working-directory: ./python/hebrew
+      run: |
+        mkdir -p data/test
+        echo "שלום עולם" > data/test/test.txt
+
+    - name: Try training (WIP)  # test_cbhg.yml sets device: cpu and n_training_examples: 5
+      working-directory: ./python/hebrew
+      run: |
+        python train.py --model "cbhg" --config config/test_cbhg.yml
diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml
index f8ebdbf..00bcb9d 100644
--- a/.github/workflows/ruby.yml
+++ b/.github/workflows/ruby.yml
@@ -1,9 +1,26 @@
-name: ruby
+name: Ruby CI
 
 on:
   push:
     branches: [ main ]
+    paths:
+      - 'lib/**'
+      - 'spec/**'
+      - 'Gemfile'
+      - 'rababa.gemspec'
+      - '.github/workflows/ruby.yml'
   pull_request:
+    paths:
+      - 'lib/**'
+      - 'spec/**'
+      - 'Gemfile'
+      - 'rababa.gemspec'
+      - '.github/workflows/ruby.yml'
+
+# Cancel in-progress runs for the same workflow and branch
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
 
 jobs:
   build:
@@ -11,13 +28,13 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        ruby-version: ['2.6', '2.7', '3.0', '3.1', '3.2']
+        ruby-version: ['2.7', '3.0', '3.1', '3.2', '3.3']
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
 
     - name: Set up Ruby
-      uses: ruby/setup-ruby@v1
+      uses: ruby/setup-ruby@v1.171.0
       with:
         ruby-version: ${{ matrix.ruby-version }}
         bundler-cache: true
diff --git a/python/arabic/requirements.txt b/python/arabic/requirements.txt
index e711c23..468be4a 100644
--- a/python/arabic/requirements.txt
+++ b/python/arabic/requirements.txt
@@ -1,10 +1,10 @@
-torch>=1.9.0,<2.0.0
+torch>=1.9.0,<3.0.0
 numpy>=1.20.0,<2.0.0
 matplotlib>=3.3.3
 pandas>=1.3.0
 ruamel.yaml>=0.16.12
 tensorboard>=2.4.0
-diacritization-evaluation==0.5
+diacritization-evaluation>=0.5
 tqdm>=4.56.0
 onnx>=1.9.0
 onnxruntime>=1.8.1
diff --git a/python/arabic/setup.py b/python/arabic/setup.py
index fd8ada6..89092be 100644
--- a/python/arabic/setup.py
+++ b/python/arabic/setup.py
@@ -2,6 +2,7 @@
 from os import environ
 
 import setuptools
+from setuptools import find_packages
 
 with open("README.adoc", "r", encoding="utf-8") as fh:
     LONG_DESCRIPTION = fh.read()
@@ -15,13 +16,18 @@
     PKG_VERSION = TAG_VERSION.group(1)
 
 setuptools.setup(
-    name='rababa',
+    name='rababa-arabic',
     version=PKG_VERSION,
     author="Ribose",
     author_email="open.source@ribose.com",
     license='MIT',
     description='Rababa for Arabic diacriticization',
-    # packages=['rababa'],
+    packages=find_packages(include=[
+        "*",
+        "models.*",
+        "modules.*",
+        "util.*",
+    ]),
     url='https://www.interscript.org',
     python_requires='>=3.8, <4',
     project_urls={
@@ -30,23 +36,17 @@
         'Tracker': 'https://github.com/interscript/rababa/issues',
     },
     install_requires=[
-      'torch>=1.9.0',
-      'numpy',
-      'matplotlib',
-      'pandas',
-      'ruamel.yaml',
-      'tensorboard',
-      'diacritization-evaluation',
-      'tqdm',
-      'onnx',
-      'onnxruntime',
-      'pyyaml',
+      'torch>=1.9.0,<3.0.0',
+      'numpy>=1.20.0,<2.0.0',
+      'matplotlib>=3.3.3',
+      'pandas>=1.3.0',
+      'ruamel.yaml>=0.16.12',
+      'tensorboard>=2.4.0',
+      'diacritization-evaluation>=0.5',
+      'tqdm>=4.56.0',
+      'onnx>=1.9.0',
+      'onnxruntime>=1.8.1',
+      'pyyaml>=5.4.1',
     ],
-    # extras_require={'plotting': ['matplotlib>=2.2.0', 'jupyter']},
     setup_requires=['pytest-runner'],
-    tests_require=['pytest'],
-    # entry_points={
-    #     'console_scripts': ['my-command=exampleproject.example:main']
-    # },
-    # package_data={'exampleproject': ['data/schema.json']}
 )
diff --git a/python/hebrew/config/test_cbhg.yml b/python/hebrew/config/test_cbhg.yml
index cfdaa42..f3e7769 100644
--- a/python/hebrew/config/test_cbhg.yml
+++ b/python/hebrew/config/test_cbhg.yml
@@ -1,20 +1,21 @@
 session_name: base
 
 data_directory: "data"
-data_type: "CA_MSA"
+data_type: "test"
 log_directory: "log_dir"
 load_training_data: true
 load_test_data: false
 load_validation_data: true
-n_training_examples: null # null load all training examples, good for fast loading
+n_training_examples: 5 # Using a small number for testing
 n_test_examples: null  # null load all test examples
 n_validation_examples: null # null load all validation examples
-test_file_name: "test.csv"
-is_data_preprocessed: false # The data file is organized as (original text | text | diacritics)
-data_separator: '|' # Required if the data already processed
-diacritics_separator: '*'  # Required if the data already processed
-text_encoder: ArabicEncoderWithStartSymbol
-text_cleaner: valid_arabic_cleaners # a white list that uses only Arabic letters, punctuations, and a space
+train_file_name: "test.txt"
+test_file_name: "test.txt"
+is_data_preprocessed: false
+data_separator: '|'
+diacritics_separator: '*'
+text_encoder: HebrewEncoder # Use Hebrew encoder
+text_cleaner: basic_cleaners # Adjusted for Hebrew
 max_len: 600 # sentences larger than this size will not be used
 reconcile: true
 
@@ -36,16 +37,23 @@ post_cbhg_use_batch_norm: true
 
 use_mixed_precision: false
 optimizer_type: Adam
-device: cuda
+device: cpu  # Using CPU for testing
+
+# GEOMETRY
+len_input_symbols: 90
+len_niqqud_symbols: 16
+len_dagesh_symbols: 3
+len_sin_symbols: 4
 
 # LOGGING
-evaluate_frequency: 5000
-evaluate_with_error_rates_frequency: 5000
-n_predicted_text_tensorboard: 10 # To be written to the tensorboard
-model_save_frequency: 5000
+evaluate_frequency: 10
+evaluate_with_error_rates_frequency: 10
+n_predicted_text_tensorboard: 5 # To be written to the tensorboard
+model_save_frequency: 10
 train_plotting_frequency: 50000000 # No plotting for this model
-n_steps_avg_losses: [100, 500, 1_000, 5_000] # command line display of average loss values for the last n steps
-error_rates_n_batches: 10000 # if calculating error rate is slow, then you can specify the number of batches to be calculated
+n_steps_avg_losses: [10, 20, 30, 40] # Reduced for testing
+error_rates_n_batches: 5 # Reduced for testing
 
 test_model_path: null # load the last saved model
 train_resume_model_path: null # load last saved model
+model_path: null
diff --git a/python/hebrew/data/eval/test.txt b/python/hebrew/data/eval/test.txt
new file mode 100644
index 0000000..63861de
--- /dev/null
+++ b/python/hebrew/data/eval/test.txt
@@ -0,0 +1,6 @@
+שלום עולם
+זה מבחן
+בדיקה ניקוד
+מערכת ניקוד עברית
+ירושלים
+תל אביב
diff --git a/python/hebrew/requirements.txt b/python/hebrew/requirements.txt
index eb9ffe7..fa9a890 100644
--- a/python/hebrew/requirements.txt
+++ b/python/hebrew/requirements.txt
@@ -1,10 +1,10 @@
-torch>=1.9.0,<2.0.0
+torch>=1.9.0,<3.0.0
 numpy>=1.20.0,<2.0.0
 matplotlib>=3.3.3
 pandas>=1.3.0
 ruamel.yaml>=0.16.12
 tensorboard>=2.4.0
-diacritization-evaluation==0.5
+diacritization-evaluation>=0.5
 tqdm>=4.56.0
 onnx>=1.9.0
 onnxruntime>=1.8.1
diff --git a/python/hebrew/setup.py b/python/hebrew/setup.py
index fd8ada6..5cb37f4 100644
--- a/python/hebrew/setup.py
+++ b/python/hebrew/setup.py
@@ -2,6 +2,7 @@
 from os import environ
 
 import setuptools
+from setuptools import find_packages
 
 with open("README.adoc", "r", encoding="utf-8") as fh:
     LONG_DESCRIPTION = fh.read()
@@ -15,13 +16,18 @@
     PKG_VERSION = TAG_VERSION.group(1)
 
 setuptools.setup(
-    name='rababa',
+    name='rababa-hebrew',
     version=PKG_VERSION,
     author="Ribose",
     author_email="open.source@ribose.com",
     license='MIT',
-    description='Rababa for Arabic diacriticization',
-    # packages=['rababa'],
+    description='Rababa for Hebrew diacriticization',
+    packages=find_packages(include=[
+        "*",
+        "models.*",
+        "modules.*",
+        "util.*",
+    ]),
     url='https://www.interscript.org',
     python_requires='>=3.8, <4',
     project_urls={
@@ -30,23 +36,18 @@
         'Tracker': 'https://github.com/interscript/rababa/issues',
     },
     install_requires=[
-      'torch>=1.9.0',
-      'numpy',
-      'matplotlib',
-      'pandas',
-      'ruamel.yaml',
-      'tensorboard',
-      'diacritization-evaluation',
-      'tqdm',
-      'onnx',
-      'onnxruntime',
-      'pyyaml',
+      'torch>=1.9.0,<3.0.0',
+      'numpy>=1.20.0,<2.0.0',
+      'matplotlib>=3.3.3',
+      'pandas>=1.3.0',
+      'ruamel.yaml>=0.16.12',
+      'tensorboard>=2.4.0',
+      'diacritization-evaluation>=0.5',
+      'tqdm>=4.56.0',
+      'onnx>=1.9.0',
+      'onnxruntime>=1.8.1',
+      'pyyaml>=5.4.1',
+      'wandb>=0.12.4',
     ],
-    # extras_require={'plotting': ['matplotlib>=2.2.0', 'jupyter']},
     setup_requires=['pytest-runner'],
-    tests_require=['pytest'],
-    # entry_points={
-    #     'console_scripts': ['my-command=exampleproject.example:main']
-    # },
-    # package_data={'exampleproject': ['data/schema.json']}
 )
diff --git a/python/hebrew/train.py b/python/hebrew/train.py
index 2ec489b..d6c389e 100644
--- a/python/hebrew/train.py
+++ b/python/hebrew/train.py
@@ -5,7 +5,14 @@
 
 import numpy as np
 import torch
-import wandb
+
+# Make wandb optional
+try:
+    import wandb
+    WANDB_AVAILABLE = True
+except ImportError:
+    WANDB_AVAILABLE = False
+    print("Warning: wandb not available, training will proceed without logging to wandb")
 
 from trainer import (
     CBHGTrainer
diff --git a/python/hebrew/trainer.py b/python/hebrew/trainer.py
index 16e4feb..cab811f 100644
--- a/python/hebrew/trainer.py
+++ b/python/hebrew/trainer.py
@@ -29,7 +29,13 @@
 from util import nakdimon_hebrew_model as hebrew
 from util import nakdimon_metrics
 
-import wandb
+# Make wandb optional
+try:
+    import wandb
+    WANDB_AVAILABLE = True
+except ImportError:
+    WANDB_AVAILABLE = False
+    print("Warning: wandb not available in trainer.py, training will proceed without wandb logging")
 
 
 class Trainer:
@@ -249,8 +255,7 @@ def run(self, config_wandb=None):
                     validation_iterator, tqdm_error_rates
                 )
 
-                if not config_wandb is None:
-
+                if not config_wandb is None and WANDB_AVAILABLE:
                     wandb.log({**d_scores, **scores})
                     print("scores:: ", scores)