From 50907c9932b4214ba64317ceebd59c07279b2364 Mon Sep 17 00:00:00 2001 From: Paul Landes Date: Thu, 22 May 2025 15:38:36 -0500 Subject: [PATCH 1/6] fix file mode of a non-executable text file --- requirements.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt old mode 100755 new mode 100644 From b831a8bd29f7aae136d0bd7f9b570f55dad9da36 Mon Sep 17 00:00:00 2001 From: Paul Landes Date: Thu, 22 May 2025 15:38:53 -0500 Subject: [PATCH 2/6] add basic NLP scoring methods package, unit tests, and automation --- makefile | 31 +++ pyhealth/nlp/metrics.py | 533 ++++++++++++++++++++++++++++++++++++++ requirements-nlp.txt | 5 + tests/base.py | 15 ++ tests/nlp/__init__.py | 0 tests/nlp/test_metrics.py | 50 ++++ 6 files changed, 634 insertions(+) create mode 100644 makefile create mode 100644 pyhealth/nlp/metrics.py create mode 100644 requirements-nlp.txt create mode 100644 tests/base.py create mode 100644 tests/nlp/__init__.py create mode 100644 tests/nlp/test_metrics.py diff --git a/makefile b/makefile new file mode 100644 index 000000000..a84670d1e --- /dev/null +++ b/makefile @@ -0,0 +1,31 @@ +#@meta {author: "Paul Landes"} +#@meta {desc: "PyHealth build automation", date: "2025-05-22"} + + +## Build +# +# directory with the unit tests +PY_TEST_DIR ?= tests +# test file glob pattern +PY_TEST_GLOB ?= test_metrics.py + + +## Targets +# +# install dependencies +.PHONY: deps +deps: + pip install -r requirements-nlp.txt + +# run the unit test cases +.PHONY: test +test: + @echo "Running tests in $(PY_TEST_DIR)/$(PY_TEST_GLOB)" + python -m unittest discover \ + -s $(PY_TEST_DIR) -p '$(PY_TEST_GLOB)' -v + +# clean derived objects +.PHONY: clean +clean: + @echo "removing __pycache__" + @find . 
-type d -name __pycache__ -prune -exec rm -r {} \; diff --git a/pyhealth/nlp/metrics.py b/pyhealth/nlp/metrics.py new file mode 100644 index 000000000..667a6665b --- /dev/null +++ b/pyhealth/nlp/metrics.py @@ -0,0 +1,533 @@ +"""Support for automatic scoring metrics. Cannibalized from the `Zensols +scoring module`_. + +.. _Zensols Scoring Module: https://plandes.github.io/nlparse/api/zensols.nlp.html#zensols-nlp-score + +""" +from __future__ import annotations +__author__ = 'Paul Landes' +from typing import ( + List, Tuple, Dict, Set, Optional, Iterable, Type, Union, ClassVar +) +from types import ModuleType +import logging +from dataclasses import dataclass, field +import dataclasses +from abc import ABCMeta, abstractmethod +import sys +import os +from functools import reduce +import re +import numpy as np +import pandas as pd + +logger = logging.getLogger(__name__) + + +class ScorerError(Exception): + """Raised for any scoring errors (this module).""" + pass + + +@dataclass +class Score(metaclass=ABCMeta): + """Individual scores returned from :class:`.ScoreMethod`. + + """ + def asrow(self, meth: str) -> Dict[str, float]: + return {f'{meth}_{x[0]}': x[1] for x in dataclasses.asdict(self).items()} + + +@dataclass(eq=False) +class ErrorScore(Score): + """A replacement instance when scoring fails from a raised exception. + + """ + method: str = field(repr=False) + """The method of the :class:`.ScoreMethod` that raised the exception.""" + + exception: Exception = field() + """The exception that was raised.""" + + replace_score: Score = field(default=None) + """The score to use in place of this score. Otherwise :meth:`asrow` return + a single :obj:`numpy.nan` like :class:`.FloatScore`. 
+ + """ + def asrow(self, meth: str) -> Dict[str, float]: + if self.replace_score is not None: + return self.replace_score.asrow(self.method) + return {self.method: np.nan} + + def __eq___(self, other) -> bool: + return self.method == other.method and \ + str(self.exception) == str(other.exeption) + + +@dataclass +class FloatScore(Score): + """Float container. This is needed to create the flat result container + structure. Object creation becomes less import since most clients will use + :meth:`.ScoreSet.asnumpy`. + + """ + NAN_INSTANCE: ClassVar[FloatScore] = None + """Used to add to ErrorScore for harmonic means replacements. + + """ + + value: float = field() + """The value of score.""" + + def asrow(self, meth: str) -> Dict[str, float]: + return {meth: self.value} + + +FloatScore.NAN_INSTANCE = FloatScore(np.nan) + + +@dataclass +class HarmonicMeanScore(Score): + """A score having a precision, recall and the harmonic mean of the two, + F-score.' + + """ + NAN_INSTANCE: ClassVar[HarmonicMeanScore] = None + """Used to add to ErrorScore for harmonic means replacements. + + """ + precision: float = field() + recall: float = field() + f_score: float = field() + + +HarmonicMeanScore.NAN_INSTANCE = HarmonicMeanScore(np.nan, np.nan, np.nan) + + +@dataclass +class ScoreResult(object): + """A result of scores created by a :class:`.ScoreMethod`. + + """ + scores: Dict[str, Tuple[Score, ...]] = field() + """The scores by method name.""" + + correlation_id: Optional[str] = field(default=None) + """An ID for correlating back to the sentence""" + + def __len__(self) -> int: + return len(self.scores) + + def __getitem__(self, k: str) -> Dict[str, Tuple[Score, ...]]: + return self.scores[k] + + +@dataclass +class ScoreContext(object): + """Input needed to create score(s) using :class:`.Scorer`. + + """ + pairs: Tuple[Tuple[str, str]] = field() + """Sentence, span or document pairs to score (order matters for some scoring + methods such as rouge). 
Depending on the scoring method the ordering of the + sentence pairs should be: + + * ``(, )`` + + * ``(, )`` + + * ``(, )`` + + See :class:`.ScoreMethod` implementations for more information about pair + ordering. + + """ + methods: Set[str] = field(default=None) + """A set of strings, each indicating the :class:`.ScoreMethod` used to score + :obj:`pairs`. + + """ + correlation_ids: Tuple[Union[int, str]] = field(default=None) + """The IDs to correlate with each sentence pair, or ``None`` to skip + correlating them. The length of this tuple must be that of :obj:`pairs`. + + """ + def __post_init__(self): + self.validate() + + def validate(self): + if self.correlation_ids is not None and \ + len(self.pairs) != len(self.correlation_ids): + raise ScorerError( + 'Expecting same length pairs to correlation IDs but got: ' + + f'{len(self.pairs)} != {len(self.correlation_ids)}') + + +@dataclass +class ScoreSet(object): + """All scores returned from :class:`.Scorer'. + + """ + results: Tuple[ScoreResult, ...] = field() + """A tuple with each element having the results of the respective sentence + pair in :obj:`.ScoreContext.sents`. Each elemnt is a dictionary with the + method are the keys with results as the values as output of the + :class:`.ScoreMethod`. This is created in :class:`.Scorer`. + + """ + correlation_id_col: str = field(default='id') + """The column name for the :obj:`.ScoreResult.correlation_id` added to Numpy + arrays and Pandas dataframes. If ``None``, then the correlation IDS are + used as the index. 
+ + """ + def __len__(self) -> int: + return len(self.results) + + def __iter__(self) -> Iterable[Dict[str, Tuple[Score, ...]]]: + return iter(self.results) + + def __getitem__(self, i: int) -> Dict[str, Tuple[Score, ...]]: + return self.results[i] + + @property + def has_correlation_id(self) -> bool: + """Whether the results have correlation IDs.""" + return len(self.results) > 0 and \ + self.results[0].correlation_id is not None + + def as_numpy(self, add_correlation: bool = True) -> \ + Tuple[List[str], np.ndarray]: + """Return the Numpy array with column descriptors of the results. Spacy + depends on Numpy, so this package will always be availale. + + :param add_correlation: whether to add the correlation ID (if there is + one), using :obj:`correlation_id_col` + + """ + cols: Set[str] = set() + rows: List[Dict[str, float]] = [] + result: ScoreResult + for result in self.results: + row: Dict[str, float] = {} + rows.append(row) + meth: str + for meth, result in result.scores.items(): + rdat: Dict[str, float] = result.asrow(meth) + row.update(rdat) + cols.update(rdat.keys()) + cols: List[str] = sorted(cols) + nd_rows: List[np.ndarray] = [] + for row in rows: + nd_rows.append(np.array(tuple(map(row.get, cols)))) + arr = np.stack(nd_rows) + if add_correlation and self.has_correlation_id: + ids = np.array(tuple(map(lambda r: r.correlation_id, self.results))) + ids = np.expand_dims(ids, 1) + arr = np.append(arr, ids, axis=1) + cols.append(self.correlation_id_col) + return cols, arr + + def as_dataframe(self, add_correlation: bool = True) -> pd.DataFrame: + """This gets data from :meth:`as_numpy` and returns it as a Pandas + dataframe. 
+ + :param add_correlation: whether to add the correlation ID (if there is + one), using :obj:`correlation_id_col` + + :return: an instance of :class:`pandas.DataFrame` of the results + + """ + import pandas as pd + cols, arr = self.as_numpy(add_correlation=False) + df = pd.DataFrame(arr, columns=cols) + if add_correlation and self.has_correlation_id: + # add as a dataframe, otherwise string correlation IDs cast the + # numpy array to a string + cid: str = self.correlation_id_col + cids: Tuple[Union[str, int]] = tuple( + map(lambda r: r.correlation_id, self.results)) + if cid is None: + df.index = cids + else: + cols: List[str] = df.columns.tolist() + df[cid] = cids + cols.insert(0, cid) + df = df[cols] + return df + + +@dataclass +class ScoreMethod(metaclass=ABCMeta): + """An abstract base class for scoring methods (bleu, rouge, etc). + + """ + reverse_sents: bool = field(default=False) + """Whether to reverse the order of the sentences.""" + + @staticmethod + def get_module(name: str) -> ModuleType: + """Return the module that has ``name``. 
+ + :param name: the string name, which can have dots (``.``) to for sub + modules + + """ + pkg_s = name.split('.') + mod = reduce(lambda m, n: getattr(m, n), pkg_s[1:], __import__(name)) + if logger.isEnabledFor(logging.DEBUG): + logger.debug(f'mod: {mod}') + return mod + + @classmethod + def _get_external_modules(cls: Type) -> Tuple[str, ...]: + """Return a list of external module names needed by this method.""" + return () + + @classmethod + def missing_modules(cls: Type) -> Tuple[str]: + """Return a list of missing modules neede by this score method.""" + missing: List[str] = [] + mod: str + for mod in cls._get_external_modules(): + mod_name: str = re.sub(r'^([^=<>~]+).*', r'\1', mod) + try: + cls.get_module(mod_name) + except ModuleNotFoundError: + missing.append(mod) + return missing + + @classmethod + def is_available(cls: Type) -> bool: + """Whether or not this method is available on this system.""" + return len(cls.missing_modules()) == 0 + + @abstractmethod + def _score(self, meth: str, context: ScoreContext) -> Iterable[Score]: + """See :meth:`score`""" + pass + + def score(self, meth: str, context: ScoreContext) -> Iterable[Score]: + """Score the sentences in ``context`` using method identifer ``meth``. 
+ + :param meth: the identifer such as ``bleu`` + + :param context: the context containing the data to score + + :return: the results, which are usually :class:`float` or + :class:`.Score` + + """ + scores: Iterable[Score] + if logger.isEnabledFor(logging.DEBUG): + logger.debug(f'scoring meth: {meth}, ' + + f'reverse: {self.reverse_sents}') + try: + if self.reverse_sents: + prev_pairs = context.pairs + try: + context.pairs = tuple(map( + lambda x: (x[1], x[0]), context.pairs)) + scores = self._score(meth, context) + finally: + context.pairs = prev_pairs + else: + scores = self._score(meth, context) + # force generators to realize scores and force any raised exceptions + scores = tuple(scores) + except Exception as e: + logger.info(e, exc_info=True) + scores = tuple([ErrorScore(meth, e)] * len(context.pairs)) + return scores + + +@dataclass +class LevenshteinDistanceScoreMethod(ScoreMethod): + """A scoring method that computes the Levenshtein distance. + + """ + normalize: bool = field(default=True) + """Whether to normalize the return value as the *distince / the max length + of both sentences*. + + """ + @classmethod + def _get_external_modules(cls: Type) -> Tuple[str, ...]: + return ('editdistance~=0.8.1',) + + def _score(self, meth: str, context: ScoreContext) -> Iterable[FloatScore]: + import editdistance + + for s1, s2 in context.pairs: + val: int = editdistance.eval(s1, s2) + if self.normalize: + text_len: int = max(len(s1), len(s2)) + val = 1. - (val / text_len) + val: float = val + yield FloatScore(val) + + +@dataclass +class BleuScoreMethod(ScoreMethod): + """The BLEU scoring method using the :mod:`nltk` package. The first + sentences are the references and the second are the hypothesis. + + """ + weights: Tuple[float, ...] = field(default=(0.25, 0.25, 0.25, 0.25)) + """Weights for each n-gram. For example: a tuple of float weights for + unigrams, bigrams, trigrams and so on can be given: ``weights = (0.1, 0.3, + 0.5, 0.1)``. 
+ + """ + silence_warnings: bool = field(default=True) + """Silence the BLEU warning of n-grams not matching ``The hypothesis + contains 0 counts of 3-gram overlaps...`` + + """ + def __post_init__(self): + if self.silence_warnings: + import warnings + # silence the BLEU warning of n-grams not matching + # The hypothesis contains 0 counts of 3-gram overlaps... + warnings.filterwarnings( + 'ignore', message='[.\n]+The hypothesis contains 0 counts.*') + + @classmethod + def _get_external_modules(cls: Type) -> Tuple[str, ...]: + return ('nltk==3.9.1',) + + def _score(self, meth: str, context: ScoreContext) -> Iterable[FloatScore]: + import nltk.translate.bleu_score as bleu + for s1, s2 in context.pairs: + val: float = bleu.sentence_bleu( + [s1.split()], s2.split(), + weights=self.weights) + yield FloatScore(val) + + +@dataclass +class RougeScoreMethod(ScoreMethod): + """The ROUGE scoring method using the :mod:`rouge_score` package. + + """ + @classmethod + def _get_external_modules(cls: Type) -> Tuple[str, ...]: + return ('rouge_score~=0.1.2',) + + def _score(self, meth: str, context: ScoreContext) -> \ + Iterable[HarmonicMeanScore]: + from rouge_score import rouge_scorer + + scorer = rouge_scorer.RougeScorer([meth]) + for s1, s2 in context.pairs: + res: Dict[str, Score] = scorer.score(s1, s2) + yield HarmonicMeanScore(*res[meth]) + + +@dataclass +class Scorer(object): + """A class that scores sentences using a set of registered methods + (:obj:`methods`). 
+ + """ + methods: Dict[str, ScoreMethod] = field( + default_factory=lambda: { + 'levenshtein': LevenshteinDistanceScoreMethod(), + 'bleu': BleuScoreMethod(), + 'rouge1': RougeScoreMethod(), + 'rouge2': RougeScoreMethod(), + 'rouge3': RougeScoreMethod(), + 'rouge4': RougeScoreMethod(), + 'rouge5': RougeScoreMethod(), + 'rouge6': RougeScoreMethod(), + 'rouge7': RougeScoreMethod(), + 'rouge8': RougeScoreMethod(), + 'rouge9': RougeScoreMethod(), + 'rougeL': RougeScoreMethod(), + }) + """The registered scoring methods availale, which are accessed from + :obj:`.ScoreContext.meth`. + + """ + default_methods: Set[str] = field(default=None) + """Methods (keys from :obj:`methods`) to use when none are provided in the + :obj:`.ScoreContext.meth` in the call to :meth:`score`. + + """ + def _install(self, req: str): + pybin: str = sys.executable + cmd: str = f"{pybin} -m pip install '{req}'" + res: int = os.system(cmd) + if res != 0: + raise ScorerError(f"Pip eror installing '{req}'") + + def _install_all(self, reqs: Tuple[str, ...]) -> bool: + req: str + for req in reqs: + try: + output: str = self._install(req) + if logger.isEnabledFor(logging.INFO): + logger.info(f'installed: {req}') + if logger.isEnabledFor(logging.DEBUG): + logger.debug(f'install output: <<{output}>>') + except Exception as e: + logger.warning( + f"could not install scoring requirement '{req}': {e}") + return False + return True + + def _get_missing_modules(self) -> Tuple[str, ...]: + missing: List[str] = [] + not_avail: List[str] = [] + name: str + meth: ScoreMethod + for name, meth in self.methods.items(): + missing: Tuple[str, ...] 
= meth.missing_modules() + if len(missing) > 0 and not self._install_all(missing): + logger.warning(f'method {meth} is not available: ' + + f'missing {missing}') + not_avail.append(name) + missing.extend(missing) + for name in not_avail: + del self.methods[name] + return tuple(missing) + + def score(self, context: ScoreContext) -> ScoreSet: + """Score the sentences in ``context``. + + :param context: the context containing the data to score + + :return: the results for each method indicated in ``context`` + + """ + by_meth: Dict[str, Tuple[Score, ...]] = {} + by_res: List[ScoreResult] = [] + meths: Iterable[str] = context.methods + if meths is None: + if self.default_methods is None: + meths = self.methods.keys() + else: + meths = self.default_methods + self._get_missing_modules() + meth: str + for meth in meths: + smeth: ScoreMethod = self.methods.get(meth) + if smeth is None: + raise ScorerError(f"No scoring method: '{meth}'") + by_meth[meth] = tuple(smeth.score(meth, context)) + for i in range(len(context.pairs)): + item_res: Dict[str, Score] = {} + corr_id: str = None + meth: str + if context.correlation_ids is not None: + corr_id = context.correlation_ids[i] + res_tup: Tuple[Score, ...] 
+ # for each scored pair + for meth, res_tup in by_meth.items(): + item_res[meth] = res_tup[i] + by_res.append(ScoreResult(item_res, correlation_id=corr_id)) + return ScoreSet(results=tuple(by_res)) + + def __call__(self, context: ScoreContext) -> ScoreSet: + """See :meth:`score`.""" + return self.score(context) diff --git a/requirements-nlp.txt b/requirements-nlp.txt new file mode 100644 index 000000000..b5e104b44 --- /dev/null +++ b/requirements-nlp.txt @@ -0,0 +1,5 @@ +numpy~=1.26.4 +pandas~=2.1.4 +scikit-learn~=1.3.2 +torch~=2.2.2 +transformers~=4.48.3 diff --git a/tests/base.py b/tests/base.py new file mode 100644 index 000000000..007200279 --- /dev/null +++ b/tests/base.py @@ -0,0 +1,15 @@ +from typing import Type +import unittest +import logging + + +class BaseTestCase(unittest.TestCase): + @staticmethod + def _setup_logging(level: int = logging.INFO): + logging.basicConfig(level=level) + + @classmethod + def _set_debug(cls: Type): + cls._setup_logging(logging.DEBUG) + print() + print('_' * 80) diff --git a/tests/nlp/__init__.py b/tests/nlp/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/nlp/test_metrics.py b/tests/nlp/test_metrics.py new file mode 100644 index 000000000..56039902f --- /dev/null +++ b/tests/nlp/test_metrics.py @@ -0,0 +1,50 @@ +from typing import List +import logging +from base import BaseTestCase +from pathlib import Path +import pandas as pd +from pyhealth.nlp.metrics import ( + LevenshteinDistanceScoreMethod, + ScoreContext, ScoreSet, ScoreResult, Scorer +) + + +class TestMetrics(BaseTestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # silence info messages from rouge package + self._setup_logging(logging.WARNING) + + def setUp(self): + s1: str = 'The boy threw the ball. He practiced every day.' + s2: str = 'The boy threw X the ball. He practiced every day.' 
self.pairs: List[List[str]] = [[s1, s1], [s1, s2]] + + def _test_object_graph(self): + # configure only the edit distance method + scorer = Scorer( + methods={'editdistance': LevenshteinDistanceScoreMethod()}, + ) + ss: ScoreSet = scorer.score(ScoreContext(self.pairs)) + self.assertEqual(ScoreSet, type(ss)) + self.assertEqual(2, len(ss.results)) + res1: ScoreResult = ss.results[0] + self.assertEqual(ScoreResult, type(res1)) + self.assertEqual(1, len(res1.scores)) + self.assertTrue('editdistance' in res1.scores) + self.assertEqual(1., res1.scores['editdistance'].value) + + def test_pandas(self): + WRITE: bool = 0 + should_file: Path = Path('test-resources/nlp/metrics.csv') + self._set_debug() + scorer = Scorer() + ss: ScoreSet = scorer.score(ScoreContext(self.pairs)) + df: pd.DataFrame = ss.as_dataframe() + # give tolerance for arch high sig digits that might be off by epsilon + df = df.round(4) + if WRITE: + should_file.parent.mkdir(parents=True, exist_ok=True) + df.to_csv(should_file, index=False) + should: pd.DataFrame = pd.read_csv(should_file) + self.assertEqual(should.to_string(), df.to_string()) From 1afdce1271bf15334ca0458ea43cfe500d530a1e Mon Sep 17 00:00:00 2001 From: Paul Landes Date: Thu, 22 May 2025 15:41:23 -0500 Subject: [PATCH 3/6] fix unit tests --- .github/workflows/run-unit-tests.yml | 29 ---------------------- .github/workflows/test.yml | 36 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 29 deletions(-) delete mode 100644 .github/workflows/run-unit-tests.yml create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/run-unit-tests.yml b/.github/workflows/run-unit-tests.yml deleted file mode 100644 index cd2b6703f..000000000 --- a/.github/workflows/run-unit-tests.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: pytest unittest action script - -on: workflow_dispatch - -jobs: - build: - - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.9"] - - steps: - - uses: actions/checkout@v3 - - name: Set up 
Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - cache: 'pip' # caching pip dependencies - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - - - name: Test with pytest - run: | - cd pyhealth/unittests - pytest -rP \ No newline at end of file diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 000000000..594c5eec0 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,36 @@ +name: CI + +on: + pull_request: + push: + paths-ignore: + - '**.md' + - 'doc/*' + branches: + - master + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.11'] + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + + - name: Display Python version + run: python -c "import sys; print(sys.version)" + + - name: Install dependent packages + run: 'make deps' + + - name: Run tests + run: 'make test' From 787fe4894cb3345019a009e47054756bce08559f Mon Sep 17 00:00:00 2001 From: Paul Landes Date: Thu, 22 May 2025 15:42:34 -0500 Subject: [PATCH 4/6] fix unit tests --- tests/nlp/test_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nlp/test_metrics.py b/tests/nlp/test_metrics.py index 56039902f..0536153e8 100644 --- a/tests/nlp/test_metrics.py +++ b/tests/nlp/test_metrics.py @@ -20,7 +20,7 @@ def setUp(self): s2: str = 'The boy threw X the ball. He practiced every day.' 
self.pairs: List[List[str]] = [[s1, s1], [s1, s2]] - def _test_object_graph(self): + def test_object_graph(self): # configure only the edit distance method scorer = Scorer( methods={'editdistance': LevenshteinDistanceScoreMethod()}, From da40b62f388e426d1d77323e8b3e4dd050ed155b Mon Sep 17 00:00:00 2001 From: Paul Landes Date: Thu, 22 May 2025 15:46:05 -0500 Subject: [PATCH 5/6] add test resources; fix tests --- .gitignore | 2 +- test-resources/nlp/metrics.csv | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 test-resources/nlp/metrics.csv diff --git a/.gitignore b/.gitignore index a1b8c8b72..85a6aa6df 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ .idea *DS_Store* *output* -*resource* +resource* debug_entry* playground.ipynb diff --git a/test-resources/nlp/metrics.csv b/test-resources/nlp/metrics.csv new file mode 100644 index 000000000..24f50dd16 --- /dev/null +++ b/test-resources/nlp/metrics.csv @@ -0,0 +1,3 @@ +bleu,levenshtein,rouge1_f_score,rouge1_precision,rouge1_recall,rouge2_f_score,rouge2_precision,rouge2_recall,rouge3_f_score,rouge3_precision,rouge3_recall,rouge4_f_score,rouge4_precision,rouge4_recall,rouge5_f_score,rouge5_precision,rouge5_recall,rouge6_f_score,rouge6_precision,rouge6_recall,rouge7_f_score,rouge7_precision,rouge7_recall,rouge8_f_score,rouge8_precision,rouge8_recall,rouge9_f_score,rouge9_precision,rouge9_recall,rougeL_f_score,rougeL_precision,rougeL_recall +1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 +0.658,0.9592,0.9474,0.9,1.0,0.8235,0.7778,0.875,0.6667,0.625,0.7143,0.4615,0.4286,0.5,0.3636,0.3333,0.4,0.2222,0.2,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9474,0.9,1.0 From 2e26dd3abee8581e8b146bf53ffc4de4e9a5c260 Mon Sep 17 00:00:00 2001 From: Paul Landes Date: Thu, 22 May 2025 15:48:54 -0500 Subject: [PATCH 6/6] add CI build status badge --- README.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.rst 
b/README.rst index 8bd3cc1a4..0e44678a1 100644 --- a/README.rst +++ b/README.rst @@ -39,6 +39,9 @@ Welcome to PyHealth! :target: https://www.youtube.com/playlist?list=PLR3CNIF8DDHJUl8RLhyOVpX_kT4bxulEV :alt: YouTube +.. image:: https://github.com/sunlabuiuc/PyHealth/workflows/CI/badge.svg + :target: https://github.com/sunlabuiuc/PyHealth/actions + :alt: CI status .. -----