From 5a649c58d7f75579d5cbfdf9a7f1f286a2edd6b5 Mon Sep 17 00:00:00 2001 From: aryo Date: Mon, 26 Jul 2021 21:11:46 +0200 Subject: [PATCH 01/12] replacing setup.py with poetry --- .gitignore | 147 ++++++++++- poetry.lock | 658 +++++++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 29 +++ setup.py | 28 --- 4 files changed, 825 insertions(+), 37 deletions(-) create mode 100644 poetry.lock create mode 100644 pyproject.toml delete mode 100644 setup.py diff --git a/.gitignore b/.gitignore index 434598a..cf1ee9e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,12 +1,147 @@ -# Python cache files -*.pyc +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ # misc. *.DS_Store *.vscode *.bash_history misc/ -.coverage # data files /data/*.csv @@ -33,9 +168,3 @@ misc/ *.out *.gz *.toc - -# build files -build/ -pyComBat_test.egg-info/ -ComBat.egg-info/ -dist/ \ No newline at end of file diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..a9f856a --- /dev/null +++ b/poetry.lock @@ -0,0 +1,658 @@ +[[package]] +name = "appdirs" +version = "1.4.4" +description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "atomicwrites" +version = "1.4.0" +description = "Atomic file writes." +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "attrs" +version = "21.2.0" +description = "Classes Without Boilerplate" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[package.extras] +dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit"] +docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] +tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface"] +tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins"] + +[[package]] +name = "black" +version = "21.7b0" +description = "The uncompromising code formatter." +category = "dev" +optional = false +python-versions = ">=3.6.2" + +[package.dependencies] +appdirs = "*" +click = ">=7.1.2" +mypy-extensions = ">=0.4.3" +pathspec = ">=0.8.1,<1" +regex = ">=2020.1.8" +tomli = ">=0.2.6,<2.0.0" +typed-ast = {version = ">=1.4.2", markers = "python_version < \"3.8\""} +typing-extensions = {version = ">=3.7.4", markers = "python_version < \"3.8\""} + +[package.extras] +colorama = ["colorama (>=0.4.3)"] +d = ["aiohttp (>=3.6.0)", "aiohttp-cors (>=0.4.0)"] +python2 = ["typed-ast (>=1.4.2)"] +uvloop = ["uvloop (>=0.15.2)"] + +[[package]] +name = "click" +version = "8.0.1" +description = "Composable command line interface toolkit" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} + +[[package]] +name = "colorama" +version = "0.4.4" +description = "Cross-platform colored terminal text." +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[[package]] +name = "importlib-metadata" +version = "1.7.0" +description = "Read metadata from Python packages" +category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" + +[package.dependencies] +zipp = ">=0.5" + +[package.extras] +docs = ["sphinx", "rst.linker"] +testing = ["packaging", "pep517", "importlib-resources (>=1.3)"] + +[[package]] +name = "iniconfig" +version = "1.1.1" +description = "iniconfig: brain-dead simple config-ini parsing" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "joblib" +version = "1.0.1" +description = "Lightweight pipelining with Python functions" +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "mpmath" +version = "1.2.1" +description = "Python library for arbitrary-precision floating-point arithmetic" +category = "main" +optional = false +python-versions = "*" + +[package.extras] +develop = ["pytest (>=4.6)", "pycodestyle", "pytest-cov", "codecov", "wheel"] +tests = ["pytest (>=4.6)"] + +[[package]] +name = "mypy-extensions" +version = "0.4.3" +description = "Experimental type system extensions for programs checked with the mypy typechecker." +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "numpy" +version = "1.21.1" +description = "NumPy is the fundamental package for array computing with Python." +category = "main" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "packaging" +version = "21.0" +description = "Core utilities for Python packages" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +pyparsing = ">=2.0.2" + +[[package]] +name = "pandas" +version = "1.1.5" +description = "Powerful data structures for data analysis, time series, and statistics" +category = "main" +optional = false +python-versions = ">=3.6.1" + +[package.dependencies] +numpy = ">=1.15.4" +python-dateutil = ">=2.7.3" +pytz = ">=2017.2" + +[package.extras] +test = ["pytest (>=4.0.2)", "pytest-xdist", "hypothesis (>=3.58)"] + +[[package]] +name = "pathspec" +version = "0.9.0" +description = "Utility library for gitignore style pattern matching of file paths." +category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" + +[[package]] +name = "patsy" +version = "0.5.1" +description = "A Python package for describing statistical models and for building design matrices." +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +numpy = ">=1.4" +six = "*" + +[[package]] +name = "pluggy" +version = "0.13.1" +description = "plugin and hook calling mechanisms for python" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.dependencies] +importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} + +[package.extras] +dev = ["pre-commit", "tox"] + +[[package]] +name = "poetry-core" +version = "1.0.3" +description = "Poetry PEP 517 Build Backend" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[package.dependencies] +importlib-metadata = {version = ">=1.7.0,<2.0.0", markers = "python_version >= \"2.7\" and python_version < \"2.8\" or python_version >= \"3.5\" and python_version < \"3.8\""} + +[[package]] +name = "poetry2setup" +version = "1.0.0" +description = "Convert python-poetry to setup.py" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[package.dependencies] +poetry-core = ">=1.0.0,<2.0.0" + +[[package]] +name = "py" +version = "1.10.0" +description = "library with cross-python path, ini-parsing, io, code, log facilities" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "pyparsing" +version = "2.4.7" +description = "Python parsing module" +category = "dev" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "pytest" +version = "6.2.4" +description = "pytest: simple powerful testing with Python" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} +attrs = ">=19.2.0" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<1.0.0a1" +py = ">=1.8.2" +toml = "*" + +[package.extras] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] + +[[package]] +name = "python-dateutil" +version = "2.8.2" +description = "Extensions to the standard Python datetime module" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "pytz" +version = "2021.1" +description = "World timezone definitions, modern and historical" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "regex" +version = "2021.7.6" +description = "Alternative regular expression module, to replace re." +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "scikit-learn" +version = "0.23.1" +description = "A set of python modules for machine learning and data mining" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +joblib = ">=0.11" +numpy = ">=1.13.3" +scipy = ">=0.19.1" +threadpoolctl = ">=2.0.0" + +[package.extras] +alldeps = ["numpy (>=1.13.3)", "scipy (>=0.19.1)"] + +[[package]] +name = "scipy" +version = "1.6.1" +description = "SciPy: Scientific Library for Python" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +numpy = ">=1.16.5" + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "threadpoolctl" +version = "2.2.0" +description = "threadpoolctl" +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "toml" +version = "0.10.2" +description = "Python Library for Tom's Obvious, Minimal Language" +category = "dev" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "tomli" +version = "1.1.0" +description = "A lil' TOML parser" +category = "dev" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "typed-ast" +version = "1.4.3" +description = "a fork of Python 2 and 3 ast modules with type comment support" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "typing-extensions" +version = "3.10.0.0" +description = "Backported and Experimental Type Hints for Python 3.5+" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "zipp" +version = "3.5.0" +description = "Backport of pathlib-compatible object wrapper for zip files" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.extras] +docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"] +testing = ["pytest (>=4.6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy"] + +[metadata] +lock-version = "1.1" +python-versions = "^3.7" +content-hash = "b8858bb42035be5883fcf2dc212b086044f13dbf3cee658b65c04f580aa7637c" + +[metadata.files] +appdirs = [ + {file = "appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"}, + {file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"}, +] +atomicwrites = [ + {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, + {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, +] +attrs = [ + {file = "attrs-21.2.0-py2.py3-none-any.whl", hash = "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1"}, + {file = "attrs-21.2.0.tar.gz", hash = "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"}, +] +black = [ + {file = "black-21.7b0-py3-none-any.whl", hash = "sha256:1c7aa6ada8ee864db745b22790a32f94b2795c253a75d6d9b5e439ff10d23116"}, + {file = "black-21.7b0.tar.gz", hash = "sha256:c8373c6491de9362e39271630b65b964607bc5c79c83783547d76c839b3aa219"}, +] +click = [ + {file = "click-8.0.1-py3-none-any.whl", hash = "sha256:fba402a4a47334742d782209a7c79bc448911afe1149d07bdabdf480b3e2f4b6"}, + {file = "click-8.0.1.tar.gz", hash = "sha256:8c04c11192119b1ef78ea049e0a6f0463e4c48ef00a30160c704337586f3ad7a"}, +] +colorama = [ + {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, + {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, +] +importlib-metadata = [ + {file = "importlib_metadata-1.7.0-py2.py3-none-any.whl", hash = "sha256:dc15b2969b4ce36305c51eebe62d418ac7791e9a157911d58bfb1f9ccd8e2070"}, + {file = "importlib_metadata-1.7.0.tar.gz", hash = "sha256:90bb658cdbbf6d1735b6341ce708fc7024a3e14e99ffdc5783edea9f9b077f83"}, +] +iniconfig = [ + {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, + {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, +] +joblib = [ + {file = "joblib-1.0.1-py3-none-any.whl", hash = "sha256:feeb1ec69c4d45129954f1b7034954241eedfd6ba39b5e9e4b6883be3332d5e5"}, + {file = "joblib-1.0.1.tar.gz", hash = "sha256:9c17567692206d2f3fb9ecf5e991084254fe631665c450b443761c4186a613f7"}, +] +mpmath = [ + {file = "mpmath-1.2.1-py3-none-any.whl", hash = "sha256:604bc21bd22d2322a177c73bdb573994ef76e62edd595d17e00aff24b0667e5c"}, + {file = "mpmath-1.2.1.tar.gz", hash = "sha256:79ffb45cf9f4b101a807595bcb3e72e0396202e0b1d25d689134b48c4216a81a"}, +] +mypy-extensions = [ + {file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"}, + {file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"}, +] +numpy = [ + {file = "numpy-1.21.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:38e8648f9449a549a7dfe8d8755a5979b45b3538520d1e735637ef28e8c2dc50"}, + {file = "numpy-1.21.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:fd7d7409fa643a91d0a05c7554dd68aa9c9bb16e186f6ccfe40d6e003156e33a"}, + {file = "numpy-1.21.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a75b4498b1e93d8b700282dc8e655b8bd559c0904b3910b144646dbbbc03e062"}, + {file = "numpy-1.21.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1412aa0aec3e00bc23fbb8664d76552b4efde98fb71f60737c83efbac24112f1"}, + {file = "numpy-1.21.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e46ceaff65609b5399163de5893d8f2a82d3c77d5e56d976c8b5fb01faa6b671"}, + {file = "numpy-1.21.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:c6a2324085dd52f96498419ba95b5777e40b6bcbc20088fddb9e8cbb58885e8e"}, + {file = "numpy-1.21.1-cp37-cp37m-win32.whl", hash = "sha256:73101b2a1fef16602696d133db402a7e7586654682244344b8329cdcbbb82172"}, + {file = "numpy-1.21.1-cp37-cp37m-win_amd64.whl", hash = "sha256:7a708a79c9a9d26904d1cca8d383bf869edf6f8e7650d85dbc77b041e8c5a0f8"}, + {file = "numpy-1.21.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:95b995d0c413f5d0428b3f880e8fe1660ff9396dcd1f9eedbc311f37b5652e16"}, + {file = "numpy-1.21.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:635e6bd31c9fb3d475c8f44a089569070d10a9ef18ed13738b03049280281267"}, + {file = "numpy-1.21.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4a3d5fb89bfe21be2ef47c0614b9c9c707b7362386c9a3ff1feae63e0267ccb6"}, + {file = "numpy-1.21.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:8a326af80e86d0e9ce92bcc1e65c8ff88297de4fa14ee936cb2293d414c9ec63"}, + {file = "numpy-1.21.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:791492091744b0fe390a6ce85cc1bf5149968ac7d5f0477288f78c89b385d9af"}, + {file = "numpy-1.21.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0318c465786c1f63ac05d7c4dbcecd4d2d7e13f0959b01b534ea1e92202235c5"}, + {file = "numpy-1.21.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9a513bd9c1551894ee3d31369f9b07460ef223694098cf27d399513415855b68"}, + {file = "numpy-1.21.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:91c6f5fc58df1e0a3cc0c3a717bb3308ff850abdaa6d2d802573ee2b11f674a8"}, + {file = "numpy-1.21.1-cp38-cp38-win32.whl", hash = "sha256:978010b68e17150db8765355d1ccdd450f9fc916824e8c4e35ee620590e234cd"}, + {file = "numpy-1.21.1-cp38-cp38-win_amd64.whl", hash = "sha256:9749a40a5b22333467f02fe11edc98f022133ee1bfa8ab99bda5e5437b831214"}, + {file = "numpy-1.21.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:d7a4aeac3b94af92a9373d6e77b37691b86411f9745190d2c351f410ab3a791f"}, + {file = "numpy-1.21.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d9e7912a56108aba9b31df688a4c4f5cb0d9d3787386b87d504762b6754fbb1b"}, + {file = "numpy-1.21.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:25b40b98ebdd272bc3020935427a4530b7d60dfbe1ab9381a39147834e985eac"}, + {file = "numpy-1.21.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:8a92c5aea763d14ba9d6475803fc7904bda7decc2a0a68153f587ad82941fec1"}, + {file = "numpy-1.21.1-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:05a0f648eb28bae4bcb204e6fd14603de2908de982e761a2fc78efe0f19e96e1"}, + {file = "numpy-1.21.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f01f28075a92eede918b965e86e8f0ba7b7797a95aa8d35e1cc8821f5fc3ad6a"}, + {file = "numpy-1.21.1-cp39-cp39-win32.whl", hash = "sha256:88c0b89ad1cc24a5efbb99ff9ab5db0f9a86e9cc50240177a571fbe9c2860ac2"}, + {file = "numpy-1.21.1-cp39-cp39-win_amd64.whl", hash = "sha256:01721eefe70544d548425a07c80be8377096a54118070b8a62476866d5208e33"}, + {file = "numpy-1.21.1-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2d4d1de6e6fb3d28781c73fbde702ac97f03d79e4ffd6598b880b2d95d62ead4"}, + {file = "numpy-1.21.1.zip", hash = "sha256:dff4af63638afcc57a3dfb9e4b26d434a7a602d225b42d746ea7fe2edf1342fd"}, +] +packaging = [ + {file = "packaging-21.0-py3-none-any.whl", hash = "sha256:c86254f9220d55e31cc94d69bade760f0847da8000def4dfe1c6b872fd14ff14"}, + {file = "packaging-21.0.tar.gz", hash = "sha256:7dc96269f53a4ccec5c0670940a4281106dd0bb343f47b7471f779df49c2fbe7"}, +] +pandas = [ + {file = "pandas-1.1.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:bf23a3b54d128b50f4f9d4675b3c1857a688cc6731a32f931837d72effb2698d"}, + {file = "pandas-1.1.5-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:5a780260afc88268a9d3ac3511d8f494fdcf637eece62fb9eb656a63d53eb7ca"}, + {file = "pandas-1.1.5-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:b61080750d19a0122469ab59b087380721d6b72a4e7d962e4d7e63e0c4504814"}, + {file = "pandas-1.1.5-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:0de3ddb414d30798cbf56e642d82cac30a80223ad6fe484d66c0ce01a84d6f2f"}, + {file = "pandas-1.1.5-cp36-cp36m-win32.whl", hash = "sha256:70865f96bb38fec46f7ebd66d4b5cfd0aa6b842073f298d621385ae3898d28b5"}, + {file = "pandas-1.1.5-cp36-cp36m-win_amd64.whl", hash = "sha256:19a2148a1d02791352e9fa637899a78e371a3516ac6da5c4edc718f60cbae648"}, + {file = "pandas-1.1.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:26fa92d3ac743a149a31b21d6f4337b0594b6302ea5575b37af9ca9611e8981a"}, + {file = "pandas-1.1.5-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:c16d59c15d946111d2716856dd5479221c9e4f2f5c7bc2d617f39d870031e086"}, + {file = "pandas-1.1.5-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:3be7a7a0ca71a2640e81d9276f526bca63505850add10206d0da2e8a0a325dae"}, + {file = "pandas-1.1.5-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:573fba5b05bf2c69271a32e52399c8de599e4a15ab7cec47d3b9c904125ab788"}, + {file = "pandas-1.1.5-cp37-cp37m-win32.whl", hash = "sha256:21b5a2b033380adbdd36b3116faaf9a4663e375325831dac1b519a44f9e439bb"}, + {file = "pandas-1.1.5-cp37-cp37m-win_amd64.whl", hash = "sha256:24c7f8d4aee71bfa6401faeba367dd654f696a77151a8a28bc2013f7ced4af98"}, + {file = "pandas-1.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2860a97cbb25444ffc0088b457da0a79dc79f9c601238a3e0644312fcc14bf11"}, + {file = "pandas-1.1.5-cp38-cp38-manylinux1_i686.whl", hash = "sha256:5008374ebb990dad9ed48b0f5d0038124c73748f5384cc8c46904dace27082d9"}, + {file = "pandas-1.1.5-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:2c2f7c670ea4e60318e4b7e474d56447cf0c7d83b3c2a5405a0dbb2600b9c48e"}, + {file = "pandas-1.1.5-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:0a643bae4283a37732ddfcecab3f62dd082996021b980f580903f4e8e01b3c5b"}, + {file = "pandas-1.1.5-cp38-cp38-win32.whl", hash = "sha256:5447ea7af4005b0daf695a316a423b96374c9c73ffbd4533209c5ddc369e644b"}, + {file = "pandas-1.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:4c62e94d5d49db116bef1bd5c2486723a292d79409fc9abd51adf9e05329101d"}, + {file = "pandas-1.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:731568be71fba1e13cae212c362f3d2ca8932e83cb1b85e3f1b4dd77d019254a"}, + {file = "pandas-1.1.5-cp39-cp39-manylinux1_i686.whl", hash = "sha256:c61c043aafb69329d0f961b19faa30b1dab709dd34c9388143fc55680059e55a"}, + {file = "pandas-1.1.5-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:2b1c6cd28a0dfda75c7b5957363333f01d370936e4c6276b7b8e696dd500582a"}, + {file = "pandas-1.1.5-cp39-cp39-win32.whl", hash = "sha256:c94ff2780a1fd89f190390130d6d36173ca59fcfb3fe0ff596f9a56518191ccb"}, + {file = "pandas-1.1.5-cp39-cp39-win_amd64.whl", hash = "sha256:edda9bacc3843dfbeebaf7a701763e68e741b08fccb889c003b0a52f0ee95782"}, + {file = "pandas-1.1.5.tar.gz", hash = "sha256:f10fc41ee3c75a474d3bdf68d396f10782d013d7f67db99c0efbfd0acb99701b"}, +] +pathspec = [ + {file = "pathspec-0.9.0-py2.py3-none-any.whl", hash = "sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a"}, + {file = "pathspec-0.9.0.tar.gz", hash = "sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1"}, +] +patsy = [ + {file = "patsy-0.5.1-py2.py3-none-any.whl", hash = "sha256:5465be1c0e670c3a965355ec09e9a502bf2c4cbe4875e8528b0221190a8a5d40"}, + {file = "patsy-0.5.1.tar.gz", hash = "sha256:f115cec4201e1465cd58b9866b0b0e7b941caafec129869057405bfe5b5e3991"}, +] +pluggy = [ + {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, + {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, +] +poetry-core = [ + {file = "poetry-core-1.0.3.tar.gz", hash = "sha256:2315c928249fc3207801a81868b64c66273077b26c8d8da465dccf8f488c90c5"}, + {file = "poetry_core-1.0.3-py2.py3-none-any.whl", hash = "sha256:c6bde46251112de8384013e1ab8d66e7323d2c75172f80220aba2bc07e208e9a"}, +] +poetry2setup = [ + {file = "poetry2setup-1.0.0-py2.py3-none-any.whl", hash = "sha256:b69a4efabfda24870fd8d08b37b15fbbee5eec0b62711feaef610f94835517be"}, + {file = "poetry2setup-1.0.0.tar.gz", hash = "sha256:ef1177303996b661eeec3de5027a1af84e106a1d2cb92c73152fe6ce47700cc3"}, +] +py = [ + {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"}, + {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"}, +] +pyparsing = [ + {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"}, + {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, +] +pytest = [ + {file = "pytest-6.2.4-py3-none-any.whl", hash = "sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890"}, + {file = "pytest-6.2.4.tar.gz", hash = "sha256:50bcad0a0b9c5a72c8e4e7c9855a3ad496ca6a881a3641b4260605450772c54b"}, +] +python-dateutil = [ + {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, + {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, +] +pytz = [ + {file = "pytz-2021.1-py2.py3-none-any.whl", hash = "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798"}, + {file = "pytz-2021.1.tar.gz", hash = "sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da"}, +] +regex = [ + {file = "regex-2021.7.6-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:e6a1e5ca97d411a461041d057348e578dc344ecd2add3555aedba3b408c9f874"}, + {file = "regex-2021.7.6-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:6afe6a627888c9a6cfbb603d1d017ce204cebd589d66e0703309b8048c3b0854"}, + {file = "regex-2021.7.6-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:ccb3d2190476d00414aab36cca453e4596e8f70a206e2aa8db3d495a109153d2"}, + {file = "regex-2021.7.6-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:ed693137a9187052fc46eedfafdcb74e09917166362af4cc4fddc3b31560e93d"}, + {file = "regex-2021.7.6-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:99d8ab206a5270c1002bfcf25c51bf329ca951e5a169f3b43214fdda1f0b5f0d"}, + {file = "regex-2021.7.6-cp36-cp36m-manylinux2014_i686.whl", hash = "sha256:b85ac458354165405c8a84725de7bbd07b00d9f72c31a60ffbf96bb38d3e25fa"}, + {file = "regex-2021.7.6-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:3f5716923d3d0bfb27048242a6e0f14eecdb2e2a7fac47eda1d055288595f222"}, + {file = "regex-2021.7.6-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5983c19d0beb6af88cb4d47afb92d96751fb3fa1784d8785b1cdf14c6519407"}, + {file = "regex-2021.7.6-cp36-cp36m-win32.whl", hash = "sha256:c92831dac113a6e0ab28bc98f33781383fe294df1a2c3dfd1e850114da35fd5b"}, + {file = "regex-2021.7.6-cp36-cp36m-win_amd64.whl", hash = "sha256:791aa1b300e5b6e5d597c37c346fb4d66422178566bbb426dd87eaae475053fb"}, + {file = "regex-2021.7.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:59506c6e8bd9306cd8a41511e32d16d5d1194110b8cfe5a11d102d8b63cf945d"}, + {file = "regex-2021.7.6-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:564a4c8a29435d1f2256ba247a0315325ea63335508ad8ed938a4f14c4116a5d"}, + {file = "regex-2021.7.6-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:59c00bb8dd8775473cbfb967925ad2c3ecc8886b3b2d0c90a8e2707e06c743f0"}, + {file = "regex-2021.7.6-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:9a854b916806c7e3b40e6616ac9e85d3cdb7649d9e6590653deb5b341a736cec"}, + {file = "regex-2021.7.6-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:db2b7df831c3187a37f3bb80ec095f249fa276dbe09abd3d35297fc250385694"}, + {file = "regex-2021.7.6-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:173bc44ff95bc1e96398c38f3629d86fa72e539c79900283afa895694229fe6a"}, + {file = "regex-2021.7.6-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:15dddb19823f5147e7517bb12635b3c82e6f2a3a6b696cc3e321522e8b9308ad"}, + {file = "regex-2021.7.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ddeabc7652024803666ea09f32dd1ed40a0579b6fbb2a213eba590683025895"}, + {file = "regex-2021.7.6-cp37-cp37m-win32.whl", hash = "sha256:f080248b3e029d052bf74a897b9d74cfb7643537fbde97fe8225a6467fb559b5"}, + {file = "regex-2021.7.6-cp37-cp37m-win_amd64.whl", hash = "sha256:d8bbce0c96462dbceaa7ac4a7dfbbee92745b801b24bce10a98d2f2b1ea9432f"}, + {file = "regex-2021.7.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:edd1a68f79b89b0c57339bce297ad5d5ffcc6ae7e1afdb10f1947706ed066c9c"}, + {file = "regex-2021.7.6-cp38-cp38-manylinux1_i686.whl", hash = "sha256:422dec1e7cbb2efbbe50e3f1de36b82906def93ed48da12d1714cabcd993d7f0"}, + {file = "regex-2021.7.6-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:cbe23b323988a04c3e5b0c387fe3f8f363bf06c0680daf775875d979e376bd26"}, + {file = "regex-2021.7.6-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:0eb2c6e0fcec5e0f1d3bcc1133556563222a2ffd2211945d7b1480c1b1a42a6f"}, + {file = "regex-2021.7.6-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:1c78780bf46d620ff4fff40728f98b8afd8b8e35c3efd638c7df67be2d5cddbf"}, + {file = "regex-2021.7.6-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:bc84fb254a875a9f66616ed4538542fb7965db6356f3df571d783f7c8d256edd"}, + {file = "regex-2021.7.6-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:598c0a79b4b851b922f504f9f39a863d83ebdfff787261a5ed061c21e67dd761"}, + {file = "regex-2021.7.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:875c355360d0f8d3d827e462b29ea7682bf52327d500a4f837e934e9e4656068"}, + {file = "regex-2021.7.6-cp38-cp38-win32.whl", hash = "sha256:e586f448df2bbc37dfadccdb7ccd125c62b4348cb90c10840d695592aa1b29e0"}, + {file = "regex-2021.7.6-cp38-cp38-win_amd64.whl", hash = "sha256:2fe5e71e11a54e3355fa272137d521a40aace5d937d08b494bed4529964c19c4"}, + {file = "regex-2021.7.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6110bab7eab6566492618540c70edd4d2a18f40ca1d51d704f1d81c52d245026"}, + {file = "regex-2021.7.6-cp39-cp39-manylinux1_i686.whl", hash = "sha256:4f64fc59fd5b10557f6cd0937e1597af022ad9b27d454e182485f1db3008f417"}, + {file = "regex-2021.7.6-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:89e5528803566af4df368df2d6f503c84fbfb8249e6631c7b025fe23e6bd0cde"}, + {file = "regex-2021.7.6-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:2366fe0479ca0e9afa534174faa2beae87847d208d457d200183f28c74eaea59"}, + {file = "regex-2021.7.6-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:f9392a4555f3e4cb45310a65b403d86b589adc773898c25a39184b1ba4db8985"}, + {file = "regex-2021.7.6-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:2bceeb491b38225b1fee4517107b8491ba54fba77cf22a12e996d96a3c55613d"}, + {file = "regex-2021.7.6-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:f98dc35ab9a749276f1a4a38ab3e0e2ba1662ce710f6530f5b0a6656f1c32b58"}, + {file = "regex-2021.7.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:319eb2a8d0888fa6f1d9177705f341bc9455a2c8aca130016e52c7fe8d6c37a3"}, + {file = "regex-2021.7.6-cp39-cp39-win32.whl", hash = "sha256:eaf58b9e30e0e546cdc3ac06cf9165a1ca5b3de8221e9df679416ca667972035"}, + {file = "regex-2021.7.6-cp39-cp39-win_amd64.whl", hash = "sha256:4c9c3155fe74269f61e27617529b7f09552fbb12e44b1189cebbdb24294e6e1c"}, + {file = "regex-2021.7.6.tar.gz", hash = "sha256:8394e266005f2d8c6f0bc6780001f7afa3ef81a7a2111fa35058ded6fce79e4d"}, +] +scikit-learn = [ + {file = "scikit-learn-0.23.1.tar.gz", hash = "sha256:e3fec1c8831f8f93ad85581ca29ca1bb88e2da377fb097cf8322aa89c21bc9b8"}, + {file = "scikit_learn-0.23.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:058d213092de4384710137af1300ed0ff030b8c40459a6c6f73c31ccd274cc39"}, + {file = "scikit_learn-0.23.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:ebe853e6f318f9d8b3b74dd17e553720d35646eff675a69eeaed12fbbbb07daa"}, + {file = "scikit_learn-0.23.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:e9879ba9e64ec3add41bf201e06034162f853652ef4849b361d73b0deb3153ad"}, + {file = "scikit_learn-0.23.1-cp36-cp36m-win32.whl", hash = "sha256:c2fa33d20408b513cf432505c80e6eb4bf4d71434f1ae36680765d4a2c2a16ec"}, + {file = "scikit_learn-0.23.1-cp36-cp36m-win_amd64.whl", hash = "sha256:e585682e37f2faa81ad6cd4472fff646bf2fd0542147bec93697a905db8e6bd2"}, + {file = "scikit_learn-0.23.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:244ca85d6eba17a1e6e8a66ab2f584be6a7784b5f59297e3d7ff8c7983af627c"}, + {file = "scikit_learn-0.23.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:9e04c0811ea92931ee8490d638171b8cb2f21387efcfff526bbc8c2a3da60f1c"}, + {file = "scikit_learn-0.23.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:0e7b55f73b35537ecd0d19df29dd39aa9e076dba78f3507b8136c819d84611fd"}, + {file = "scikit_learn-0.23.1-cp37-cp37m-win32.whl", hash = "sha256:bded94236e16774385202cafd26190ce96db18e4dc21e99473848c61e4fdc400"}, + {file = "scikit_learn-0.23.1-cp37-cp37m-win_amd64.whl", hash = "sha256:04799686060ecbf8992f26a35be1d99e981894c8c7860c1365cda4200f954a16"}, + {file = "scikit_learn-0.23.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0c3464e46ef8bd4f1bfa5c009648c6449412c8f7e9b3fc0c9e3d800139c48827"}, + {file = "scikit_learn-0.23.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:93f56abd316d131645559ec0ab4f45e3391c2ccdd4eadaa4912f4c1e0a6f2c96"}, + {file = "scikit_learn-0.23.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:3e6e92b495eee193a8fa12a230c9b7976ea0fc1263719338e35c986ea1e42cff"}, + {file = "scikit_learn-0.23.1-cp38-cp38-win32.whl", hash = "sha256:5bcea4d6ee431c814261117281363208408aa4e665633655895feb059021aca6"}, + {file = "scikit_learn-0.23.1-cp38-cp38-win_amd64.whl", hash = "sha256:16feae4361be6b299d4d08df5a30956b4bfc8eadf173fe9258f6d59630f851d4"}, +] +scipy = [ + {file = "scipy-1.6.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a15a1f3fc0abff33e792d6049161b7795909b40b97c6cc2934ed54384017ab76"}, + {file = "scipy-1.6.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:e79570979ccdc3d165456dd62041d9556fb9733b86b4b6d818af7a0afc15f092"}, + {file = "scipy-1.6.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:a423533c55fec61456dedee7b6ee7dce0bb6bfa395424ea374d25afa262be261"}, + {file = "scipy-1.6.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:33d6b7df40d197bdd3049d64e8e680227151673465e5d85723b3b8f6b15a6ced"}, + {file = "scipy-1.6.1-cp37-cp37m-win32.whl", hash = "sha256:6725e3fbb47da428794f243864f2297462e9ee448297c93ed1dcbc44335feb78"}, + {file = "scipy-1.6.1-cp37-cp37m-win_amd64.whl", hash = "sha256:5fa9c6530b1661f1370bcd332a1e62ca7881785cc0f80c0d559b636567fab63c"}, + {file = "scipy-1.6.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bd50daf727f7c195e26f27467c85ce653d41df4358a25b32434a50d8870fc519"}, + {file = "scipy-1.6.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:f46dd15335e8a320b0fb4685f58b7471702234cba8bb3442b69a3e1dc329c345"}, + {file = "scipy-1.6.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:0e5b0ccf63155d90da576edd2768b66fb276446c371b73841e3503be1d63fb5d"}, + {file = "scipy-1.6.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:2481efbb3740977e3c831edfd0bd9867be26387cacf24eb5e366a6a374d3d00d"}, + {file = "scipy-1.6.1-cp38-cp38-win32.whl", hash = "sha256:68cb4c424112cd4be886b4d979c5497fba190714085f46b8ae67a5e4416c32b4"}, + {file = "scipy-1.6.1-cp38-cp38-win_amd64.whl", hash = "sha256:5f331eeed0297232d2e6eea51b54e8278ed8bb10b099f69c44e2558c090d06bf"}, + {file = "scipy-1.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0c8a51d33556bf70367452d4d601d1742c0e806cd0194785914daf19775f0e67"}, + {file = "scipy-1.6.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:83bf7c16245c15bc58ee76c5418e46ea1811edcc2e2b03041b804e46084ab627"}, + {file = "scipy-1.6.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:794e768cc5f779736593046c9714e0f3a5940bc6dcc1dba885ad64cbfb28e9f0"}, + {file = "scipy-1.6.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:5da5471aed911fe7e52b86bf9ea32fb55ae93e2f0fac66c32e58897cfb02fa07"}, + {file = "scipy-1.6.1-cp39-cp39-win32.whl", hash = "sha256:8e403a337749ed40af60e537cc4d4c03febddcc56cd26e774c9b1b600a70d3e4"}, + {file = "scipy-1.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:a5193a098ae9f29af283dcf0041f762601faf2e595c0db1da929875b7570353f"}, + {file = "scipy-1.6.1.tar.gz", hash = "sha256:c4fceb864890b6168e79b0e714c585dbe2fd4222768ee90bc1aa0f8218691b11"}, +] +six = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] +threadpoolctl = [ + {file = "threadpoolctl-2.2.0-py3-none-any.whl", hash = "sha256:e5a995e3ffae202758fa8a90082e35783b9370699627ae2733cd1c3a73553616"}, + {file = "threadpoolctl-2.2.0.tar.gz", hash = "sha256:86d4b6801456d780e94681d155779058759eaef3c3564758b17b6c99db5f81cb"}, +] +toml = [ + {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, + {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, +] +tomli = [ + {file = "tomli-1.1.0-py3-none-any.whl", hash = "sha256:f4a182048010e89cbec0ae4686b21f550a7f2903f665e34a6de58ec15424f919"}, + {file = "tomli-1.1.0.tar.gz", hash = "sha256:33d7984738f8bb699c9b0a816eb646a8178a69eaa792d258486776a5d21b8ca5"}, +] +typed-ast = [ + {file = "typed_ast-1.4.3-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6"}, + {file = "typed_ast-1.4.3-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075"}, + {file = "typed_ast-1.4.3-cp35-cp35m-manylinux2014_aarch64.whl", hash = "sha256:1b3ead4a96c9101bef08f9f7d1217c096f31667617b58de957f690c92378b528"}, + {file = "typed_ast-1.4.3-cp35-cp35m-win32.whl", hash = "sha256:dde816ca9dac1d9c01dd504ea5967821606f02e510438120091b84e852367428"}, + {file = "typed_ast-1.4.3-cp35-cp35m-win_amd64.whl", hash = "sha256:777a26c84bea6cd934422ac2e3b78863a37017618b6e5c08f92ef69853e765d3"}, + {file = "typed_ast-1.4.3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:f8afcf15cc511ada719a88e013cec87c11aff7b91f019295eb4530f96fe5ef2f"}, + {file = "typed_ast-1.4.3-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:52b1eb8c83f178ab787f3a4283f68258525f8d70f778a2f6dd54d3b5e5fb4341"}, + {file = "typed_ast-1.4.3-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:01ae5f73431d21eead5015997ab41afa53aa1fbe252f9da060be5dad2c730ace"}, + {file = "typed_ast-1.4.3-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:c190f0899e9f9f8b6b7863debfb739abcb21a5c054f911ca3596d12b8a4c4c7f"}, + {file = "typed_ast-1.4.3-cp36-cp36m-win32.whl", hash = "sha256:398e44cd480f4d2b7ee8d98385ca104e35c81525dd98c519acff1b79bdaac363"}, + {file = "typed_ast-1.4.3-cp36-cp36m-win_amd64.whl", hash = "sha256:bff6ad71c81b3bba8fa35f0f1921fb24ff4476235a6e94a26ada2e54370e6da7"}, + {file = "typed_ast-1.4.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0fb71b8c643187d7492c1f8352f2c15b4c4af3f6338f21681d3681b3dc31a266"}, + {file = "typed_ast-1.4.3-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:760ad187b1041a154f0e4d0f6aae3e40fdb51d6de16e5c99aedadd9246450e9e"}, + {file = "typed_ast-1.4.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:5feca99c17af94057417d744607b82dd0a664fd5e4ca98061480fd8b14b18d04"}, + {file = "typed_ast-1.4.3-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:95431a26309a21874005845c21118c83991c63ea800dd44843e42a916aec5899"}, + {file = "typed_ast-1.4.3-cp37-cp37m-win32.whl", hash = "sha256:aee0c1256be6c07bd3e1263ff920c325b59849dc95392a05f258bb9b259cf39c"}, + {file = "typed_ast-1.4.3-cp37-cp37m-win_amd64.whl", hash = "sha256:9ad2c92ec681e02baf81fdfa056fe0d818645efa9af1f1cd5fd6f1bd2bdfd805"}, + {file = "typed_ast-1.4.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b36b4f3920103a25e1d5d024d155c504080959582b928e91cb608a65c3a49e1a"}, + {file = "typed_ast-1.4.3-cp38-cp38-manylinux1_i686.whl", hash = "sha256:067a74454df670dcaa4e59349a2e5c81e567d8d65458d480a5b3dfecec08c5ff"}, + {file = "typed_ast-1.4.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:7538e495704e2ccda9b234b82423a4038f324f3a10c43bc088a1636180f11a41"}, + {file = "typed_ast-1.4.3-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:af3d4a73793725138d6b334d9d247ce7e5f084d96284ed23f22ee626a7b88e39"}, + {file = "typed_ast-1.4.3-cp38-cp38-win32.whl", hash = "sha256:f2362f3cb0f3172c42938946dbc5b7843c2a28aec307c49100c8b38764eb6927"}, + {file = "typed_ast-1.4.3-cp38-cp38-win_amd64.whl", hash = "sha256:dd4a21253f42b8d2b48410cb31fe501d32f8b9fbeb1f55063ad102fe9c425e40"}, + {file = "typed_ast-1.4.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f328adcfebed9f11301eaedfa48e15bdece9b519fb27e6a8c01aa52a17ec31b3"}, + {file = "typed_ast-1.4.3-cp39-cp39-manylinux1_i686.whl", hash = "sha256:2c726c276d09fc5c414693a2de063f521052d9ea7c240ce553316f70656c84d4"}, + {file = "typed_ast-1.4.3-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:cae53c389825d3b46fb37538441f75d6aecc4174f615d048321b716df2757fb0"}, + {file = "typed_ast-1.4.3-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:b9574c6f03f685070d859e75c7f9eeca02d6933273b5e69572e5ff9d5e3931c3"}, + {file = "typed_ast-1.4.3-cp39-cp39-win32.whl", hash = "sha256:209596a4ec71d990d71d5e0d312ac935d86930e6eecff6ccc7007fe54d703808"}, + {file = "typed_ast-1.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:9c6d1a54552b5330bc657b7ef0eae25d00ba7ffe85d9ea8ae6540d2197a3788c"}, + {file = "typed_ast-1.4.3.tar.gz", hash = "sha256:fb1bbeac803adea29cedd70781399c99138358c26d05fcbd23c13016b7f5ec65"}, +] +typing-extensions = [ + {file = "typing_extensions-3.10.0.0-py2-none-any.whl", hash = "sha256:0ac0f89795dd19de6b97debb0c6af1c70987fd80a2d62d1958f7e56fcc31b497"}, + {file = "typing_extensions-3.10.0.0-py3-none-any.whl", hash = "sha256:779383f6086d90c99ae41cf0ff39aac8a7937a9283ce0a414e5dd782f4c94a84"}, + {file = "typing_extensions-3.10.0.0.tar.gz", hash = "sha256:50b6f157849174217d0656f99dc82fe932884fb250826c18350e159ec6cdf342"}, +] +zipp = [ + {file = "zipp-3.5.0-py3-none-any.whl", hash = "sha256:957cfda87797e389580cb8b9e3870841ca991e2125350677b2ca83a0e99390a3"}, + {file = "zipp-3.5.0.tar.gz", hash = "sha256:f5812b1e007e48cff63449a5e9f4e7ebea716b4111f9c4f9a645f91d579bf0c4"}, +] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..ac2410f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,29 @@ +[tool.poetry] +name = "pycombat" +version = "0.3.1" +description = "pyComBat is a Python 3 implementation of ComBat, one of the most widely used tool for correcting technical biases, called batch effects, in microarray expression data." +authors = ["Abdelkader Behdenna ", "Guillaume Appé ", "Aryo Gema "] +license = "GPL-3.0-or-later" +readme = "README.md" +homepage = "https://github.com/epigenelabs/pyComBat" +repository = "https://github.com/epigenelabs/pyComBat" +include = [ + "LICENSE", +] + +[tool.poetry.dependencies] +python = "^3.7" +numpy = "^1.18.5" +mpmath = "^1.1.0" +pandas = "^1.1.5" +patsy = "^0.5.1" +scikit-learn = "0.23.1" +typing-extensions = "^3.10.0" + +[tool.poetry.dev-dependencies] +pytest = "^6.2.4" +black = "^21.5b1" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/setup.py b/setup.py deleted file mode 100644 index 773f09a..0000000 --- a/setup.py +++ /dev/null @@ -1,28 +0,0 @@ -import setuptools - -with open("README.md", "r") as fh: - long_description = fh.read() - -setuptools.setup( - install_requires=[ - "numpy==1.18.5", - "mpmath==1.1.0", - "pandas==1.1.5", - "patsy==0.5.1" - ], - name="combat", - version="0.3.0", - author="Abdelkader Behdenna", - author_email="abdelkader@epigenelabs.com", - description="pyComBat, a Python tool for batch effects correction in high-throughput molecular data using empirical Bayes methods", - long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/epigenelabs/pyComBat", - packages=setuptools.find_packages(), - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", - "Operating System :: OS Independent", - ], - python_requires='>=3.6', -) \ No newline at end of file From 02e8c34a445ded9407a057da724af776168c1adc Mon Sep 17 00:00:00 2001 From: aryo Date: Mon, 26 Jul 2021 21:12:14 +0200 Subject: [PATCH 02/12] Combat update v0.3.1 --- combat/__init__.py | 1 + combat/pycombat.py | 762 ++++++----------------------------- combat/test_unit.py | 211 ---------- combat/utils/__init__.py | 2 + combat/utils/combat_utils.py | 547 +++++++++++++++++++++++++ combat/utils/common_utils.py | 41 ++ 6 files changed, 718 insertions(+), 846 deletions(-) delete mode 100644 combat/test_unit.py create mode 100644 combat/utils/__init__.py create mode 100644 combat/utils/combat_utils.py create mode 100644 combat/utils/common_utils.py diff --git a/combat/__init__.py b/combat/__init__.py index e69de29..d1d2ad9 100644 --- a/combat/__init__.py +++ b/combat/__init__.py @@ -0,0 +1 @@ +from .pycombat import PyCombat \ No newline at end of file diff --git a/combat/pycombat.py b/combat/pycombat.py index cfc4801..cc12bd6 100644 --- a/combat/pycombat.py +++ b/combat/pycombat.py @@ -18,656 +18,148 @@ # file pycombat.py # author A. Behdenna, J. Haziza, A. Gema, A. Nordor -# date Sept 2020 +# date Sept 2020 #----------------------------------------------------------------------------- +from __future__ import annotations import numpy as np -from math import exp -from multiprocessing import Pool, cpu_count -from functools import partial -import mpmath as mp import pandas as pd -#import unittest - - -def model_matrix(info, intercept=True, drop_first=True): - """Creates the model_matrix from batch list - - Arguments: - info {list} -- list info with batch or covariates data - intercept {bool} -- boolean for intercept in model matrix - - Returns: - matrix -- model matrix generate from batch list - """ - if not isinstance(info[0],list) : - info = [info] - else: - info = info - info_dict = {} - for i in range(len(info)): - info_dict[f"col{str(i)}"] = list(map(str,info[i])) - df = pd.get_dummies(pd.DataFrame(info_dict), drop_first=drop_first, dtype=float) - if intercept: - df["intercept"] = 1.0 - return df.to_numpy() - - -def all_1(list_of_elements): - """checks if all elements in a list are 1s - - Arguments: - list_of_elements {list} -- list of elements - - Returns: - bool -- True iff all elements of the list are 1s - """ - return((list_of_elements == 1).all()) - - -# aprior and bprior are useful to compute "hyper-prior values" -# -> prior parameters used to estimate the prior gamma distribution for multiplicative batch effect -# aprior - calculates empirical hyper-prior values - -def compute_prior(prior, gamma_hat, mean_only): - """[summary] - - Arguments: - prior {char} -- 'a' or 'b' depending of the prior to be calculated - gamma_hat {matrix} -- matrix of additive batch effect - mean_only {bool} -- True iff mean_only selected - - Returns: - float -- [the prior calculated (aprior or bprior) - """ - if mean_only: - return 1 - m = np.mean(gamma_hat) - s2 = np.var(gamma_hat) - if prior == 'a': - return (2*s2+m*m)/s2 - elif prior == 'b': - return (m*s2+m*m*m)/s2 - - -def postmean(g_bar, d_star, t2_n, t2_n_g_hat): - """estimates additive batch effect - - Arguments: - g_bar {matrix} -- additive batch effect - d_star {matrix} -- multiplicative batch effect - t2_n {matrix} -- - t2_n_g_hat {matrix} -- - - Returns: - matrix -- estimated additive batch effect - """ - return np.divide(t2_n_g_hat+d_star*g_bar, np.asarray(t2_n+d_star)) - - -def postvar(sum2, n, a, b): - """estimates multiplicative batch effect - - Arguments: - sum2 {vector} -- - n {[type]} -- - a {float} -- aprior - b {float} -- bprior - - Returns: - matrix -- estimated multiplicative batch effect - """ - return(np.divide((np.multiply(0.5, sum2)+b), (np.multiply(0.5, n)+a-1))) - - -def it_sol(sdat, g_hat, d_hat, g_bar, t2, a, b, conv=0.0001, exit_iteration=10e5): - """iterative solution for Empirical Bayesian method - - Arguments: - sdat {matrix} -- - g_hat {matrix} -- average additive batch effect - d_hat {matrix} -- average multiplicative batch effect - g_bar {matrix} -- additive batch effect - t2 {matrix} -- - a {float} -- aprior - b {float} -- bprior - - Keyword Arguments: - conv {float} -- convergence criterion (default: {0.0001}) - exit_iteration {float} -- maximum number of iterations before exit (default: {10e5}) - - Returns: - array list -- estimated additive and multiplicative batch effect - """ - - n = [len(i) for i in np.asarray(sdat)] - t2_n = np.multiply(t2, n) - t2_n_g_hat = np.multiply(t2_n, g_hat) - g_old = np.ndarray.copy(g_hat) - d_old = np.ndarray.copy(d_hat) - change = 1 - count = 0 # number of steps needed (for diagnostic only) - # convergence criteria, if new-old < conv, then stop - while (change > conv) and (count < exit_iteration): - g_new = postmean(g_bar, d_old, t2_n, t2_n_g_hat) # updated additive batch effect - sum2 = np.sum(np.asarray(np.square( - sdat-np.outer(g_new[0][0], np.ones(np.ma.size(sdat, axis=1))))), axis=1) - d_new = postvar(sum2, n, a, b) # updated multiplicative batch effect - change = max(np.amax(np.absolute(g_new-np.asarray(g_old))/np.asarray(g_old)), np.amax( - np.absolute(d_new-d_old)/d_old)) # maximum difference between new and old estimate - g_old = np.ndarray.copy(g_new) # save value for g - d_old = np.ndarray.copy(d_new) # save value for d - count += 1 - adjust = np.asarray([g_new, d_new]) - return(adjust) # remove parenthesis in returns - -# int_eprior - Monte Carlo integration function to find nonparametric adjustments -# Johnson et al (Biostatistics 2007, supp.mat.) show that we can estimate the multiplicative and additive batch effects with an integral -# This integral is numerically computed through Monte Carlo inegration (iterative method) - - -def int_eprior(sdat, g_hat, d_hat, precision): - """ int_eprior - Monte Carlo integration function to find nonparametric adjustments - Johnson et al (Biostatistics 2007, supp.mat.) show that we can estimate the multiplicative and additive batch effects with an integral - This integral is numerically computed through Monte Carlo inegration (iterative method) - - Arguments: - sdat {matrix} -- data matrix - g_hat {matrix} -- average additive batch effect - d_hat {matrix} -- average multiplicative batch effect - precision {float} -- level of precision for precision computing - - Returns: - array list -- estimated additive and multiplicative batch effect - """ - g_star = [] - d_star = [] - # use this variable to only print error message once if approximation used - test_approximation = 0 - for i in range(len(sdat)): - # additive batch effect - g = np.asarray(np.delete(np.transpose(g_hat), i)) - # multiplicative batch effect - d = np.asarray(np.delete(np.transpose(d_hat), i)) - x = np.asarray(np.transpose(sdat[i])) - n = len(x) - j = [1]*n - dat = np.repeat(x, len(np.transpose(g)), axis=1) - resid2 = np.square(dat-g) - sum2 = np.dot(np.transpose(resid2), j) - # /begin{handling high precision computing} - temp_2d = 2*d - if (precision == None): - LH = np.power(1/(np.pi*temp_2d), n/2)*np.exp(np.negative(sum2)/(temp_2d)) - - else: # only if precision parameter informed - # increase the precision of the computing (if negative exponential too close to 0) - mp.dps = precision - buf_exp = list(map(mp.exp, np.negative(sum2)/(temp_2d))) - buf_pow = list(map(partial(mp.power, y=n/2), 1/(np.pi*temp_2d))) - LH = buf_pow*buf_exp # likelihood - # /end{handling high precision computing} - LH = np.nan_to_num(LH) # corrects NaNs in likelihood - if np.sum(LH) == 0 and test_approximation == 0: # correction for LH full of 0.0 - test_approximation = 1 # this message won't appear again - print("###\nValues too small, approximation applied to avoid division by 0.\nPrecision mode can correct this problem, but increases computation time.\n###") - LH[LH == 0] = np.exp(-745) - g_star.append(np.sum(g*LH)/np.sum(LH)) - d_star.append(np.sum(d*LH)/np.sum(LH)) - else: - g_star.append(np.sum(g*LH)/np.sum(LH)) - d_star.append(np.sum(d*LH)/np.sum(LH)) - adjust = np.asarray([np.asarray(g_star), np.asarray(d_star)]) - return(adjust) - - -def param_fun(i, s_data, batches, mean_only, gamma_hat, gamma_bar, delta_hat, t2, a_prior, b_prior): - """parametric estimation of batch effects - - Arguments: - i {int} -- column index - s_data {matrix} -- - batches {list list} -- list of list of batches' elements - mean_only {bool} -- True iff mean_only selected - gamma_hat {matrix} -- average additive batch effect - gamma_bar {matrix} -- estimated additive batch effect - delta_hat {matrix} -- average multiplicative batch effect - t2 {matrix} -- - a_prior {float} -- aprior - b_prior {float} -- bprior - - Returns: - array list -- estimated adjusted additive and multiplicative batch effect - """ - if mean_only: # if mean_only, no need for complex method: batch effect is immediately calculated - t2_n = np.multiply(t2[i], 1) - t2_n_g_hat = np.multiply(t2_n, gamma_hat[i]) - gamma_star = postmean(gamma_bar[i], 1, t2_n, t2_n_g_hat) # additive batch effect - delta_star = [1]*len(s_data) # multiplicative batch effect - else: # if not(mean_only) then use it_solve - temp = it_sol(np.transpose(np.transpose(s_data)[ - batches[i]]), gamma_hat[i], delta_hat[i], gamma_bar[i], t2[i], a_prior[i], b_prior[i]) - gamma_star = temp[0] # additive batch effect - delta_star = temp[1] # multiplicative batch effect - return [gamma_star, delta_star] - -def nonparam_fun(i, mean_only, delta_hat, s_data, batches, gamma_hat, precision): - """non-parametric estimation - - Arguments: - i {int} -- column index - mean_only {bool} -- True iff mean_only selected - delta_hat {matrix} -- estimated multiplicative batch effect - s_data {matrix} -- - batches {list list} -- list of list of batches' elements - gamma_hat {matrix} -- estimated additive batch effect - precision {float} -- level of precision for precision computing - - Returns: - array list -- estimated adjusted additive and multiplicative batch effect - """ - if mean_only: # if mean only, change delta_hat to vector of 1s - delta_hat[i] = [1]*len(delta_hat[i]) - # use int_eprior for non-parametric estimation - temp = int_eprior(np.transpose(np.transpose(s_data)[ - batches[i]]), gamma_hat[i], delta_hat[i], precision) - return [temp[0], temp[1]] - -############ -# pyComBat # -############ - - -def check_mean_only(mean_only): - """checks mean_only option - - Arguments: - mean_only {boolean} -- user's choice about mean_only - - Returns: - () - """ - if mean_only == True: - print("Using mean only version") - - -def define_batchmod(batch): - """generates model matrix - - Arguments: - batch {list} -- list of batch id - - Returns: - batchmod {matrix} -- model matrix for batches - """ - batchmod = model_matrix(list(batch), intercept=False, drop_first=False) - return(batchmod) - - -def check_ref_batch(ref_batch, batch, batchmod): - """check ref_batch option and treat it if needed - - Arguments: - ref_batch {int} -- the reference batch - batch {list} -- list of batch id - batchmod {matrix} -- model matrix related to batches - - Returns: - ref {int list} -- the corresponding positions of the reference batch in the batch list - batchmod {matrix} -- updated model matrix related to batches, with reference - """ - if ref_batch is not None: - if ref_batch not in batch: - print("Reference level ref.batch must be one of the levels of batch.") - exit(0) - print("Using batch "+str(ref_batch) + - " as a reference batch.") - # ref keeps in memory the columns concerned by the reference batch - ref = np.where(np.unique(batch) == ref_batch)[0][0] - # updates batchmod with reference - batchmod[:,ref] = 1 - else: - ref = None # default settings - return(ref, batchmod) - - -def treat_batches(batch): - """treat batches - - Arguments: - batch {list} -- batch list - - Returns: - n_batch {int} -- number of batches - batches {int list} -- list of unique batches - n_batches {int list} -- list of batches lengths - n_array {int} -- total size of dataset - """ - n_batch = len(np.unique(batch)) # number of batches - print("Found "+str(n_batch)+" batches.") - batches = [] # list of lists, contains the list of position for each batch - for i in range(n_batch): - batches.append(np.where(batch == np.unique(batch)[i])[0]) - batches = np.asarray(batches) - n_batches = list(map(len, batches)) - if 1 in n_batches: - #mean_only = True # no variance if only one sample in a batch - mean_only has to be used - print("\nOne batch has only one sample, try setting mean_only=True.\n") - n_array = sum(n_batches) - return(n_batch, batches, n_batches, n_array) - - -def treat_covariates(batchmod, mod, ref, n_batch): - """treat covariates - - Arguments: - batchmod {matrix} -- model matrix for batch - mod {matrix} -- model matrix for other covariates - ref {int} -- reference batch - n_batch {int} -- number of batches - - Returns: - check {bool list} -- a list characterising all covariates - design {matrix} -- model matrix for all covariates, including batch - """ - # design matrix for sample conditions - if mod == []: - design = batchmod - else: - mod_matrix = model_matrix(mod, intercept=True) - design = np.concatenate((batchmod, mod_matrix), axis=1) - check = list(map(all_1, np.transpose(design))) - if ref is not None: # if ref - check[ref] = False # the reference in not considered as a covariate - design = design[:, ~np.array(check)] - design = np.transpose(design) - - print("Adjusting for "+str(len(design)-len(np.transpose(batchmod))) + - " covariate(s) or covariate level(s).") - - # if matrix cannot be invertible, different cases - if np.linalg.matrix_rank(design) < len(design): - if len(design) == n_batch + 1: # case 1: covariate confunded with a batch - print( - "The covariate is confunded with batch. Remove the covariate and rerun pyComBat.") - exit(0) - if len(design) > n_batch + 1: # case 2: multiple covariates confunded with a batch - if np.linalg.matrix_rank(np.transpose(design)[:n_batch]) < len(design): - print( - "The covariates are confounded! Please remove one or more of the covariates so the design is not confounded.") - exit(0) - else: # case 3: at least a covariate confunded with a batch - print( - "At least one covariate is confounded with batch. Please remove confounded covariates and rerun pyComBat") - exit(0) - return(design) - - -def check_NAs(dat): - """check if NaNs - in theory, we construct the data without NAs - - Arguments: - dat {matrix} -- the data matrix - - Returns: - NAs {bool} -- boolean characterising the presence of NaNs in the data matrix - """ - # NAs = True in (np.isnan(dat)) - NAs = np.isnan(np.sum(dat)) # Check if NaN exists - if NAs: - print("Found missing data values. Please remove all missing values before proceeding with pyComBat.") - return(NAs) - - -def calculate_mean_var(design, batches, ref, dat, NAs, ref_batch, n_batches, n_batch, n_array): - """ calculates the Normalisation factors - - Arguments: - design {matrix} -- model matrix for all covariates - batches {int list} -- list of unique batches - dat {matrix} -- data matrix - NAs {bool} -- presence of NaNs in the data matrix - ref_batch {int} -- reference batch - n_batches {int list} -- list of batches lengths - n_array {int} -- total size of dataset +from sklearn.base import BaseEstimator, TransformerMixin + +from .utils.combat_utils import ( + model_matrix, + check_ref_batch, + treat_covariates, + treat_batches, + calculate_mean_var, + calculate_stand_mean, + standardise_data, + fit_model, +) +from .utils.common_utils import ( + check_NAs, + check_mean_only, +) + + +class PyCombat(BaseEstimator, TransformerMixin): + def __init__(self): + """ + Corrects batch effect in microarray expression data. + Takes an gene expression file and a list of known batches corresponding to each sample. + """ + self.ref = None + + self.ref_batch = None + + self.gamma_star = None + self.delta_star = None + self.batch_design = None + self.n_batches = None + self.var_pooled = None + self.stand_mean = None + self.n_array = None + self.ref_batch = None + self.ref = None + self.batches = None + + def check_if_fitted(self): + error_message = "is not initialized, run .fit_transform() or .fit()" + + assert self.gamma_star is not None, f"gamma_star (estimated additive batch effect) {error_message}" + assert self.delta_star is not None, f"delta_star (estimated multiplicative batch effect) {error_message}" + assert self.batch_design is not None, f"batch_design (information about batches in design matrix) {error_message}" + assert self.n_batches is not None, f"n_batches (list of batches lengths ) {error_message}" + assert self.var_pooled is not None, f"var_pooled (standardised mean) {error_message}" + assert self.stand_mean is not None, f"stand_mean (Variance for each gene and each batch) {error_message}" + assert self.n_array is not None, f"n_array (total size of dataset) {error_message}" + assert self.batches is not None, f"batches (list of unique batches) {error_message}" + + def fit(self, data: pd.DataFrame, batch: list, mod: list = [], par_prior: bool = True, prior_plots: bool = False, mean_only: bool = False, ref_batch: int = None, precision: float = None) -> PyCombat: + """ + Initialized the important variables for the batch effect correction + + Args: + data (pd.DataFrame): The expression matrix (dataframe). It contains the information about the gene expression (rows) for each sample (columns). + batch (list): List of batch indexes. The batch list describes the batch for each sample. The batches list has as many elements as the number of columns in the expression matrix. + mod (list, optional): List (or list of lists) of covariate(s) indexes. The mod list describes the covariate(s) for each sample. Each mod list has as many elements as the number of columns in the expression matrix. Defaults to []. + par_prior (bool, optional): False for non-parametric estimation of batch effects. Defaults to True. + prior_plots (bool, optional): True if requires to plot the priors. Defaults to False. + mean_only (bool, optional): True if just adjusting the means and not individual batch effects. Defaults to False. + ref_batch (int, optional): reference batch selected. Defaults to None. + precision (float, optional): level of precision for precision computing. Defaults to None. + + Raises: + ValueError: Error will be trigerred if NaN is found in the data + + Returns: + PyCombat: The fitted object with all needed components for adjusting the data + """ + self.list_samples = data.columns + self.list_genes = data.index + self.data_value = data.values + + self.ref_batch = ref_batch - Returns: - B_hat {matrix} -- regression coefficients corresponding to the design matrix - grand_mean {matrix} -- Mean for each gene and each batch - var_pooled {matrix} -- Variance for each gene and each batch - """ - print("Standardizing Data across genes.") - if not(NAs): # NAs not supported - # B_hat is the vector of regression coefficients corresponding to the design matrix - B_hat = np.linalg.solve(np.dot(design, np.transpose( - design)), np.dot(design, np.transpose(dat))) + check_mean_only(mean_only) - # Calculates the general mean - if ref_batch is not None: - grand_mean = np.transpose(B_hat[ref]) - else: - grand_mean = np.dot(np.transpose( - [i / n_array for i in n_batches]), B_hat[0:n_batch]) - # Calculates the general variance - if not NAs: # NAs not supported - if ref_batch is not None: # depending on ref batch - ref_dat = np.transpose(np.transpose(dat)[batches[ref]]) - var_pooled = np.dot(np.square(ref_dat - np.transpose(np.dot(np.transpose( - design)[batches[ref]], B_hat))), [1/n_batches[ref]]*n_batches[ref]) + # Generate model matrix + batchmod = model_matrix(list(batch), intercept=False, drop_first=False) + + # self.ref (List[int]): the corresponding positions of the reference batch in the batch list + self.ref, batchmod = check_ref_batch(self.ref_batch, batch, batchmod) + + # self.batches (List[int]): list of unique batches + # self.n_batches (List[int]): list of batches lengths + # self.n_array (int): total size of dataset + n_batch, self.batches, self.n_batches, self.n_array = treat_batches(batch) + + design = treat_covariates(batchmod, mod, self.ref, n_batch) + + NAs = check_NAs(self.data_value) + if not NAs: + B_hat, grand_mean, self.var_pooled = calculate_mean_var(design, self.batches, self.ref, self.data_value, NAs, self.ref_batch, self.n_batches, n_batch, self.n_array) + self.stand_mean = calculate_stand_mean(grand_mean, self.n_array, design, n_batch, B_hat) + self.standardised_data = standardise_data(self.data_value, self.stand_mean, self.var_pooled, self.n_array) + + # self.gamma_star (np.array): estimated additive batch effect + # self.delta_star (np.array): estimated multiplicative batch effect + # self.batch_design (np.array): information about batches in design matrix + self.gamma_star, self.delta_star, self.batch_design = fit_model(design, n_batch, self.standardised_data, self.batches, mean_only, par_prior, precision, self.ref_batch, self.ref, NAs) else: - var_pooled = np.dot(np.square( - dat - np.transpose(np.dot(np.transpose(design), B_hat))), [1/n_array]*n_array) - - return(B_hat, grand_mean, var_pooled) - - -def calculate_stand_mean(grand_mean, n_array, design, n_batch, B_hat): - """ transform the format of the mean for substraction - - Arguments: - grand_mean {matrix} -- Mean for each gene and each batch - n_array {int} -- total size of dataset - design {[type]} -- design matrix for all covariates including batch - n_batch {int} -- number of batches - B_hat {matrix} -- regression coefficients corresponding to the design matrix - - Returns: - stand_mean {matrix} -- standardised mean - """ - stand_mean = np.dot(np.transpose(np.mat(grand_mean)), np.mat([1]*n_array)) - # corrects the mean with design matrix information - if design is not None: - tmp = np.ndarray.copy(design) - tmp[0:n_batch] = 0 - stand_mean = stand_mean + \ - np.transpose(np.dot(np.transpose(tmp), B_hat)) - return(stand_mean) - - -def standardise_data(dat, stand_mean, var_pooled, n_array): - """standardise the data: substract mean and divide by variance - - Arguments: - dat {matrix} -- data matrix - stand_mean {matrix} -- standardised mean - var_pooled {matrix} -- Variance for each gene and each batch - n_array {int} -- total size of dataset - - Returns: - s_data {matrix} -- standardised data matrix - """ - s_data = (dat - stand_mean) / \ - np.dot(np.transpose(np.mat(np.sqrt(var_pooled))), np.mat([1]*n_array)) - return(s_data) + raise ValueError("NaN value is not accepted") + return self -def fit_model(design, n_batch, s_data, batches, mean_only, par_prior, precision, ref_batch, ref, NAs): - print("Fitting L/S model and finding priors.") + def transform(self, data: pd.DataFrame) -> pd.DataFrame: + """ + Adjust the data, correct the data given the estimated batch effects - # fraction of design matrix related to batches - batch_design = design[0:n_batch] + Args: + data (pd.DataFrame): The expression matrix (dataframe). It contains the information about the gene expression (rows) for each sample (columns). - if not NAs: # CF SUPRA FOR NAs - # gamma_hat is the vector of additive batch effect - gamma_hat = np.linalg.solve(np.dot(batch_design, np.transpose(batch_design)), - np.dot(batch_design, np.transpose(s_data))) + Returns: + pd.DataFrame: The expression dataframe adjusted given the batch effects. + """ + # Check if the fit function has been initiated before transform + self.check_if_fitted() - delta_hat = [] # delta_hat is the vector of estimated multiplicative batch effect + print("Adjusting the Data") + bayes_data = np.transpose(self.standardised_data) + for i, batch_index in enumerate(self.batches): # for each batch, specific correction + bayes_data[batch_index] = (bayes_data[batch_index] - np.dot(np.transpose(self.batch_design)[batch_index], self.gamma_star)) / np.transpose(np.outer(np.sqrt(self.delta_star[i]), np.asarray([1]*self.n_batches[i]))) - if (mean_only): - # no variance if mean_only == True - delta_hat = [np.asarray([1]*len(s_data))] * len(batches) - else: - for i in batches: # feed incrementally delta_hat - list_map = np.transpose(np.transpose(s_data)[i]).var( - axis=1) # variance for each row - delta_hat.append(np.squeeze(np.asarray(list_map))) + # renormalise the data after correction: + # 1. multiply by variance + # 2. add mean + bayes_data = np.multiply(np.transpose(bayes_data), np.outer(np.sqrt(self.var_pooled), np.asarray([1]*self.n_array))) + self.stand_mean - gamma_bar = list(map(np.mean, gamma_hat)) # vector of means for gamma_hat - t2 = list(map(np.var, gamma_hat)) # vector of variances for gamma_hat - - # calculates hyper priors for gamma (additive batch effect) - a_prior = list( - map(partial(compute_prior, 'a', mean_only=mean_only), delta_hat)) - b_prior = list( - map(partial(compute_prior, 'b', mean_only=mean_only), delta_hat)) - - # initialise gamma and delta for parameters estimation - gamma_star = np.empty((n_batch, len(s_data))) - delta_star = np.empty((n_batch, len(s_data))) - - if par_prior: - # use param_fun function for parametric adjustments (cf. function definition) - print("Finding parametric adjustments.") - results = list(map(partial(param_fun, - s_data=s_data, - batches=batches, - mean_only=mean_only, - gamma_hat=gamma_hat, - gamma_bar=gamma_bar, - delta_hat=delta_hat, - t2=t2, - a_prior=a_prior, - b_prior=b_prior), range(n_batch))) - else: - # use nonparam_fun for non-parametric adjustments (cf. function definition) - print("Finding nonparametric adjustments") - results = list(map(partial(nonparam_fun, mean_only=mean_only, delta_hat=delta_hat, - s_data=s_data, batches=batches, gamma_hat=gamma_hat, precision=precision), range(n_batch))) - - for i in range(n_batch): # store the results in gamma/delta_star - results_i = results[i] - gamma_star[i], delta_star[i] = results_i[0], results_i[1] - - # update if reference batch (the reference batch is not supposed to be modified) - if ref_batch: - len_gamma_star_ref = len(gamma_star[ref]) - gamma_star[ref] = [0] * len_gamma_star_ref - delta_star[ref] = [1] * len_gamma_star_ref - - return(gamma_star, delta_star, batch_design) - - -def adjust_data(s_data, gamma_star, delta_star, batch_design, n_batches, var_pooled, stand_mean, n_array, ref_batch, ref, batches, dat): - """Adjust the data -- corrects for estimated batch effects - - Arguments: - s_data {matrix} -- standardised data matrix - gamma_star {matrix} -- estimated additive batch effect - delta_star {matrix} -- estimated multiplicative batch effect - batch_design {matrix} -- information about batches in design matrix - n_batches {int list} -- list of batches lengths - stand_mean {matrix} -- standardised mean - var_pooled {matrix} -- Variance for each gene and each batch - n_array {int} -- total size of dataset - ref_batch {int} -- reference batch - ref {int list} -- the corresponding positions of the reference batch in the batch list - batches {int list} -- list of unique batches - dat - - Returns: - bayes_data [matrix] -- data adjusted for correction of batch effects - """ - # Now we adjust the data: - # 1. substract additive batch effect (gamma_star) - # 2. divide by multiplicative batch effect (delta_star) - print("Adjusting the Data") - bayes_data = np.transpose(s_data) - j = 0 - for i in batches: # for each batch, specific correction - bayes_data[i] = (bayes_data[i] - np.dot(np.transpose(batch_design)[i], gamma_star)) / \ - np.transpose( - np.outer(np.sqrt(delta_star[j]), np.asarray([1]*n_batches[j]))) - j += 1 - - # renormalise the data after correction: - # 1. multiply by variance - # 2. add mean - bayes_data = np.multiply(np.transpose(bayes_data), np.outer( - np.sqrt(var_pooled), np.asarray([1]*n_array))) + stand_mean - - # correction for reference batch - if ref_batch: - bayes_data[batches[ref]] = dat[batches[ref]] - - # returns the data corrected for batch effects - return bayes_data - - -def pycombat(data, batch, mod=[], par_prior=True, prior_plots=False, mean_only=False, ref_batch=None, precision=None, **kwargs): - """Corrects batch effect in microarray expression data. Takes an gene expression file and a list of known batches corresponding to each sample. - - Arguments: - data {matrix} -- The expression matrix (dataframe). It contains the information about the gene expression (rows) for each sample (columns). - - batch {list} -- List of batch indexes. The batch list describes the batch for each sample. The batches list has as many elements as the number of columns in the expression matrix. - - Keyword Arguments: - mod {list} -- List (or list of lists) of covariate(s) indexes. The mod list describes the covariate(s) for each sample. Each mod list has as many elements as the number of columns in the expression matrix (default: {[]}). - - par_prior {bool} -- False for non-parametric estimation of batch effects (default: {True}). - - prior_plots {bool} -- True if requires to plot the priors (default: {False} -- Not implemented yet!). - - mean_only {bool} -- True iff just adjusting the means and not individual batch effects (default: {False}). - - ref_batch {int} -- reference batch selected (default: {None}). - - precision {float} -- level of precision for precision computing (default: {None}). - - Returns: - bayes_data_df -- The expression dataframe adjusted for batch effects. - """ - - list_samples = data.columns - list_genes = data.index - dat = data.values - - check_mean_only(mean_only) - - batchmod = define_batchmod(batch) - ref, batchmod = check_ref_batch(ref_batch, batch, batchmod) - n_batch, batches, n_batches, n_array = treat_batches(batch) - design = treat_covariates(batchmod, mod, ref, n_batch) - NAs = check_NAs(dat) - if not(NAs): - B_hat, grand_mean, var_pooled = calculate_mean_var( - design, batches, ref, dat, NAs, ref_batch, n_batches, n_batch, n_array) - stand_mean = calculate_stand_mean( - grand_mean, n_array, design, n_batch, B_hat) - s_data = standardise_data(dat, stand_mean, var_pooled, n_array) - gamma_star, delta_star, batch_design = fit_model( - design, n_batch, s_data, batches, mean_only, par_prior, precision, ref_batch, ref, NAs) - bayes_data = adjust_data(s_data, gamma_star, delta_star, batch_design, - n_batches, var_pooled, stand_mean, n_array, ref_batch, ref, batches, dat) - - bayes_data_df = pd.DataFrame(bayes_data, - columns = list_samples, - index = list_genes) + # correction for reference batch + if self.ref_batch: + bayes_data[self.batches[self.ref]] = self.data_value[self.batches[self.ref]] - return(bayes_data_df) - else: - raise ValueError("NaN value is not accepted") \ No newline at end of file + # returns the data corrected for batch effects + return pd.DataFrame(bayes_data, columns=self.list_samples, index=self.list_genes) diff --git a/combat/test_unit.py b/combat/test_unit.py deleted file mode 100644 index 00a1dfb..0000000 --- a/combat/test_unit.py +++ /dev/null @@ -1,211 +0,0 @@ -#----------------------------------------------------------------------------- -# Copyright (C) 2019-2020 A. Behdenna, A. Nordor, J. Haziza and A. Gema - -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -# For more information, please contact Abdelkader Behdenna / - -# file pycombat.py -# author A. Behdenna, J. Haziza, A. Gema, A. Nordor -# date Sept 2020 -#----------------------------------------------------------------------------- - - - - -# this file is only used for unit testing -# We import the function that will be tested one by one, incrementally -# Generates a report about the functions tested - -import numpy as np -import pandas as pd -#from patsy import dmatrix - -########## -# import function used for unit testing -from .pycombat import model_matrix, all_1 -#covariate_model_matrix -from .pycombat import compute_prior, postmean, postvar, it_sol, int_eprior -from .pycombat import check_mean_only, define_batchmod, check_ref_batch, treat_batches, treat_covariates, check_NAs -from .pycombat import calculate_mean_var, calculate_stand_mean -from .pycombat import standardise_data, fit_model, adjust_data -from .pycombat import pycombat - -########## -print("\n#### Unit Testing for pyComBat ####\n") - -########## -# Define constants for unit testing - -batch = np.asarray([1,1,1,2,2,3,3,3,3]) -# matrix = np.transpose(np.asmatrix([np.random.normal(size=1000,loc=3,scale=1),np.random.normal(size=1000,loc=3,scale=1),np.random.normal(size=1000,loc=3,scale=1), -# np.random.normal(size=1000,loc=2,scale=0.6),np.random.normal(size=1000,loc=2,scale=0.6), -# np.random.normal(size=1000,loc=4,scale=1),np.random.normal(size=1000,loc=4,scale=1),np.random.normal(size=1000,loc=4,scale=1),np.random.normal(size=1000,loc=4,scale=1)])) -matrix = np.transpose([np.random.normal(size=1000,loc=3,scale=1),np.random.normal(size=1000,loc=3,scale=1),np.random.normal(size=1000,loc=3,scale=1), - np.random.normal(size=1000,loc=2,scale=0.6),np.random.normal(size=1000,loc=2,scale=0.6), - np.random.normal(size=1000,loc=4,scale=1),np.random.normal(size=1000,loc=4,scale=1),np.random.normal(size=1000,loc=4,scale=1),np.random.normal(size=1000,loc=4,scale=1)]) - -matrix = pd.DataFrame(data=matrix,columns=["sample_"+str(i+1) for i in range(9)],index=["gene_"+str(i+1) for i in range(1000)]) - -print("Matrix and batch generated.") - -matrix_adjusted = pycombat(matrix,batch) - -list_samples = matrix_adjusted.columns -list_genes = matrix_adjusted.index - -print("Adjusted matrix generated.") - -########## -# local tests before unit testing - -def test_means(): - print("mean matrix:",np.mean(matrix)) - print("mean matrix (adjusted):",np.mean(matrix_adjusted)) - print("**********") - print("var matrix:",np.var(matrix)) - print("var matrix (adjusted):",np.var(matrix_adjusted)) - -########### -# useful constants for unit testing -ref_batch = None -mean_only = False -par_prior = False -precision = None -mod = [] -dat = matrix.values -#batchmod = define_batchmod(batch) -batchmod = model_matrix(list(batch), intercept=False, drop_first=False) -ref,batchmod = check_ref_batch(ref_batch,batch,batchmod) -n_batch, batches, n_batches, n_array = treat_batches(batch) -design = treat_covariates(batchmod, mod, ref, n_batch) -NAs = check_NAs(dat) -B_hat, grand_mean, var_pooled = calculate_mean_var(design, batches, ref, dat, NAs, ref_batch, n_batches, n_batch, n_array) -stand_mean = calculate_stand_mean(grand_mean, n_array, design, n_batch,B_hat) -s_data = standardise_data(dat, stand_mean, var_pooled, n_array) -gamma_star, delta_star, batch_design = fit_model(design,n_batch,s_data, batches, mean_only, par_prior, precision, ref_batch, ref, NAs) -bayes_data = adjust_data(s_data, gamma_star, delta_star, batch_design, n_batches, var_pooled, stand_mean, n_array, ref_batch, ref, batches, dat) - - -########## -# Unit tests - -# test for compute_prior -def test_compute_prior(): - print("aprior",compute_prior("a",gamma_star,False)) - assert compute_prior("a",gamma_star,True) == 1 - print("bprior",compute_prior("b",gamma_star,False)) - assert compute_prior("b",gamma_star,True) == 1 - -# test for postmean -def test_postmean(): - assert np.shape(postmean(gamma_star,delta_star,gamma_star,delta_star)) == np.shape(gamma_star) - -# test for postvar -def test_postvar(): - assert np.sum(postvar([2,4,6],2,1,1) - [2,3,4]) == 0 - -# test for it_sol -def test_it_sol(): - () - -# test for int_eprior -def test_int_eprior(): - () - -# test for model_matrix -def test_model_matrix(): - model_matrix_test = model_matrix([1,1,0,1,0]) - assert np.shape(model_matrix_test) == (5,2) - assert list(model_matrix_test[0]) == [1.0,1.0] - -# tests for all_1 function -def test_all_1(): - assert all_1(np.array([1,1,1,1,1])) == True - - assert all_1(np.array([1,1,1,1,0])) == False - assert all_1(np.array([0,0,0,0,0])) == False - - assert all_1(np.array([1.5,0.5,1,1,1])) == False # This test to show the limit of the method we use - - -# test for check_mean_only -def test_check_mean_only(): - check_mean_only(True) - check_mean_only(False) - print("Only one line of text should have been printed above.") - -# test for define_batchmode -def test_define_batchmod(): - assert np.shape(define_batchmod(batch)) == (9,3) - -# test for check_ref_batch -def test_check_ref_batch(): - assert check_ref_batch(1,batch,batchmod) == (0, batchmod) - assert check_ref_batch(2,batch,batchmod) == (1, batchmod) - print("Using batch 1 then 2. Above lines should inform on that.") - assert check_ref_batch(None,batch,batchmod) == (None, batchmod) - -# test for treat_batches -def test_treat_batches(): - assert n_batch == 3 - assert batches[0].tolist() == [0,1,2] - assert batches[1].tolist() == [3,4] - assert batches[2].tolist() == [5,6,7,8] - assert n_batches == [3,2,4] - assert n_array == 9 - -# test for treat_covariates -def test_treat_covariates(): - batchmod = define_batchmod(batch) - assert np.sum(design - np.transpose(batchmod)) == 0 - -# test for check_NAs -def test_check_NAs(): - assert check_NAs([0,1,2]) == False - -# test for calculate_mean_var -def test_calculate_mean_var(): - assert np.shape(B_hat)[0] == np.shape(design)[0] - assert np.shape(grand_mean)[0] == np.shape(dat)[0] - assert np.shape(var_pooled)[0] == np.shape(dat)[0] - -# test for calculate_stand_mean -def test_calculate_stand_mean(): - assert np.shape(stand_mean) == np.shape(dat) - -# test for standardise_data -def test_standardise_data(): - assert np.shape(s_data) == np.shape(dat) - -# test for fit_model -def test_fit_model(): - assert np.shape(gamma_star)[1] == np.shape(dat)[0] - assert np.shape(delta_star)[1] == np.shape(dat)[0] - assert np.shape(batch_design) == np.shape(design) - -#test for adjust_data -def test_adjust_data(): - assert np.shape(bayes_data) == np.shape(dat) - -# general tests on pyComBat -def test_pycombat(): - assert np.shape(matrix) == np.shape(matrix_adjusted) - assert abs(np.mean(matrix.values)-np.mean(matrix_adjusted.values)) <= np.mean(matrix.values)*0.05 - assert np.var(matrix_adjusted.values) <= np.var(matrix.values) - -if __name__ == '__main__': - test_means() - print("\n*** UNIT TESTING ***\n") - # unittest.main() \ No newline at end of file diff --git a/combat/utils/__init__.py b/combat/utils/__init__.py new file mode 100644 index 0000000..d6014f4 --- /dev/null +++ b/combat/utils/__init__.py @@ -0,0 +1,2 @@ +from .combat_utils import * +from .common_utils import * \ No newline at end of file diff --git a/combat/utils/combat_utils.py b/combat/utils/combat_utils.py new file mode 100644 index 0000000..18b25a2 --- /dev/null +++ b/combat/utils/combat_utils.py @@ -0,0 +1,547 @@ +from typing_extensions import ( + Literal, + Tuple +) +from typing import List, Optional + +import numpy as np +from functools import partial +import mpmath as mp +import pandas as pd + +from .common_utils import check_all_ones + + +def model_matrix(info: list, intercept: bool = True, drop_first: bool = True) -> np.array: + """ + Creates the model_matrix from batch list + + Args: + info (list): list info with batch or covariates data + intercept (bool, optional): boolean for intercept in model matrix. Defaults to True. + drop_first (bool, optional): boolean for drop the first row. Defaults to True. + + Returns: + np.array: Model matrix generate from batch list + """ + if not isinstance(info[0],list) : + info = [info] + else: + info = info + info_dict = {} + for i in range(len(info)): + info_dict[f"col{str(i)}"] = list(map(str,info[i])) + df = pd.get_dummies(pd.DataFrame(info_dict), drop_first=drop_first, dtype=float) + if intercept: + df["intercept"] = 1.0 + return df.to_numpy() + + +def compute_prior(prior: Literal["a", "b"], gamma_hat: np.array, mean_only: bool) -> float: + """ + aprior and bprior are useful to compute "hyper-prior values" + -> prior parameters used to estimate the prior gamma distribution for multiplicative batch effect + aprior - calculates empirical hyper-prior values + + Arguments: + prior {char} -- 'a' or 'b' depending of the prior to be calculated + gamma_hat {matrix} -- matrix of additive batch effect + mean_only {bool} -- True iff mean_only selected + + Returns: + float -- [the prior calculated (aprior or bprior) + + Args: + prior (str): 'a' or 'b' depending of the prior to be calculated + gamma_hat (np.array): [description] + mean_only (bool): [description] + + Returns: + float: [description] + """ + if mean_only: + return 1 + + m = np.mean(gamma_hat) + s2 = np.var(gamma_hat) + + if prior == 'a': + calculated_prior = (2*s2+m*m)/s2 + elif prior == 'b': + calculated_prior = (m*s2+m*m*m)/s2 + + return calculated_prior + + +def postmean(g_bar: np.array, d_star: np.array, t2_n: np.array, t2_n_g_hat: np.array) -> np.array: + """ + Estimates additive batch effect + + Args: + g_bar (np.array): matrix of additive batch effect + d_star (np.array): matrix of multiplicative batch effect + t2_n (np.array): [description] + t2_n_g_hat (np.array): [description] + + Returns: + np.array: matrix of estimated additive batch effect + """ + + return np.divide(t2_n_g_hat + d_star * g_bar, np.asarray(t2_n + d_star)) + + +def postvar(sum2: np.array, n: float, a: float, b: float) -> np.array: + """ + Estimates multiplicative batch effect + + Args: + sum2 (np.array): Vector + n (float): [description] + a (float): aprior + b (float): bprior + + Returns: + np.array: matrix of estimated multiplicative batch effect + """ + return(np.divide((np.multiply(0.5, sum2)+b), (np.multiply(0.5, n)+a-1))) + + +def it_sol(sdat: np.array, g_hat: np.array, d_hat: np.array, g_bar: np.array, t2: np.array, a: float, b: float, conv: float = 0.0001, exit_iteration: float = 10e5) -> np.array: + """ + Iterative solution for Empirical Bayesian method + + Args: + sdat (np.array): data matrix + g_hat (np.array): matrix of the average additive batch effect + d_hat (np.array): matrix of the average multiplicative batch effect + g_bar (np.array): matrix of the additive batch effect + t2 (np.array): [description] + a (float): aprior + b (float): bprior + conv (float, optional): convergence criterion. Defaults to 0.0001. + exit_iteration (float, optional): maximum number of iterations before exit. Defaults to 10e5. + + Returns: + np.array: An array of array with the estimated additive and multiplicative batch effect + """ + + n = [len(i) for i in np.asarray(sdat)] + t2_n = np.multiply(t2, n) + t2_n_g_hat = np.multiply(t2_n, g_hat) + g_old = np.ndarray.copy(g_hat) + d_old = np.ndarray.copy(d_hat) + change = 1 + count = 0 # number of steps needed (for diagnostic only) + + # convergence criteria, if new-old < conv, then stop + while (change > conv) and (count < exit_iteration): + g_new = postmean(g_bar, d_old, t2_n, t2_n_g_hat) # updated additive batch effect + sum2 = np.sum(np.asarray(np.square( + sdat-np.outer(g_new[0][0], np.ones(np.ma.size(sdat, axis=1))))), axis=1) + d_new = postvar(sum2, n, a, b) # updated multiplicative batch effect + change = max(np.amax(np.absolute(g_new-np.asarray(g_old))/np.asarray(g_old)), np.amax( + np.absolute(d_new-d_old)/d_old)) # maximum difference between new and old estimate + g_old = np.ndarray.copy(g_new) # save value for g + d_old = np.ndarray.copy(d_new) # save value for d + count += 1 + adjust = np.asarray([g_new, d_new]) + + return adjust + + +def int_eprior(sdat: np.array, g_hat: np.array, d_hat: np.array, precision: float) -> np.array: + """ + int_eprior - Monte Carlo integration function to find nonparametric adjustments + Johnson et al (Biostatistics 2007, supp.mat.) show that we can estimate the multiplicative and additive batch effects with an integral + This integral is numerically computed through Monte Carlo inegration (iterative method) + + Args: + sdat (np.array): data matrix + g_hat (np.array): matrix of the average additive batch effect + d_hat (np.array): matrix of the average multiplicative batch effect + precision (float): level of precision for precision computing + + Returns: + np.array: An array of array with the estimated additive and multiplicative batch effect + """ + g_star = [] + d_star = [] + # use this variable to only print error message once if approximation used + test_approximation = 0 + for i in range(len(sdat)): + # additive batch effect + g = np.asarray(np.delete(np.transpose(g_hat), i)) + + # multiplicative batch effect + d = np.asarray(np.delete(np.transpose(d_hat), i)) + x = np.asarray(np.transpose(sdat[i])) + n = len(x) + j = [1]*n + dat = np.repeat(x, len(np.transpose(g)), axis=1) + resid2 = np.square(dat-g) + sum2 = np.dot(np.transpose(resid2), j) + + # /begin{handling high precision computing} + temp_2d = 2*d + if (precision == None): + LH = np.power(1 / (np.pi * temp_2d), n / 2) * np.exp(np.negative(sum2) / (temp_2d)) + else: + # only if precision parameter informed + # increase the precision of the computing (if negative exponential too close to 0) + mp.dps = precision + buf_exp = list(map(mp.exp, np.negative(sum2)/(temp_2d))) + buf_pow = list(map(partial(mp.power, y=n/2), 1/(np.pi*temp_2d))) + LH = buf_pow * buf_exp # likelihood + # /end{handling high precision computing} + LH = np.nan_to_num(LH) # corrects NaNs in likelihood + if np.sum(LH) == 0 and test_approximation == 0: # correction for LH full of 0.0 + test_approximation = 1 # this message won't appear again + print("###\nValues too small, approximation applied to avoid division by 0.\nPrecision mode can correct this problem, but increases computation time.\n###") + LH[LH == 0] = np.exp(-745) + g_star.append(np.sum(g * LH) / np.sum(LH)) + d_star.append(np.sum(d * LH) / np.sum(LH)) + else: + g_star.append(np.sum(g * LH) / np.sum(LH)) + d_star.append(np.sum(d * LH) / np.sum(LH)) + adjust = np.asarray([np.asarray(g_star), np.asarray(d_star)]) + + return adjust + + +def param_fun(i: int, standardised_data: np.array, batches: List[list], mean_only: bool, gamma_hat: np.array, gamma_bar: np.array, delta_hat: np.array, t2: np.array, a_prior: float, b_prior: float) -> Tuple[np.array, np.array]: + """ + Parametric estimation of batch effects + + Args: + i (int): column index + standardised_data (np.array): data matrix + batches (List[list]): list of list of batches' elements + mean_only (bool): True if mean_only selected + gamma_hat (np.array): average additive batch effect + gamma_bar (np.array): estimated additive batch effect + delta_hat (np.array): average multiplicative batch effect + t2 (np.array): [description] + a_prior (float): aprior + b_prior (float): bprior + + Returns: + Tuple[np.array, np.array]: estimated adjusted additive (gamma_star) and multiplicative (delta_star) batch effect + """ + if mean_only: # if mean_only, no need for complex method: batch effect is immediately calculated + t2_n = np.multiply(t2[i], 1) + t2_n_g_hat = np.multiply(t2_n, gamma_hat[i]) + gamma_star = postmean(gamma_bar[i], 1, t2_n, t2_n_g_hat) # additive batch effect + delta_star = [1]*len(standardised_data) # multiplicative batch effect + else: # if not(mean_only) then use it_solve + temp = it_sol(np.transpose(np.transpose(standardised_data)[ + batches[i]]), gamma_hat[i], delta_hat[i], gamma_bar[i], t2[i], a_prior[i], b_prior[i]) + gamma_star = temp[0] # additive batch effect + delta_star = temp[1] # multiplicative batch effect + return (gamma_star, delta_star) + + +# FIXME: Params order is not the same +def nonparam_fun(i: int, mean_only: bool, delta_hat: np.array, standardised_data: np.array, batches: List[list], gamma_hat: np.array, precision: float) -> Tuple[np.array, np.array]: + """ + Non-parametric estimation + + Args: + i (int): column index + mean_only (bool): True if mean_only selected + delta_hat (np.array): estimated multiplicative batch effect + standardised_data (np.array): data matrix + batches (List[list]): list of list of batches' elements + gamma_hat (np.array): estimated additive batch effect + precision (float): level of precision for precision computing + + Returns: + Tuple[np.array, np.array]: estimated adjusted additive and multiplicative batch effect + """ + if mean_only: # if mean only, change delta_hat to vector of 1s + delta_hat[i] = [1]*len(delta_hat[i]) + # use int_eprior for non-parametric estimation + temp = int_eprior(np.transpose(np.transpose(standardised_data)[ + batches[i]]), gamma_hat[i], delta_hat[i], precision) + return (temp[0], temp[1]) + + +def check_ref_batch(ref_batch: int, batch: List[int], batchmod: np.array) -> Tuple[Optional[List[int]], np.array]: + """ + check ref_batch option and treat it if needed + + Args: + ref_batch (int): the reference batch + batch (List[int]): list of batch id + batchmod (np.array): model matrix related to batches + + Raises: + ValueError: Reference level ref_batch must be one of the levels of batch + + Returns: + Tuple[Optional[List[int]], np.array]: + - the corresponding positions of the reference batch in the batch list + - updated model matrix related to batches, with reference + """ + if ref_batch is not None: + if ref_batch not in batch: + raise ValueError("Reference level ref_batch must be one of the levels of batch.") + + print(f"Using batch {str(ref_batch)} as a reference batch.") + # ref keeps in memory the columns concerned by the reference batch + ref = np.where(np.unique(batch) == ref_batch)[0][0] + # updates batchmod with reference + batchmod[:, ref] = 1 + else: + ref = None # default settings + + return (ref, batchmod) + + +def treat_covariates(batchmod: np.array, mod: np.array, ref: int, n_batch: int) -> np.array: + """ + treat covariates + + Args: + batchmod (np.array): model matrix for batch + mod (np.array): model matrix for other covariates + ref (int): index of reference batch + n_batch (int): number of batches + + Raises: + ValueError: The covariate is confunded with batch. Remove the covariate and rerun pyComBat. + ValueError: The covariates are confounded! Please remove one or more of the covariates so the design is not confounded. + ValueError: At least one covariate is confounded with batch. Please remove confounded covariates and rerun pyComBat. + + Returns: + np.array: model matrix for all covariates, including batch + """ + # design matrix for sample conditions + if mod == []: + design = batchmod + else: + mod_matrix = model_matrix(mod, intercept=True) + design = np.concatenate((batchmod, mod_matrix), axis=1) + check = list(map(check_all_ones, np.transpose(design))) + if ref is not None: + check[ref] = False # the reference in not considered as a covariate + design = design[:, ~np.array(check)] + design = np.transpose(design) + + print(f"Adjusting for {str(len(design)-len(np.transpose(batchmod)))} covariate(s) or covariate level(s).") + + # if matrix cannot be invertible, different cases + if np.linalg.matrix_rank(design) < len(design): + if len(design) == n_batch + 1: # case 1: covariate confunded with a batch + raise ValueError("The covariate is confunded with batch. Remove the covariate and rerun pyComBat.") + if len(design) > n_batch + 1: # case 2: multiple covariates confunded with a batch + if np.linalg.matrix_rank(np.transpose(design)[:n_batch]) < len(design): + raise ValueError("The covariates are confounded! Please remove one or more of the covariates so the design is not confounded.") + else: # case 3: at least a covariate confunded with a batch + raise ValueError("At least one covariate is confounded with batch. Please remove confounded covariates and rerun pyComBat") + return design + + +def treat_batches(batch: list) -> Tuple[int, List[int], List[int], int]: + """ + treat batches + + Args: + batch (list): batch list + + Returns: + Tuple[int, List[int], List[int], int]: + - number of batches + - list of unique batches + - list of batches lengths + - total size of dataset + """ + n_batch = len(np.unique(batch)) # number of batches + print(f"Found {n_batch} batches.") + + batches = [] # list of lists, contains the list of position for each batch + for i in range(n_batch): + batches.append(np.where(batch == np.unique(batch)[i])[0]) + batches = np.asarray(batches) + n_batches = list(map(len, batches)) + if 1 in n_batches: + #mean_only = True # no variance if only one sample in a batch - mean_only has to be used + print("\nOne batch has only one sample, try setting mean_only=True.\n") + n_array = sum(n_batches) + return (n_batch, batches, n_batches, n_array) + + +def calculate_mean_var(design: np.array, batches: List[int], ref: int, dat: np.array, NAs: bool, ref_batch: int, n_batches: List[int], n_batch: int, n_array: int) -> Tuple[np.array, np.array, np.array]: + """ + calculates the Normalisation factors + + Args: + design (np.array): model matrix for all covariates + batches (List[int]): list of unique batches + ref (int): index of reference batch + dat (np.array): data matrix + NAs (bool): presence of NaNs in the data matrix + ref_batch (int): reference batch + n_batches (List[int]): list of batches lengths + n_batch (int): number of batches + n_array (int): total size of dataset + + Returns: + Tuple[np.array, np.array, np.array]: + - regression coefficients corresponding to the design matrix + - Mean for each gene and each batch + - Variance for each gene and each batch + """ + print("Standardizing Data across genes.") + if not(NAs): # NAs not supported + # B_hat is the vector of regression coefficients corresponding to the design matrix + B_hat = np.linalg.solve(np.dot(design, np.transpose( + design)), np.dot(design, np.transpose(dat))) + + # Calculates the general mean + if ref_batch is not None: + grand_mean = np.transpose(B_hat[ref]) + else: + grand_mean = np.dot(np.transpose( + [i / n_array for i in n_batches]), B_hat[0:n_batch]) + + # Calculates the general variance + if not NAs: # NAs not supported + if ref_batch is not None: # depending on ref batch + ref_dat = np.transpose(np.transpose(dat)[batches[ref]]) + var_pooled = np.dot(np.square(ref_dat - np.transpose(np.dot(np.transpose( + design)[batches[ref]], B_hat))), [1/n_batches[ref]]*n_batches[ref]) + else: + var_pooled = np.dot(np.square( + dat - np.transpose(np.dot(np.transpose(design), B_hat))), [1/n_array]*n_array) + + return (B_hat, grand_mean, var_pooled) + + +def calculate_stand_mean(grand_mean: np.array, n_array: int, design: np.array, n_batch: int, B_hat: np.array) -> np.array: + """ + transform the format of the mean for substraction + + Args: + grand_mean (np.array): Mean for each gene and each batch + n_array (int): total size of dataset + design (np.array): design matrix for all covariates including batch + n_batch (int): number of batches + B_hat (np.array): regression coefficients corresponding to the design matrix + + Returns: + np.array: standardised mean + """ + stand_mean = np.dot(np.transpose(np.mat(grand_mean)), np.mat([1]*n_array)) + # corrects the mean with design matrix information + if design is not None: + tmp = np.ndarray.copy(design) + tmp[0:n_batch] = 0 + stand_mean = stand_mean + \ + np.transpose(np.dot(np.transpose(tmp), B_hat)) + return stand_mean + + +def standardise_data(dat: np.array, stand_mean: np.array, var_pooled: np.array, n_array: int) -> np.array: + """ + standardise the data: substract mean and divide by variance + + Args: + dat (np.array): data matrix + stand_mean (np.array): standardised mean + var_pooled (np.array): Variance for each gene and each batch + n_array (int): total size of dataset + + Returns: + np.array: standardised data matrix + """ + standardised_data = (dat - stand_mean) / \ + np.dot(np.transpose(np.mat(np.sqrt(var_pooled))), np.mat([1]*n_array)) + return standardised_data + + +def fit_model(design: np.array, n_batch: int, standardised_data: np.array, batches: List[list], mean_only: bool, par_prior: bool, precision: float, ref_batch: int, ref: List[int], NAs: bool) -> Tuple[np.array, np.array, np.array]: + """ + Fitting L/S model and finding priors. + + Args: + design (np.array): model matrix for all covariates, including batch + n_batch (int): number of batches + standardised_data (np.array): standardised data matrix + batches (List[list]): list of list of batches' elements + mean_only (bool): True if just adjusting the means and not individual batch effects + par_prior (bool): False for non-parametric estimation of batch effects + precision (float): level of precision for precision computing + ref_batch (int): reference batch selected + ref (List[int]): the corresponding positions of the reference batch in the batch list + NAs (bool): boolean characterising the presence of NaNs in the data matrix + + Returns: + Tuple[np.array, np.array, np.array]: + - estimated additive batch effect + - estimated multiplicative batch effect + - information about batches in design matrix + """ + print("Fitting L/S model and finding priors.") + + # fraction of design matrix related to batches + batch_design = design[0:n_batch] + + if not NAs: # CF SUPRA FOR NAs + # gamma_hat is the vector of additive batch effect + gamma_hat = np.linalg.solve(np.dot(batch_design, np.transpose(batch_design)), + np.dot(batch_design, np.transpose(standardised_data))) + + delta_hat = [] # delta_hat is the vector of estimated multiplicative batch effect + + if (mean_only): + # no variance if mean_only == True + delta_hat = [np.asarray([1]*len(standardised_data))] * len(batches) + else: + for i in batches: # feed incrementally delta_hat + list_map = np.transpose(np.transpose(standardised_data)[i]).var( + axis=1) # variance for each row + delta_hat.append(np.squeeze(np.asarray(list_map))) + + gamma_bar = list(map(np.mean, gamma_hat)) # vector of means for gamma_hat + t2 = list(map(np.var, gamma_hat)) # vector of variances for gamma_hat + + # calculates hyper priors for gamma (additive batch effect) + a_prior = list( + map(partial(compute_prior, 'a', mean_only=mean_only), delta_hat)) + b_prior = list( + map(partial(compute_prior, 'b', mean_only=mean_only), delta_hat)) + + # initialise gamma and delta for parameters estimation + gamma_star = np.empty((n_batch, len(standardised_data))) + delta_star = np.empty((n_batch, len(standardised_data))) + + if par_prior: + # use param_fun function for parametric adjustments (cf. function definition) + print("Finding parametric adjustments.") + results = list(map(partial(param_fun, + standardised_data=standardised_data, + batches=batches, + mean_only=mean_only, + gamma_hat=gamma_hat, + gamma_bar=gamma_bar, + delta_hat=delta_hat, + t2=t2, + a_prior=a_prior, + b_prior=b_prior), range(n_batch))) + else: + # use nonparam_fun for non-parametric adjustments (cf. function definition) + print("Finding nonparametric adjustments") + results = list(map(partial(nonparam_fun, mean_only=mean_only, delta_hat=delta_hat, + standardised_data=standardised_data, batches=batches, gamma_hat=gamma_hat, precision=precision), range(n_batch))) + + for i in range(n_batch): # store the results in gamma/delta_star + results_i = results[i] + gamma_star[i], delta_star[i] = results_i[0], results_i[1] + + # update if reference batch (the reference batch is not supposed to be modified) + if ref_batch: + len_gamma_star_ref = len(gamma_star[ref]) + gamma_star[ref] = [0] * len_gamma_star_ref + delta_star[ref] = [1] * len_gamma_star_ref + + return (gamma_star, delta_star, batch_design) \ No newline at end of file diff --git a/combat/utils/common_utils.py b/combat/utils/common_utils.py new file mode 100644 index 0000000..a90f997 --- /dev/null +++ b/combat/utils/common_utils.py @@ -0,0 +1,41 @@ +import numpy as np + + +def check_all_ones(list_of_elements:list) -> bool: + """ + Check if all elements in a list are 1s + + Args: + list_of_elements (list): list of elements + + Returns: + bool: True if all elements of the list are 1s + """ + return all(list_of_elements == 1) + + +def check_mean_only(mean_only: bool) -> None: + """ + Check mean_only option + + Args: + mean_only (bool): user's choice regarding the mean_only option + """ + if mean_only == True: + print("Using mean only version") + + +def check_NAs(data): + """check if NaNs - in theory, we construct the data without NAs + + Arguments: + data {matrix} -- the data matrix + + Returns: + NAs {bool} -- boolean characterising the presence of NaNs in the data matrix + """ + # NAs = True in (np.isnan(dat)) + NAs = np.isnan(np.sum(data)) # Check if NaN exists + if NAs: + print("Found missing data values. Please remove all missing values before proceeding with pyComBat.") + return NAs \ No newline at end of file From 5cb8641713f10ed5d13c2addcd9438d1ad687e15 Mon Sep 17 00:00:00 2001 From: aryo Date: Mon, 26 Jul 2021 21:12:28 +0200 Subject: [PATCH 03/12] Unit tests for utils and pycombat class --- tests/test_combat_utils.py | 131 +++++++++++++++++++++++++++++++++++++ tests/test_common_utils.py | 29 ++++++++ tests/test_pycombat.py | 72 ++++++++++++++++++++ 3 files changed, 232 insertions(+) create mode 100644 tests/test_combat_utils.py create mode 100644 tests/test_common_utils.py create mode 100644 tests/test_pycombat.py diff --git a/tests/test_combat_utils.py b/tests/test_combat_utils.py new file mode 100644 index 0000000..1cd8506 --- /dev/null +++ b/tests/test_combat_utils.py @@ -0,0 +1,131 @@ +import pytest + +import numpy as np +import pandas as pd + +from combat.utils.combat_utils import model_matrix, compute_prior, postmean, postvar, it_sol, int_eprior, check_ref_batch, treat_batches, treat_covariates, calculate_mean_var, calculate_stand_mean, standardise_data, fit_model +from combat.utils.common_utils import check_NAs + +""" +Load the shared variables +""" +batch = np.asarray([1, 1, 1, 2, 2, 3, 3, 3, 3]) +matrix = np.transpose([np.random.normal(size=1000, loc=3, scale=1), np.random.normal(size=1000, loc=3, scale=1), np.random.normal(size=1000, loc=3, scale=1), + np.random.normal(size=1000, loc=2, scale=0.6), np.random.normal(size=1000, loc=2, scale=0.6), + np.random.normal(size=1000, loc=4, scale=1), np.random.normal(size=1000, loc=4, scale=1), np.random.normal(size=1000, loc=4, scale=1), np.random.normal(size=1000, loc=4, scale=1)]) + +matrix = pd.DataFrame(data=matrix,columns=["sample_"+str(i+1) for i in range(9)],index=["gene_"+str(i+1) for i in range(1000)]) +ref_batch = None +mean_only = False +par_prior = False +precision = None +mod = [] +dat = matrix.values +batchmod = model_matrix(list(batch), intercept=False, drop_first=False) +ref,batchmod = check_ref_batch(ref_batch,batch,batchmod) +n_batch, batches, n_batches, n_array = treat_batches(batch) +design = treat_covariates(batchmod, mod, ref, n_batch) +NAs = check_NAs(dat) +B_hat, grand_mean, var_pooled = calculate_mean_var(design, batches, ref, dat, NAs, ref_batch, n_batches, n_batch, n_array) +stand_mean = calculate_stand_mean(grand_mean, n_array, design, n_batch,B_hat) +s_data = standardise_data(dat, stand_mean, var_pooled, n_array) +gamma_star, delta_star, batch_design = fit_model(design,n_batch,s_data, batches, mean_only, par_prior, precision, ref_batch, ref, NAs) + + +# test for compute_prior +def test_compute_prior(): + print("aprior", compute_prior("a", gamma_star, False)) + assert compute_prior("a", gamma_star, True) == 1 + print("bprior", compute_prior("b", gamma_star, False)) + assert compute_prior("b", gamma_star, True) == 1 + + +# test for model_matrix +def test_model_matrix(): + model_matrix_test = model_matrix([1, 1, 0, 1, 0]) + assert np.shape(model_matrix_test) == (5, 2) + assert list(model_matrix_test[0]) == [1.0, 1.0] + + + +# test for compute_prior +def test_compute_prior(): + print("aprior", compute_prior("a", gamma_star, False)) + assert compute_prior("a", gamma_star, True) == 1 + print("bprior", compute_prior("b", gamma_star, False)) + assert compute_prior("b", gamma_star, True) == 1 + + +# test for postmean +def test_postmean(): + assert np.shape(postmean(gamma_star, delta_star, gamma_star, delta_star)) == np.shape(gamma_star) + + +# test for postvar +def test_postvar(): + assert np.sum(postvar([2, 4, 6], 2, 1, 1) - [2, 3, 4]) == 0 + + +# test for it_sol +def test_it_sol(): + assert 1 == 1 + + +# test for int_eprior +def test_int_eprior(): + assert 1 == 1 + + +# test for model_matrix +def test_model_matrix(): + model_matrix_test = model_matrix([1, 1, 0, 1, 0]) + assert np.shape(model_matrix_test) == (5, 2) + assert list(model_matrix_test[0]) == [1.0, 1.0] + + +# test for check_ref_batch +def test_check_ref_batch(): + assert check_ref_batch(1, batch, batchmod) == (0, batchmod) + assert check_ref_batch(2, batch, batchmod) == (1, batchmod) + print("Using batch 1 then 2. Above lines should inform on that.") + assert check_ref_batch(None, batch, batchmod) == (None, batchmod) + + +# test for treat_batches +def test_treat_batches(): + assert n_batch == 3 + assert batches[0].tolist() == [0, 1, 2] + assert batches[1].tolist() == [3, 4] + assert batches[2].tolist() == [5, 6, 7, 8] + assert n_batches == [3, 2, 4] + assert n_array == 9 + + +# test for treat_covariates +def test_treat_covariates(): + batchmod = model_matrix(list(batch), intercept=False, drop_first=False) + assert np.sum(design - np.transpose(batchmod)) == 0 + + +# test for calculate_mean_var +def test_calculate_mean_var(): + assert np.shape(B_hat)[0] == np.shape(design)[0] + assert np.shape(grand_mean)[0] == np.shape(dat)[0] + assert np.shape(var_pooled)[0] == np.shape(dat)[0] + + +# test for calculate_stand_mean +def test_calculate_stand_mean(): + assert np.shape(stand_mean) == np.shape(dat) + + +# test for standardise_data +def test_standardise_data(): + assert np.shape(s_data) == np.shape(dat) + + +# test for fit_model +def test_fit_model(): + assert np.shape(gamma_star)[1] == np.shape(dat)[0] + assert np.shape(delta_star)[1] == np.shape(dat)[0] + assert np.shape(batch_design) == np.shape(design) diff --git a/tests/test_common_utils.py b/tests/test_common_utils.py new file mode 100644 index 0000000..4cf4eee --- /dev/null +++ b/tests/test_common_utils.py @@ -0,0 +1,29 @@ +import pytest + +import numpy as np +import pandas as pd + +from combat.utils.common_utils import check_all_ones, check_mean_only, check_NAs + + +# tests for check_all_ones function +def test_check_all_ones(): + assert check_all_ones(np.array([1, 1, 1, 1, 1])) == True + + assert check_all_ones(np.array([1, 1, 1, 1, 0])) == False + assert check_all_ones(np.array([0, 0, 0, 0, 0])) == False + + assert check_all_ones(np.array([1.5, 0.5, 1, 1, 1])) == False # This test to show the limit of the method we use + + +# test for check_mean_only +def test_check_mean_only(): + check_mean_only(True) + check_mean_only(False) + print("Only one line of text should have been printed above.") + + +# test for check_NAs +def test_check_NAs(): + assert check_NAs([0,1,2]) == False + assert check_NAs([0,np.nan,2]) == True \ No newline at end of file diff --git a/tests/test_pycombat.py b/tests/test_pycombat.py new file mode 100644 index 0000000..1cf6981 --- /dev/null +++ b/tests/test_pycombat.py @@ -0,0 +1,72 @@ +#----------------------------------------------------------------------------- +# Copyright (C) 2019-2020 A. Behdenna, A. Nordor, J. Haziza and A. Gema + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# For more information, please contact Abdelkader Behdenna / + +# file pycombat.py +# author A. Behdenna, J. Haziza, A. Gema, A. Nordor +# date Sept 2020 +#----------------------------------------------------------------------------- + + + + +# this file is only used for unit testing +# We import the function that will be tested one by one, incrementally +# Generates a report about the functions tested + +import numpy as np +import pandas as pd + +########## +# import function used for unit testing +from combat import PyCombat + +########## +print("\n#### Unit Testing for pyComBat ####\n") + +########## +# Define constants for unit testing + +batch = np.asarray([1, 1, 1, 2, 2, 3, 3, 3, 3]) +matrix = np.transpose([np.random.normal(size=1000, loc=3, scale=1), np.random.normal(size=1000, loc=3, scale=1), np.random.normal(size=1000, loc=3, scale=1), + np.random.normal(size=1000, loc=2, scale=0.6), np.random.normal(size=1000, loc=2, scale=0.6), + np.random.normal(size=1000, loc=4, scale=1), np.random.normal(size=1000, loc=4, scale=1), np.random.normal(size=1000, loc=4, scale=1), np.random.normal(size=1000, loc=4, scale=1)]) + +matrix = pd.DataFrame(data=matrix,columns=["sample_"+str(i+1) for i in range(9)],index=["gene_"+str(i+1) for i in range(1000)]) + +print("Matrix and batch generated.") + +matrix_adjusted = PyCombat().fit_transform(matrix, batch) + +print("Adjusted matrix generated.") + +########## +# local tests before unit testing + +def test_means(): + print(f"mean matrix: {np.mean(matrix)}") + print(f"mean matrix (adjusted): {np.mean(matrix_adjusted)}") + print("**********") + print(f"var matrix: {np.var(matrix)}") + print(f"var matrix (adjusted): {np.var(matrix_adjusted)}") + + +# general tests on pyComBat +def test_pycombat(): + assert np.shape(matrix) == np.shape(matrix_adjusted) + assert abs(np.mean(matrix.values)-np.mean(matrix_adjusted.values)) <= np.mean(matrix.values)*0.05 + assert np.var(matrix_adjusted.values) <= np.var(matrix.values) From 500cfaaf78fcc12dd8a01c8c1fdada521dc6880e Mon Sep 17 00:00:00 2001 From: aryo Date: Mon, 26 Jul 2021 21:16:15 +0200 Subject: [PATCH 04/12] Stylized by Black --- combat/__init__.py | 2 +- combat/pycombat.py | 120 ++++++++++--- combat/utils/__init__.py | 2 +- combat/utils/combat_utils.py | 327 ++++++++++++++++++++++++----------- combat/utils/common_utils.py | 10 +- docs/source/conf.py | 56 +++--- tests/test_combat_utils.py | 70 ++++++-- tests/test_common_utils.py | 8 +- tests/test_pycombat.py | 36 ++-- 9 files changed, 446 insertions(+), 185 deletions(-) diff --git a/combat/__init__.py b/combat/__init__.py index d1d2ad9..560be3d 100644 --- a/combat/__init__.py +++ b/combat/__init__.py @@ -1 +1 @@ -from .pycombat import PyCombat \ No newline at end of file +from .pycombat import PyCombat diff --git a/combat/pycombat.py b/combat/pycombat.py index cc12bd6..005969c 100644 --- a/combat/pycombat.py +++ b/combat/pycombat.py @@ -1,4 +1,4 @@ -#----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # Copyright (C) 2019-2020 A. Behdenna, A. Nordor, J. Haziza and A. Gema # This program is free software: you can redistribute it and/or modify @@ -19,7 +19,7 @@ # file pycombat.py # author A. Behdenna, J. Haziza, A. Gema, A. Nordor # date Sept 2020 -#----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- from __future__ import annotations @@ -68,17 +68,43 @@ def __init__(self): def check_if_fitted(self): error_message = "is not initialized, run .fit_transform() or .fit()" - - assert self.gamma_star is not None, f"gamma_star (estimated additive batch effect) {error_message}" - assert self.delta_star is not None, f"delta_star (estimated multiplicative batch effect) {error_message}" - assert self.batch_design is not None, f"batch_design (information about batches in design matrix) {error_message}" - assert self.n_batches is not None, f"n_batches (list of batches lengths ) {error_message}" - assert self.var_pooled is not None, f"var_pooled (standardised mean) {error_message}" - assert self.stand_mean is not None, f"stand_mean (Variance for each gene and each batch) {error_message}" - assert self.n_array is not None, f"n_array (total size of dataset) {error_message}" - assert self.batches is not None, f"batches (list of unique batches) {error_message}" - - def fit(self, data: pd.DataFrame, batch: list, mod: list = [], par_prior: bool = True, prior_plots: bool = False, mean_only: bool = False, ref_batch: int = None, precision: float = None) -> PyCombat: + + assert ( + self.gamma_star is not None + ), f"gamma_star (estimated additive batch effect) {error_message}" + assert ( + self.delta_star is not None + ), f"delta_star (estimated multiplicative batch effect) {error_message}" + assert ( + self.batch_design is not None + ), f"batch_design (information about batches in design matrix) {error_message}" + assert ( + self.n_batches is not None + ), f"n_batches (list of batches lengths ) {error_message}" + assert ( + self.var_pooled is not None + ), f"var_pooled (standardised mean) {error_message}" + assert ( + self.stand_mean is not None + ), f"stand_mean (Variance for each gene and each batch) {error_message}" + assert ( + self.n_array is not None + ), f"n_array (total size of dataset) {error_message}" + assert ( + self.batches is not None + ), f"batches (list of unique batches) {error_message}" + + def fit( + self, + data: pd.DataFrame, + batch: list, + mod: list = [], + par_prior: bool = True, + prior_plots: bool = False, + mean_only: bool = False, + ref_batch: int = None, + precision: float = None, + ) -> PyCombat: """ Initialized the important variables for the batch effect correction @@ -101,34 +127,59 @@ def fit(self, data: pd.DataFrame, batch: list, mod: list = [], par_prior: bool = self.list_samples = data.columns self.list_genes = data.index self.data_value = data.values - + self.ref_batch = ref_batch check_mean_only(mean_only) # Generate model matrix batchmod = model_matrix(list(batch), intercept=False, drop_first=False) - + # self.ref (List[int]): the corresponding positions of the reference batch in the batch list self.ref, batchmod = check_ref_batch(self.ref_batch, batch, batchmod) - + # self.batches (List[int]): list of unique batches # self.n_batches (List[int]): list of batches lengths # self.n_array (int): total size of dataset n_batch, self.batches, self.n_batches, self.n_array = treat_batches(batch) - + design = treat_covariates(batchmod, mod, self.ref, n_batch) - + NAs = check_NAs(self.data_value) if not NAs: - B_hat, grand_mean, self.var_pooled = calculate_mean_var(design, self.batches, self.ref, self.data_value, NAs, self.ref_batch, self.n_batches, n_batch, self.n_array) - self.stand_mean = calculate_stand_mean(grand_mean, self.n_array, design, n_batch, B_hat) - self.standardised_data = standardise_data(self.data_value, self.stand_mean, self.var_pooled, self.n_array) + B_hat, grand_mean, self.var_pooled = calculate_mean_var( + design, + self.batches, + self.ref, + self.data_value, + NAs, + self.ref_batch, + self.n_batches, + n_batch, + self.n_array, + ) + self.stand_mean = calculate_stand_mean( + grand_mean, self.n_array, design, n_batch, B_hat + ) + self.standardised_data = standardise_data( + self.data_value, self.stand_mean, self.var_pooled, self.n_array + ) # self.gamma_star (np.array): estimated additive batch effect # self.delta_star (np.array): estimated multiplicative batch effect # self.batch_design (np.array): information about batches in design matrix - self.gamma_star, self.delta_star, self.batch_design = fit_model(design, n_batch, self.standardised_data, self.batches, mean_only, par_prior, precision, self.ref_batch, self.ref, NAs) + self.gamma_star, self.delta_star, self.batch_design = fit_model( + design, + n_batch, + self.standardised_data, + self.batches, + mean_only, + par_prior, + precision, + self.ref_batch, + self.ref, + NAs, + ) else: raise ValueError("NaN value is not accepted") @@ -149,17 +200,34 @@ def transform(self, data: pd.DataFrame) -> pd.DataFrame: print("Adjusting the Data") bayes_data = np.transpose(self.standardised_data) - for i, batch_index in enumerate(self.batches): # for each batch, specific correction - bayes_data[batch_index] = (bayes_data[batch_index] - np.dot(np.transpose(self.batch_design)[batch_index], self.gamma_star)) / np.transpose(np.outer(np.sqrt(self.delta_star[i]), np.asarray([1]*self.n_batches[i]))) + for i, batch_index in enumerate( + self.batches + ): # for each batch, specific correction + bayes_data[batch_index] = ( + bayes_data[batch_index] + - np.dot(np.transpose(self.batch_design)[batch_index], self.gamma_star) + ) / np.transpose( + np.outer( + np.sqrt(self.delta_star[i]), np.asarray([1] * self.n_batches[i]) + ) + ) # renormalise the data after correction: # 1. multiply by variance # 2. add mean - bayes_data = np.multiply(np.transpose(bayes_data), np.outer(np.sqrt(self.var_pooled), np.asarray([1]*self.n_array))) + self.stand_mean + bayes_data = ( + np.multiply( + np.transpose(bayes_data), + np.outer(np.sqrt(self.var_pooled), np.asarray([1] * self.n_array)), + ) + + self.stand_mean + ) # correction for reference batch if self.ref_batch: bayes_data[self.batches[self.ref]] = self.data_value[self.batches[self.ref]] # returns the data corrected for batch effects - return pd.DataFrame(bayes_data, columns=self.list_samples, index=self.list_genes) + return pd.DataFrame( + bayes_data, columns=self.list_samples, index=self.list_genes + ) diff --git a/combat/utils/__init__.py b/combat/utils/__init__.py index d6014f4..0ee28e5 100644 --- a/combat/utils/__init__.py +++ b/combat/utils/__init__.py @@ -1,2 +1,2 @@ from .combat_utils import * -from .common_utils import * \ No newline at end of file +from .common_utils import * diff --git a/combat/utils/combat_utils.py b/combat/utils/combat_utils.py index 18b25a2..6dc6ea6 100644 --- a/combat/utils/combat_utils.py +++ b/combat/utils/combat_utils.py @@ -1,7 +1,4 @@ -from typing_extensions import ( - Literal, - Tuple -) +from typing_extensions import Literal, Tuple from typing import List, Optional import numpy as np @@ -12,7 +9,9 @@ from .common_utils import check_all_ones -def model_matrix(info: list, intercept: bool = True, drop_first: bool = True) -> np.array: +def model_matrix( + info: list, intercept: bool = True, drop_first: bool = True +) -> np.array: """ Creates the model_matrix from batch list @@ -24,20 +23,22 @@ def model_matrix(info: list, intercept: bool = True, drop_first: bool = True) -> Returns: np.array: Model matrix generate from batch list """ - if not isinstance(info[0],list) : + if not isinstance(info[0], list): info = [info] else: info = info info_dict = {} for i in range(len(info)): - info_dict[f"col{str(i)}"] = list(map(str,info[i])) + info_dict[f"col{str(i)}"] = list(map(str, info[i])) df = pd.get_dummies(pd.DataFrame(info_dict), drop_first=drop_first, dtype=float) if intercept: df["intercept"] = 1.0 return df.to_numpy() -def compute_prior(prior: Literal["a", "b"], gamma_hat: np.array, mean_only: bool) -> float: +def compute_prior( + prior: Literal["a", "b"], gamma_hat: np.array, mean_only: bool +) -> float: """ aprior and bprior are useful to compute "hyper-prior values" -> prior parameters used to estimate the prior gamma distribution for multiplicative batch effect @@ -65,15 +66,17 @@ def compute_prior(prior: Literal["a", "b"], gamma_hat: np.array, mean_only: bool m = np.mean(gamma_hat) s2 = np.var(gamma_hat) - if prior == 'a': - calculated_prior = (2*s2+m*m)/s2 - elif prior == 'b': - calculated_prior = (m*s2+m*m*m)/s2 + if prior == "a": + calculated_prior = (2 * s2 + m * m) / s2 + elif prior == "b": + calculated_prior = (m * s2 + m * m * m) / s2 return calculated_prior -def postmean(g_bar: np.array, d_star: np.array, t2_n: np.array, t2_n_g_hat: np.array) -> np.array: +def postmean( + g_bar: np.array, d_star: np.array, t2_n: np.array, t2_n_g_hat: np.array +) -> np.array: """ Estimates additive batch effect @@ -103,10 +106,20 @@ def postvar(sum2: np.array, n: float, a: float, b: float) -> np.array: Returns: np.array: matrix of estimated multiplicative batch effect """ - return(np.divide((np.multiply(0.5, sum2)+b), (np.multiply(0.5, n)+a-1))) - - -def it_sol(sdat: np.array, g_hat: np.array, d_hat: np.array, g_bar: np.array, t2: np.array, a: float, b: float, conv: float = 0.0001, exit_iteration: float = 10e5) -> np.array: + return np.divide((np.multiply(0.5, sum2) + b), (np.multiply(0.5, n) + a - 1)) + + +def it_sol( + sdat: np.array, + g_hat: np.array, + d_hat: np.array, + g_bar: np.array, + t2: np.array, + a: float, + b: float, + conv: float = 0.0001, + exit_iteration: float = 10e5, +) -> np.array: """ Iterative solution for Empirical Bayesian method @@ -132,24 +145,36 @@ def it_sol(sdat: np.array, g_hat: np.array, d_hat: np.array, g_bar: np.array, t2 d_old = np.ndarray.copy(d_hat) change = 1 count = 0 # number of steps needed (for diagnostic only) - + # convergence criteria, if new-old < conv, then stop while (change > conv) and (count < exit_iteration): - g_new = postmean(g_bar, d_old, t2_n, t2_n_g_hat) # updated additive batch effect - sum2 = np.sum(np.asarray(np.square( - sdat-np.outer(g_new[0][0], np.ones(np.ma.size(sdat, axis=1))))), axis=1) + g_new = postmean( + g_bar, d_old, t2_n, t2_n_g_hat + ) # updated additive batch effect + sum2 = np.sum( + np.asarray( + np.square( + sdat - np.outer(g_new[0][0], np.ones(np.ma.size(sdat, axis=1))) + ) + ), + axis=1, + ) d_new = postvar(sum2, n, a, b) # updated multiplicative batch effect - change = max(np.amax(np.absolute(g_new-np.asarray(g_old))/np.asarray(g_old)), np.amax( - np.absolute(d_new-d_old)/d_old)) # maximum difference between new and old estimate + change = max( + np.amax(np.absolute(g_new - np.asarray(g_old)) / np.asarray(g_old)), + np.amax(np.absolute(d_new - d_old) / d_old), + ) # maximum difference between new and old estimate g_old = np.ndarray.copy(g_new) # save value for g d_old = np.ndarray.copy(d_new) # save value for d count += 1 adjust = np.asarray([g_new, d_new]) - + return adjust -def int_eprior(sdat: np.array, g_hat: np.array, d_hat: np.array, precision: float) -> np.array: +def int_eprior( + sdat: np.array, g_hat: np.array, d_hat: np.array, precision: float +) -> np.array: """ int_eprior - Monte Carlo integration function to find nonparametric adjustments Johnson et al (Biostatistics 2007, supp.mat.) show that we can estimate the multiplicative and additive batch effects with an integral @@ -176,27 +201,31 @@ def int_eprior(sdat: np.array, g_hat: np.array, d_hat: np.array, precision: floa d = np.asarray(np.delete(np.transpose(d_hat), i)) x = np.asarray(np.transpose(sdat[i])) n = len(x) - j = [1]*n + j = [1] * n dat = np.repeat(x, len(np.transpose(g)), axis=1) - resid2 = np.square(dat-g) + resid2 = np.square(dat - g) sum2 = np.dot(np.transpose(resid2), j) - + # /begin{handling high precision computing} - temp_2d = 2*d - if (precision == None): - LH = np.power(1 / (np.pi * temp_2d), n / 2) * np.exp(np.negative(sum2) / (temp_2d)) + temp_2d = 2 * d + if precision == None: + LH = np.power(1 / (np.pi * temp_2d), n / 2) * np.exp( + np.negative(sum2) / (temp_2d) + ) else: # only if precision parameter informed # increase the precision of the computing (if negative exponential too close to 0) mp.dps = precision - buf_exp = list(map(mp.exp, np.negative(sum2)/(temp_2d))) - buf_pow = list(map(partial(mp.power, y=n/2), 1/(np.pi*temp_2d))) + buf_exp = list(map(mp.exp, np.negative(sum2) / (temp_2d))) + buf_pow = list(map(partial(mp.power, y=n / 2), 1 / (np.pi * temp_2d))) LH = buf_pow * buf_exp # likelihood # /end{handling high precision computing} LH = np.nan_to_num(LH) # corrects NaNs in likelihood if np.sum(LH) == 0 and test_approximation == 0: # correction for LH full of 0.0 test_approximation = 1 # this message won't appear again - print("###\nValues too small, approximation applied to avoid division by 0.\nPrecision mode can correct this problem, but increases computation time.\n###") + print( + "###\nValues too small, approximation applied to avoid division by 0.\nPrecision mode can correct this problem, but increases computation time.\n###" + ) LH[LH == 0] = np.exp(-745) g_star.append(np.sum(g * LH) / np.sum(LH)) d_star.append(np.sum(d * LH) / np.sum(LH)) @@ -204,11 +233,22 @@ def int_eprior(sdat: np.array, g_hat: np.array, d_hat: np.array, precision: floa g_star.append(np.sum(g * LH) / np.sum(LH)) d_star.append(np.sum(d * LH) / np.sum(LH)) adjust = np.asarray([np.asarray(g_star), np.asarray(d_star)]) - + return adjust -def param_fun(i: int, standardised_data: np.array, batches: List[list], mean_only: bool, gamma_hat: np.array, gamma_bar: np.array, delta_hat: np.array, t2: np.array, a_prior: float, b_prior: float) -> Tuple[np.array, np.array]: +def param_fun( + i: int, + standardised_data: np.array, + batches: List[list], + mean_only: bool, + gamma_hat: np.array, + gamma_bar: np.array, + delta_hat: np.array, + t2: np.array, + a_prior: float, + b_prior: float, +) -> Tuple[np.array, np.array]: """ Parametric estimation of batch effects @@ -227,21 +267,40 @@ def param_fun(i: int, standardised_data: np.array, batches: List[list], mean_onl Returns: Tuple[np.array, np.array]: estimated adjusted additive (gamma_star) and multiplicative (delta_star) batch effect """ - if mean_only: # if mean_only, no need for complex method: batch effect is immediately calculated + if ( + mean_only + ): # if mean_only, no need for complex method: batch effect is immediately calculated t2_n = np.multiply(t2[i], 1) t2_n_g_hat = np.multiply(t2_n, gamma_hat[i]) - gamma_star = postmean(gamma_bar[i], 1, t2_n, t2_n_g_hat) # additive batch effect - delta_star = [1]*len(standardised_data) # multiplicative batch effect + gamma_star = postmean( + gamma_bar[i], 1, t2_n, t2_n_g_hat + ) # additive batch effect + delta_star = [1] * len(standardised_data) # multiplicative batch effect else: # if not(mean_only) then use it_solve - temp = it_sol(np.transpose(np.transpose(standardised_data)[ - batches[i]]), gamma_hat[i], delta_hat[i], gamma_bar[i], t2[i], a_prior[i], b_prior[i]) + temp = it_sol( + np.transpose(np.transpose(standardised_data)[batches[i]]), + gamma_hat[i], + delta_hat[i], + gamma_bar[i], + t2[i], + a_prior[i], + b_prior[i], + ) gamma_star = temp[0] # additive batch effect delta_star = temp[1] # multiplicative batch effect return (gamma_star, delta_star) # FIXME: Params order is not the same -def nonparam_fun(i: int, mean_only: bool, delta_hat: np.array, standardised_data: np.array, batches: List[list], gamma_hat: np.array, precision: float) -> Tuple[np.array, np.array]: +def nonparam_fun( + i: int, + mean_only: bool, + delta_hat: np.array, + standardised_data: np.array, + batches: List[list], + gamma_hat: np.array, + precision: float, +) -> Tuple[np.array, np.array]: """ Non-parametric estimation @@ -258,14 +317,20 @@ def nonparam_fun(i: int, mean_only: bool, delta_hat: np.array, standardised_data Tuple[np.array, np.array]: estimated adjusted additive and multiplicative batch effect """ if mean_only: # if mean only, change delta_hat to vector of 1s - delta_hat[i] = [1]*len(delta_hat[i]) + delta_hat[i] = [1] * len(delta_hat[i]) # use int_eprior for non-parametric estimation - temp = int_eprior(np.transpose(np.transpose(standardised_data)[ - batches[i]]), gamma_hat[i], delta_hat[i], precision) + temp = int_eprior( + np.transpose(np.transpose(standardised_data)[batches[i]]), + gamma_hat[i], + delta_hat[i], + precision, + ) return (temp[0], temp[1]) -def check_ref_batch(ref_batch: int, batch: List[int], batchmod: np.array) -> Tuple[Optional[List[int]], np.array]: +def check_ref_batch( + ref_batch: int, batch: List[int], batchmod: np.array +) -> Tuple[Optional[List[int]], np.array]: """ check ref_batch option and treat it if needed @@ -278,13 +343,15 @@ def check_ref_batch(ref_batch: int, batch: List[int], batchmod: np.array) -> Tup ValueError: Reference level ref_batch must be one of the levels of batch Returns: - Tuple[Optional[List[int]], np.array]: + Tuple[Optional[List[int]], np.array]: - the corresponding positions of the reference batch in the batch list - updated model matrix related to batches, with reference """ if ref_batch is not None: if ref_batch not in batch: - raise ValueError("Reference level ref_batch must be one of the levels of batch.") + raise ValueError( + "Reference level ref_batch must be one of the levels of batch." + ) print(f"Using batch {str(ref_batch)} as a reference batch.") # ref keeps in memory the columns concerned by the reference batch @@ -297,7 +364,9 @@ def check_ref_batch(ref_batch: int, batch: List[int], batchmod: np.array) -> Tup return (ref, batchmod) -def treat_covariates(batchmod: np.array, mod: np.array, ref: int, n_batch: int) -> np.array: +def treat_covariates( + batchmod: np.array, mod: np.array, ref: int, n_batch: int +) -> np.array: """ treat covariates @@ -327,17 +396,27 @@ def treat_covariates(batchmod: np.array, mod: np.array, ref: int, n_batch: int) design = design[:, ~np.array(check)] design = np.transpose(design) - print(f"Adjusting for {str(len(design)-len(np.transpose(batchmod)))} covariate(s) or covariate level(s).") + print( + f"Adjusting for {str(len(design)-len(np.transpose(batchmod)))} covariate(s) or covariate level(s)." + ) # if matrix cannot be invertible, different cases if np.linalg.matrix_rank(design) < len(design): if len(design) == n_batch + 1: # case 1: covariate confunded with a batch - raise ValueError("The covariate is confunded with batch. Remove the covariate and rerun pyComBat.") - if len(design) > n_batch + 1: # case 2: multiple covariates confunded with a batch + raise ValueError( + "The covariate is confunded with batch. Remove the covariate and rerun pyComBat." + ) + if ( + len(design) > n_batch + 1 + ): # case 2: multiple covariates confunded with a batch if np.linalg.matrix_rank(np.transpose(design)[:n_batch]) < len(design): - raise ValueError("The covariates are confounded! Please remove one or more of the covariates so the design is not confounded.") + raise ValueError( + "The covariates are confounded! Please remove one or more of the covariates so the design is not confounded." + ) else: # case 3: at least a covariate confunded with a batch - raise ValueError("At least one covariate is confounded with batch. Please remove confounded covariates and rerun pyComBat") + raise ValueError( + "At least one covariate is confounded with batch. Please remove confounded covariates and rerun pyComBat" + ) return design @@ -352,7 +431,7 @@ def treat_batches(batch: list) -> Tuple[int, List[int], List[int], int]: Tuple[int, List[int], List[int], int]: - number of batches - list of unique batches - - list of batches lengths + - list of batches lengths - total size of dataset """ n_batch = len(np.unique(batch)) # number of batches @@ -364,13 +443,23 @@ def treat_batches(batch: list) -> Tuple[int, List[int], List[int], int]: batches = np.asarray(batches) n_batches = list(map(len, batches)) if 1 in n_batches: - #mean_only = True # no variance if only one sample in a batch - mean_only has to be used + # mean_only = True # no variance if only one sample in a batch - mean_only has to be used print("\nOne batch has only one sample, try setting mean_only=True.\n") n_array = sum(n_batches) return (n_batch, batches, n_batches, n_array) -def calculate_mean_var(design: np.array, batches: List[int], ref: int, dat: np.array, NAs: bool, ref_batch: int, n_batches: List[int], n_batch: int, n_array: int) -> Tuple[np.array, np.array, np.array]: +def calculate_mean_var( + design: np.array, + batches: List[int], + ref: int, + dat: np.array, + NAs: bool, + ref_batch: int, + n_batches: List[int], + n_batch: int, + n_array: int, +) -> Tuple[np.array, np.array, np.array]: """ calculates the Normalisation factors @@ -392,32 +481,43 @@ def calculate_mean_var(design: np.array, batches: List[int], ref: int, dat: np.a - Variance for each gene and each batch """ print("Standardizing Data across genes.") - if not(NAs): # NAs not supported + if not (NAs): # NAs not supported # B_hat is the vector of regression coefficients corresponding to the design matrix - B_hat = np.linalg.solve(np.dot(design, np.transpose( - design)), np.dot(design, np.transpose(dat))) + B_hat = np.linalg.solve( + np.dot(design, np.transpose(design)), np.dot(design, np.transpose(dat)) + ) # Calculates the general mean if ref_batch is not None: grand_mean = np.transpose(B_hat[ref]) else: - grand_mean = np.dot(np.transpose( - [i / n_array for i in n_batches]), B_hat[0:n_batch]) - + grand_mean = np.dot( + np.transpose([i / n_array for i in n_batches]), B_hat[0:n_batch] + ) + # Calculates the general variance if not NAs: # NAs not supported if ref_batch is not None: # depending on ref batch ref_dat = np.transpose(np.transpose(dat)[batches[ref]]) - var_pooled = np.dot(np.square(ref_dat - np.transpose(np.dot(np.transpose( - design)[batches[ref]], B_hat))), [1/n_batches[ref]]*n_batches[ref]) + var_pooled = np.dot( + np.square( + ref_dat + - np.transpose(np.dot(np.transpose(design)[batches[ref]], B_hat)) + ), + [1 / n_batches[ref]] * n_batches[ref], + ) else: - var_pooled = np.dot(np.square( - dat - np.transpose(np.dot(np.transpose(design), B_hat))), [1/n_array]*n_array) + var_pooled = np.dot( + np.square(dat - np.transpose(np.dot(np.transpose(design), B_hat))), + [1 / n_array] * n_array, + ) return (B_hat, grand_mean, var_pooled) -def calculate_stand_mean(grand_mean: np.array, n_array: int, design: np.array, n_batch: int, B_hat: np.array) -> np.array: +def calculate_stand_mean( + grand_mean: np.array, n_array: int, design: np.array, n_batch: int, B_hat: np.array +) -> np.array: """ transform the format of the mean for substraction @@ -431,17 +531,18 @@ def calculate_stand_mean(grand_mean: np.array, n_array: int, design: np.array, n Returns: np.array: standardised mean """ - stand_mean = np.dot(np.transpose(np.mat(grand_mean)), np.mat([1]*n_array)) + stand_mean = np.dot(np.transpose(np.mat(grand_mean)), np.mat([1] * n_array)) # corrects the mean with design matrix information if design is not None: tmp = np.ndarray.copy(design) tmp[0:n_batch] = 0 - stand_mean = stand_mean + \ - np.transpose(np.dot(np.transpose(tmp), B_hat)) + stand_mean = stand_mean + np.transpose(np.dot(np.transpose(tmp), B_hat)) return stand_mean -def standardise_data(dat: np.array, stand_mean: np.array, var_pooled: np.array, n_array: int) -> np.array: +def standardise_data( + dat: np.array, stand_mean: np.array, var_pooled: np.array, n_array: int +) -> np.array: """ standardise the data: substract mean and divide by variance @@ -454,12 +555,24 @@ def standardise_data(dat: np.array, stand_mean: np.array, var_pooled: np.array, Returns: np.array: standardised data matrix """ - standardised_data = (dat - stand_mean) / \ - np.dot(np.transpose(np.mat(np.sqrt(var_pooled))), np.mat([1]*n_array)) + standardised_data = (dat - stand_mean) / np.dot( + np.transpose(np.mat(np.sqrt(var_pooled))), np.mat([1] * n_array) + ) return standardised_data -def fit_model(design: np.array, n_batch: int, standardised_data: np.array, batches: List[list], mean_only: bool, par_prior: bool, precision: float, ref_batch: int, ref: List[int], NAs: bool) -> Tuple[np.array, np.array, np.array]: +def fit_model( + design: np.array, + n_batch: int, + standardised_data: np.array, + batches: List[list], + mean_only: bool, + par_prior: bool, + precision: float, + ref_batch: int, + ref: List[int], + NAs: bool, +) -> Tuple[np.array, np.array, np.array]: """ Fitting L/S model and finding priors. @@ -488,28 +601,29 @@ def fit_model(design: np.array, n_batch: int, standardised_data: np.array, batch if not NAs: # CF SUPRA FOR NAs # gamma_hat is the vector of additive batch effect - gamma_hat = np.linalg.solve(np.dot(batch_design, np.transpose(batch_design)), - np.dot(batch_design, np.transpose(standardised_data))) + gamma_hat = np.linalg.solve( + np.dot(batch_design, np.transpose(batch_design)), + np.dot(batch_design, np.transpose(standardised_data)), + ) delta_hat = [] # delta_hat is the vector of estimated multiplicative batch effect - if (mean_only): + if mean_only: # no variance if mean_only == True - delta_hat = [np.asarray([1]*len(standardised_data))] * len(batches) + delta_hat = [np.asarray([1] * len(standardised_data))] * len(batches) else: for i in batches: # feed incrementally delta_hat list_map = np.transpose(np.transpose(standardised_data)[i]).var( - axis=1) # variance for each row + axis=1 + ) # variance for each row delta_hat.append(np.squeeze(np.asarray(list_map))) gamma_bar = list(map(np.mean, gamma_hat)) # vector of means for gamma_hat t2 = list(map(np.var, gamma_hat)) # vector of variances for gamma_hat # calculates hyper priors for gamma (additive batch effect) - a_prior = list( - map(partial(compute_prior, 'a', mean_only=mean_only), delta_hat)) - b_prior = list( - map(partial(compute_prior, 'b', mean_only=mean_only), delta_hat)) + a_prior = list(map(partial(compute_prior, "a", mean_only=mean_only), delta_hat)) + b_prior = list(map(partial(compute_prior, "b", mean_only=mean_only), delta_hat)) # initialise gamma and delta for parameters estimation gamma_star = np.empty((n_batch, len(standardised_data))) @@ -518,21 +632,40 @@ def fit_model(design: np.array, n_batch: int, standardised_data: np.array, batch if par_prior: # use param_fun function for parametric adjustments (cf. function definition) print("Finding parametric adjustments.") - results = list(map(partial(param_fun, - standardised_data=standardised_data, - batches=batches, - mean_only=mean_only, - gamma_hat=gamma_hat, - gamma_bar=gamma_bar, - delta_hat=delta_hat, - t2=t2, - a_prior=a_prior, - b_prior=b_prior), range(n_batch))) + results = list( + map( + partial( + param_fun, + standardised_data=standardised_data, + batches=batches, + mean_only=mean_only, + gamma_hat=gamma_hat, + gamma_bar=gamma_bar, + delta_hat=delta_hat, + t2=t2, + a_prior=a_prior, + b_prior=b_prior, + ), + range(n_batch), + ) + ) else: # use nonparam_fun for non-parametric adjustments (cf. function definition) print("Finding nonparametric adjustments") - results = list(map(partial(nonparam_fun, mean_only=mean_only, delta_hat=delta_hat, - standardised_data=standardised_data, batches=batches, gamma_hat=gamma_hat, precision=precision), range(n_batch))) + results = list( + map( + partial( + nonparam_fun, + mean_only=mean_only, + delta_hat=delta_hat, + standardised_data=standardised_data, + batches=batches, + gamma_hat=gamma_hat, + precision=precision, + ), + range(n_batch), + ) + ) for i in range(n_batch): # store the results in gamma/delta_star results_i = results[i] @@ -544,4 +677,4 @@ def fit_model(design: np.array, n_batch: int, standardised_data: np.array, batch gamma_star[ref] = [0] * len_gamma_star_ref delta_star[ref] = [1] * len_gamma_star_ref - return (gamma_star, delta_star, batch_design) \ No newline at end of file + return (gamma_star, delta_star, batch_design) diff --git a/combat/utils/common_utils.py b/combat/utils/common_utils.py index a90f997..d1905fc 100644 --- a/combat/utils/common_utils.py +++ b/combat/utils/common_utils.py @@ -1,7 +1,7 @@ import numpy as np -def check_all_ones(list_of_elements:list) -> bool: +def check_all_ones(list_of_elements: list) -> bool: """ Check if all elements in a list are 1s @@ -17,7 +17,7 @@ def check_all_ones(list_of_elements:list) -> bool: def check_mean_only(mean_only: bool) -> None: """ Check mean_only option - + Args: mean_only (bool): user's choice regarding the mean_only option """ @@ -37,5 +37,7 @@ def check_NAs(data): # NAs = True in (np.isnan(dat)) NAs = np.isnan(np.sum(data)) # Check if NaN exists if NAs: - print("Found missing data values. Please remove all missing values before proceeding with pyComBat.") - return NAs \ No newline at end of file + print( + "Found missing data values. Please remove all missing values before proceeding with pyComBat." + ) + return NAs diff --git a/docs/source/conf.py b/docs/source/conf.py index 7fe46f6..324c180 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -12,20 +12,21 @@ # import os import sys -sys.path.insert(0, os.path.abspath('../../ComBat')) + +sys.path.insert(0, os.path.abspath("../../ComBat")) sys.setrecursionlimit(1500) # -- Project information ----------------------------------------------------- -project = 'pyComBat' -copyright = '2020, Abdelkader Behdenna' -author = 'Abdelkader Behdenna' +project = "pyComBat" +copyright = "2020, Abdelkader Behdenna" +author = "Abdelkader Behdenna" # The short version -version = '' +version = "" # The full version, including alpha/beta/rc tags -release = '0.4.4' +release = "0.4.4" # -- General configuration --------------------------------------------------- @@ -39,26 +40,27 @@ # ones. import sphinx_rtd_theme -extensions = ['sphinx.ext.autodoc', - 'sphinx.ext.intersphinx', - 'sphinx.ext.ifconfig', - 'sphinx.ext.viewcode', - 'sphinx.ext.githubpages', - 'rinoh.frontend.sphinx', +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "sphinx.ext.ifconfig", + "sphinx.ext.viewcode", + "sphinx.ext.githubpages", + "rinoh.frontend.sphinx", "sphinx_rtd_theme", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document. -master_doc = 'index' +master_doc = "index" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -66,48 +68,44 @@ exclude_patterns = [] # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -#html_theme = 'alabaster' +# html_theme = 'alabaster' html_theme = "sphinx_rtd_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # -- LaTeX elements ---------------------------------------------------------- latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # - 'papersize': 'letterpaper', - + "papersize": "letterpaper", # The font size ('10pt', '11pt' or '12pt'). # - 'pointsize': '10pt', - + "pointsize": "10pt", # Additional stuff for the LaTeX preamble. # - 'preamble': '', - + "preamble": "", # Latex figure (float) alignment # - 'figure_align': 'htbp', - 'extraclassoptions': 'openany,oneside', + "figure_align": "htbp", + "extraclassoptions": "openany,oneside", } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'pyComBat.tex', 'pyComBat', - 'Abdelkader Behdenna', 'manual'), + (master_doc, "pyComBat.tex", "pyComBat", "Abdelkader Behdenna", "manual"), ] # -- Extension configuration ------------------------------------------------- @@ -115,4 +113,4 @@ # -- Options for intersphinx extension --------------------------------------- # Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {'https://docs.python.org/': None} \ No newline at end of file +intersphinx_mapping = {"https://docs.python.org/": None} diff --git a/tests/test_combat_utils.py b/tests/test_combat_utils.py index 1cd8506..561ce19 100644 --- a/tests/test_combat_utils.py +++ b/tests/test_combat_utils.py @@ -3,18 +3,46 @@ import numpy as np import pandas as pd -from combat.utils.combat_utils import model_matrix, compute_prior, postmean, postvar, it_sol, int_eprior, check_ref_batch, treat_batches, treat_covariates, calculate_mean_var, calculate_stand_mean, standardise_data, fit_model +from combat.utils.combat_utils import ( + model_matrix, + compute_prior, + postmean, + postvar, + it_sol, + int_eprior, + check_ref_batch, + treat_batches, + treat_covariates, + calculate_mean_var, + calculate_stand_mean, + standardise_data, + fit_model, +) from combat.utils.common_utils import check_NAs """ Load the shared variables """ batch = np.asarray([1, 1, 1, 2, 2, 3, 3, 3, 3]) -matrix = np.transpose([np.random.normal(size=1000, loc=3, scale=1), np.random.normal(size=1000, loc=3, scale=1), np.random.normal(size=1000, loc=3, scale=1), - np.random.normal(size=1000, loc=2, scale=0.6), np.random.normal(size=1000, loc=2, scale=0.6), - np.random.normal(size=1000, loc=4, scale=1), np.random.normal(size=1000, loc=4, scale=1), np.random.normal(size=1000, loc=4, scale=1), np.random.normal(size=1000, loc=4, scale=1)]) - -matrix = pd.DataFrame(data=matrix,columns=["sample_"+str(i+1) for i in range(9)],index=["gene_"+str(i+1) for i in range(1000)]) +matrix = np.transpose( + [ + np.random.normal(size=1000, loc=3, scale=1), + np.random.normal(size=1000, loc=3, scale=1), + np.random.normal(size=1000, loc=3, scale=1), + np.random.normal(size=1000, loc=2, scale=0.6), + np.random.normal(size=1000, loc=2, scale=0.6), + np.random.normal(size=1000, loc=4, scale=1), + np.random.normal(size=1000, loc=4, scale=1), + np.random.normal(size=1000, loc=4, scale=1), + np.random.normal(size=1000, loc=4, scale=1), + ] +) + +matrix = pd.DataFrame( + data=matrix, + columns=["sample_" + str(i + 1) for i in range(9)], + index=["gene_" + str(i + 1) for i in range(1000)], +) ref_batch = None mean_only = False par_prior = False @@ -22,14 +50,27 @@ mod = [] dat = matrix.values batchmod = model_matrix(list(batch), intercept=False, drop_first=False) -ref,batchmod = check_ref_batch(ref_batch,batch,batchmod) +ref, batchmod = check_ref_batch(ref_batch, batch, batchmod) n_batch, batches, n_batches, n_array = treat_batches(batch) design = treat_covariates(batchmod, mod, ref, n_batch) NAs = check_NAs(dat) -B_hat, grand_mean, var_pooled = calculate_mean_var(design, batches, ref, dat, NAs, ref_batch, n_batches, n_batch, n_array) -stand_mean = calculate_stand_mean(grand_mean, n_array, design, n_batch,B_hat) +B_hat, grand_mean, var_pooled = calculate_mean_var( + design, batches, ref, dat, NAs, ref_batch, n_batches, n_batch, n_array +) +stand_mean = calculate_stand_mean(grand_mean, n_array, design, n_batch, B_hat) s_data = standardise_data(dat, stand_mean, var_pooled, n_array) -gamma_star, delta_star, batch_design = fit_model(design,n_batch,s_data, batches, mean_only, par_prior, precision, ref_batch, ref, NAs) +gamma_star, delta_star, batch_design = fit_model( + design, + n_batch, + s_data, + batches, + mean_only, + par_prior, + precision, + ref_batch, + ref, + NAs, +) # test for compute_prior @@ -47,7 +88,6 @@ def test_model_matrix(): assert list(model_matrix_test[0]) == [1.0, 1.0] - # test for compute_prior def test_compute_prior(): print("aprior", compute_prior("a", gamma_star, False)) @@ -58,7 +98,9 @@ def test_compute_prior(): # test for postmean def test_postmean(): - assert np.shape(postmean(gamma_star, delta_star, gamma_star, delta_star)) == np.shape(gamma_star) + assert np.shape( + postmean(gamma_star, delta_star, gamma_star, delta_star) + ) == np.shape(gamma_star) # test for postvar @@ -91,7 +133,7 @@ def test_check_ref_batch(): assert check_ref_batch(None, batch, batchmod) == (None, batchmod) -# test for treat_batches +# test for treat_batches def test_treat_batches(): assert n_batch == 3 assert batches[0].tolist() == [0, 1, 2] @@ -99,7 +141,7 @@ def test_treat_batches(): assert batches[2].tolist() == [5, 6, 7, 8] assert n_batches == [3, 2, 4] assert n_array == 9 - + # test for treat_covariates def test_treat_covariates(): diff --git a/tests/test_common_utils.py b/tests/test_common_utils.py index 4cf4eee..376a7ab 100644 --- a/tests/test_common_utils.py +++ b/tests/test_common_utils.py @@ -13,7 +13,9 @@ def test_check_all_ones(): assert check_all_ones(np.array([1, 1, 1, 1, 0])) == False assert check_all_ones(np.array([0, 0, 0, 0, 0])) == False - assert check_all_ones(np.array([1.5, 0.5, 1, 1, 1])) == False # This test to show the limit of the method we use + assert ( + check_all_ones(np.array([1.5, 0.5, 1, 1, 1])) == False + ) # This test to show the limit of the method we use # test for check_mean_only @@ -25,5 +27,5 @@ def test_check_mean_only(): # test for check_NAs def test_check_NAs(): - assert check_NAs([0,1,2]) == False - assert check_NAs([0,np.nan,2]) == True \ No newline at end of file + assert check_NAs([0, 1, 2]) == False + assert check_NAs([0, np.nan, 2]) == True diff --git a/tests/test_pycombat.py b/tests/test_pycombat.py index 1cf6981..fe238e8 100644 --- a/tests/test_pycombat.py +++ b/tests/test_pycombat.py @@ -1,4 +1,4 @@ -#----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # Copyright (C) 2019-2020 A. Behdenna, A. Nordor, J. Haziza and A. Gema # This program is free software: you can redistribute it and/or modify @@ -19,9 +19,7 @@ # file pycombat.py # author A. Behdenna, J. Haziza, A. Gema, A. Nordor # date Sept 2020 -#----------------------------------------------------------------------------- - - +# ----------------------------------------------------------------------------- # this file is only used for unit testing @@ -42,11 +40,25 @@ # Define constants for unit testing batch = np.asarray([1, 1, 1, 2, 2, 3, 3, 3, 3]) -matrix = np.transpose([np.random.normal(size=1000, loc=3, scale=1), np.random.normal(size=1000, loc=3, scale=1), np.random.normal(size=1000, loc=3, scale=1), - np.random.normal(size=1000, loc=2, scale=0.6), np.random.normal(size=1000, loc=2, scale=0.6), - np.random.normal(size=1000, loc=4, scale=1), np.random.normal(size=1000, loc=4, scale=1), np.random.normal(size=1000, loc=4, scale=1), np.random.normal(size=1000, loc=4, scale=1)]) - -matrix = pd.DataFrame(data=matrix,columns=["sample_"+str(i+1) for i in range(9)],index=["gene_"+str(i+1) for i in range(1000)]) +matrix = np.transpose( + [ + np.random.normal(size=1000, loc=3, scale=1), + np.random.normal(size=1000, loc=3, scale=1), + np.random.normal(size=1000, loc=3, scale=1), + np.random.normal(size=1000, loc=2, scale=0.6), + np.random.normal(size=1000, loc=2, scale=0.6), + np.random.normal(size=1000, loc=4, scale=1), + np.random.normal(size=1000, loc=4, scale=1), + np.random.normal(size=1000, loc=4, scale=1), + np.random.normal(size=1000, loc=4, scale=1), + ] +) + +matrix = pd.DataFrame( + data=matrix, + columns=["sample_" + str(i + 1) for i in range(9)], + index=["gene_" + str(i + 1) for i in range(1000)], +) print("Matrix and batch generated.") @@ -57,6 +69,7 @@ ########## # local tests before unit testing + def test_means(): print(f"mean matrix: {np.mean(matrix)}") print(f"mean matrix (adjusted): {np.mean(matrix_adjusted)}") @@ -68,5 +81,8 @@ def test_means(): # general tests on pyComBat def test_pycombat(): assert np.shape(matrix) == np.shape(matrix_adjusted) - assert abs(np.mean(matrix.values)-np.mean(matrix_adjusted.values)) <= np.mean(matrix.values)*0.05 + assert ( + abs(np.mean(matrix.values) - np.mean(matrix_adjusted.values)) + <= np.mean(matrix.values) * 0.05 + ) assert np.var(matrix_adjusted.values) <= np.var(matrix.values) From 1a3f82f9e65a044b5884ccc88aa35b93df73f9dc Mon Sep 17 00:00:00 2001 From: aryo Date: Thu, 29 Jul 2021 15:39:34 +0200 Subject: [PATCH 05/12] add code coverage to poetry --- poetry.lock | 117 ++++++++++++++++++++++++++++++++++++------------- pyproject.toml | 1 + 2 files changed, 87 insertions(+), 31 deletions(-) diff --git a/poetry.lock b/poetry.lock index a9f856a..e69ddf7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -72,6 +72,17 @@ category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "coverage" +version = "5.5" +description = "Code coverage measurement for Python" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" + +[package.extras] +toml = ["toml"] + [[package]] name = "importlib-metadata" version = "1.7.0" @@ -192,28 +203,6 @@ importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} [package.extras] dev = ["pre-commit", "tox"] -[[package]] -name = "poetry-core" -version = "1.0.3" -description = "Poetry PEP 517 Build Backend" -category = "dev" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" - -[package.dependencies] -importlib-metadata = {version = ">=1.7.0,<2.0.0", markers = "python_version >= \"2.7\" and python_version < \"2.8\" or python_version >= \"3.5\" and python_version < \"3.8\""} - -[[package]] -name = "poetry2setup" -version = "1.0.0" -description = "Convert python-poetry to setup.py" -category = "dev" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" - -[package.dependencies] -poetry-core = ">=1.0.0,<2.0.0" - [[package]] name = "py" version = "1.10.0" @@ -252,6 +241,22 @@ toml = "*" [package.extras] testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] +[[package]] +name = "pytest-cov" +version = "2.12.1" +description = "Pytest plugin for measuring coverage." +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[package.dependencies] +coverage = ">=5.2.1" +pytest = ">=4.6" +toml = "*" + +[package.extras] +testing = ["fields", "hunter", "process-tests", "six", "pytest-xdist", "virtualenv"] + [[package]] name = "python-dateutil" version = "2.8.2" @@ -370,7 +375,7 @@ testing = ["pytest (>=4.6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytes [metadata] lock-version = "1.1" python-versions = "^3.7" -content-hash = "b8858bb42035be5883fcf2dc212b086044f13dbf3cee658b65c04f580aa7637c" +content-hash = "462ae7d5a53adc665574ba480401a2d692d281ec83e2f34f50a030e959ec6506" [metadata.files] appdirs = [ @@ -397,6 +402,60 @@ colorama = [ {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, ] +coverage = [ + {file = "coverage-5.5-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:b6d534e4b2ab35c9f93f46229363e17f63c53ad01330df9f2d6bd1187e5eaacf"}, + {file = "coverage-5.5-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:b7895207b4c843c76a25ab8c1e866261bcfe27bfaa20c192de5190121770672b"}, + {file = "coverage-5.5-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:c2723d347ab06e7ddad1a58b2a821218239249a9e4365eaff6649d31180c1669"}, + {file = "coverage-5.5-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:900fbf7759501bc7807fd6638c947d7a831fc9fdf742dc10f02956ff7220fa90"}, + {file = "coverage-5.5-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:004d1880bed2d97151facef49f08e255a20ceb6f9432df75f4eef018fdd5a78c"}, + {file = "coverage-5.5-cp27-cp27m-win32.whl", hash = "sha256:06191eb60f8d8a5bc046f3799f8a07a2d7aefb9504b0209aff0b47298333302a"}, + {file = "coverage-5.5-cp27-cp27m-win_amd64.whl", hash = "sha256:7501140f755b725495941b43347ba8a2777407fc7f250d4f5a7d2a1050ba8e82"}, + {file = "coverage-5.5-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:372da284cfd642d8e08ef606917846fa2ee350f64994bebfbd3afb0040436905"}, + {file = "coverage-5.5-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:8963a499849a1fc54b35b1c9f162f4108017b2e6db2c46c1bed93a72262ed083"}, + {file = "coverage-5.5-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:869a64f53488f40fa5b5b9dcb9e9b2962a66a87dab37790f3fcfb5144b996ef5"}, + {file = "coverage-5.5-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:4a7697d8cb0f27399b0e393c0b90f0f1e40c82023ea4d45d22bce7032a5d7b81"}, + {file = "coverage-5.5-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:8d0a0725ad7c1a0bcd8d1b437e191107d457e2ec1084b9f190630a4fb1af78e6"}, + {file = "coverage-5.5-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:51cb9476a3987c8967ebab3f0fe144819781fca264f57f89760037a2ea191cb0"}, + {file = "coverage-5.5-cp310-cp310-win_amd64.whl", hash = "sha256:c0891a6a97b09c1f3e073a890514d5012eb256845c451bd48f7968ef939bf4ae"}, + {file = "coverage-5.5-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:3487286bc29a5aa4b93a072e9592f22254291ce96a9fbc5251f566b6b7343cdb"}, + {file = "coverage-5.5-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:deee1077aae10d8fa88cb02c845cfba9b62c55e1183f52f6ae6a2df6a2187160"}, + {file = "coverage-5.5-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:f11642dddbb0253cc8853254301b51390ba0081750a8ac03f20ea8103f0c56b6"}, + {file = "coverage-5.5-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:6c90e11318f0d3c436a42409f2749ee1a115cd8b067d7f14c148f1ce5574d701"}, + {file = "coverage-5.5-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:30c77c1dc9f253283e34c27935fded5015f7d1abe83bc7821680ac444eaf7793"}, + {file = "coverage-5.5-cp35-cp35m-win32.whl", hash = "sha256:9a1ef3b66e38ef8618ce5fdc7bea3d9f45f3624e2a66295eea5e57966c85909e"}, + {file = "coverage-5.5-cp35-cp35m-win_amd64.whl", hash = "sha256:972c85d205b51e30e59525694670de6a8a89691186012535f9d7dbaa230e42c3"}, + {file = "coverage-5.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:af0e781009aaf59e25c5a678122391cb0f345ac0ec272c7961dc5455e1c40066"}, + {file = "coverage-5.5-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:74d881fc777ebb11c63736622b60cb9e4aee5cace591ce274fb69e582a12a61a"}, + {file = "coverage-5.5-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:92b017ce34b68a7d67bd6d117e6d443a9bf63a2ecf8567bb3d8c6c7bc5014465"}, + {file = "coverage-5.5-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:d636598c8305e1f90b439dbf4f66437de4a5e3c31fdf47ad29542478c8508bbb"}, + {file = "coverage-5.5-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:41179b8a845742d1eb60449bdb2992196e211341818565abded11cfa90efb821"}, + {file = "coverage-5.5-cp36-cp36m-win32.whl", hash = "sha256:040af6c32813fa3eae5305d53f18875bedd079960822ef8ec067a66dd8afcd45"}, + {file = "coverage-5.5-cp36-cp36m-win_amd64.whl", hash = "sha256:5fec2d43a2cc6965edc0bb9e83e1e4b557f76f843a77a2496cbe719583ce8184"}, + {file = "coverage-5.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:18ba8bbede96a2c3dde7b868de9dcbd55670690af0988713f0603f037848418a"}, + {file = "coverage-5.5-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:2910f4d36a6a9b4214bb7038d537f015346f413a975d57ca6b43bf23d6563b53"}, + {file = "coverage-5.5-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:f0b278ce10936db1a37e6954e15a3730bea96a0997c26d7fee88e6c396c2086d"}, + {file = "coverage-5.5-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:796c9c3c79747146ebd278dbe1e5c5c05dd6b10cc3bcb8389dfdf844f3ead638"}, + {file = "coverage-5.5-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:53194af30d5bad77fcba80e23a1441c71abfb3e01192034f8246e0d8f99528f3"}, + {file = "coverage-5.5-cp37-cp37m-win32.whl", hash = "sha256:184a47bbe0aa6400ed2d41d8e9ed868b8205046518c52464fde713ea06e3a74a"}, + {file = "coverage-5.5-cp37-cp37m-win_amd64.whl", hash = "sha256:2949cad1c5208b8298d5686d5a85b66aae46d73eec2c3e08c817dd3513e5848a"}, + {file = "coverage-5.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:217658ec7187497e3f3ebd901afdca1af062b42cfe3e0dafea4cced3983739f6"}, + {file = "coverage-5.5-cp38-cp38-manylinux1_i686.whl", hash = "sha256:1aa846f56c3d49205c952d8318e76ccc2ae23303351d9270ab220004c580cfe2"}, + {file = "coverage-5.5-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:24d4a7de75446be83244eabbff746d66b9240ae020ced65d060815fac3423759"}, + {file = "coverage-5.5-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:d1f8bf7b90ba55699b3a5e44930e93ff0189aa27186e96071fac7dd0d06a1873"}, + {file = "coverage-5.5-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:970284a88b99673ccb2e4e334cfb38a10aab7cd44f7457564d11898a74b62d0a"}, + {file = "coverage-5.5-cp38-cp38-win32.whl", hash = "sha256:01d84219b5cdbfc8122223b39a954820929497a1cb1422824bb86b07b74594b6"}, + {file = "coverage-5.5-cp38-cp38-win_amd64.whl", hash = "sha256:2e0d881ad471768bf6e6c2bf905d183543f10098e3b3640fc029509530091502"}, + {file = "coverage-5.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d1f9ce122f83b2305592c11d64f181b87153fc2c2bbd3bb4a3dde8303cfb1a6b"}, + {file = "coverage-5.5-cp39-cp39-manylinux1_i686.whl", hash = "sha256:13c4ee887eca0f4c5a247b75398d4114c37882658300e153113dafb1d76de529"}, + {file = "coverage-5.5-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:52596d3d0e8bdf3af43db3e9ba8dcdaac724ba7b5ca3f6358529d56f7a166f8b"}, + {file = "coverage-5.5-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:2cafbbb3af0733db200c9b5f798d18953b1a304d3f86a938367de1567f4b5bff"}, + {file = "coverage-5.5-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:44d654437b8ddd9eee7d1eaee28b7219bec228520ff809af170488fd2fed3e2b"}, + {file = "coverage-5.5-cp39-cp39-win32.whl", hash = "sha256:d314ed732c25d29775e84a960c3c60808b682c08d86602ec2c3008e1202e3bb6"}, + {file = "coverage-5.5-cp39-cp39-win_amd64.whl", hash = "sha256:13034c4409db851670bc9acd836243aeee299949bd5673e11844befcb0149f03"}, + {file = "coverage-5.5-pp36-none-any.whl", hash = "sha256:f030f8873312a16414c0d8e1a1ddff2d3235655a2174e3648b4fa66b3f2f1079"}, + {file = "coverage-5.5-pp37-none-any.whl", hash = "sha256:2a3859cb82dcbda1cfd3e6f71c27081d18aa251d20a17d87d26d4cd216fb0af4"}, + {file = "coverage-5.5.tar.gz", hash = "sha256:ebe78fe9a0e874362175b02371bdfbee64d8edc42a044253ddf4ee7d3c15212c"}, +] importlib-metadata = [ {file = "importlib_metadata-1.7.0-py2.py3-none-any.whl", hash = "sha256:dc15b2969b4ce36305c51eebe62d418ac7791e9a157911d58bfb1f9ccd8e2070"}, {file = "importlib_metadata-1.7.0.tar.gz", hash = "sha256:90bb658cdbbf6d1735b6341ce708fc7024a3e14e99ffdc5783edea9f9b077f83"}, @@ -489,14 +548,6 @@ pluggy = [ {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, ] -poetry-core = [ - {file = "poetry-core-1.0.3.tar.gz", hash = "sha256:2315c928249fc3207801a81868b64c66273077b26c8d8da465dccf8f488c90c5"}, - {file = "poetry_core-1.0.3-py2.py3-none-any.whl", hash = "sha256:c6bde46251112de8384013e1ab8d66e7323d2c75172f80220aba2bc07e208e9a"}, -] -poetry2setup = [ - {file = "poetry2setup-1.0.0-py2.py3-none-any.whl", hash = "sha256:b69a4efabfda24870fd8d08b37b15fbbee5eec0b62711feaef610f94835517be"}, - {file = "poetry2setup-1.0.0.tar.gz", hash = "sha256:ef1177303996b661eeec3de5027a1af84e106a1d2cb92c73152fe6ce47700cc3"}, -] py = [ {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"}, {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"}, @@ -509,6 +560,10 @@ pytest = [ {file = "pytest-6.2.4-py3-none-any.whl", hash = "sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890"}, {file = "pytest-6.2.4.tar.gz", hash = "sha256:50bcad0a0b9c5a72c8e4e7c9855a3ad496ca6a881a3641b4260605450772c54b"}, ] +pytest-cov = [ + {file = "pytest-cov-2.12.1.tar.gz", hash = "sha256:261ceeb8c227b726249b376b8526b600f38667ee314f910353fa318caa01f4d7"}, + {file = "pytest_cov-2.12.1-py2.py3-none-any.whl", hash = "sha256:261bb9e47e65bd099c89c3edf92972865210c36813f80ede5277dceb77a4a62a"}, +] python-dateutil = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, diff --git a/pyproject.toml b/pyproject.toml index ac2410f..b4fcbb3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ typing-extensions = "^3.10.0" [tool.poetry.dev-dependencies] pytest = "^6.2.4" black = "^21.5b1" +pytest-cov = "^2.12.1" [build-system] requires = ["poetry-core>=1.0.0"] From b08f2978f7297964e95ebce763d1776660bce11f Mon Sep 17 00:00:00 2001 From: aryo Date: Tue, 3 Aug 2021 12:09:25 +0200 Subject: [PATCH 06/12] FIxed duplicated documentation --- combat/utils/combat_utils.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/combat/utils/combat_utils.py b/combat/utils/combat_utils.py index 6dc6ea6..265de07 100644 --- a/combat/utils/combat_utils.py +++ b/combat/utils/combat_utils.py @@ -44,14 +44,6 @@ def compute_prior( -> prior parameters used to estimate the prior gamma distribution for multiplicative batch effect aprior - calculates empirical hyper-prior values - Arguments: - prior {char} -- 'a' or 'b' depending of the prior to be calculated - gamma_hat {matrix} -- matrix of additive batch effect - mean_only {bool} -- True iff mean_only selected - - Returns: - float -- [the prior calculated (aprior or bprior) - Args: prior (str): 'a' or 'b' depending of the prior to be calculated gamma_hat (np.array): [description] From 83567f759d0c9e0bf5008c68b1be775495ecc9fe Mon Sep 17 00:00:00 2001 From: aryo Date: Wed, 29 Sep 2021 16:00:38 +0200 Subject: [PATCH 07/12] re-add pycombat method as a wrapper of fit_transform --- combat/__init__.py | 1 + combat/pycombat.py | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/combat/__init__.py b/combat/__init__.py index 560be3d..0aabe1f 100644 --- a/combat/__init__.py +++ b/combat/__init__.py @@ -1 +1,2 @@ from .pycombat import PyCombat +from .pycombat import pycombat \ No newline at end of file diff --git a/combat/pycombat.py b/combat/pycombat.py index 005969c..e84ef0d 100644 --- a/combat/pycombat.py +++ b/combat/pycombat.py @@ -231,3 +231,26 @@ def transform(self, data: pd.DataFrame) -> pd.DataFrame: return pd.DataFrame( bayes_data, columns=self.list_samples, index=self.list_genes ) + + +def pycombat( + data: pd.DataFrame, + batch: list, + mod: list = [], + par_prior: bool = True, + prior_plots: bool = False, + mean_only: bool = False, + ref_batch: int = None, + precision: float = None, + **kwargs +): + return PyCombat().fit_transform( + data, + batch, + mod=mod, + par_prior=par_prior, + prior_plots=prior_plots, + mean_only=mean_only, + ref_batch=ref_batch, + precision=precision + ) \ No newline at end of file From e12279661bbea4991d56731613b072417d1d1492 Mon Sep 17 00:00:00 2001 From: aryo Date: Wed, 29 Sep 2021 16:01:15 +0200 Subject: [PATCH 08/12] add sys.path configuration to unit tests scripts --- tests/test_combat_utils.py | 5 +++++ tests/test_common_utils.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/tests/test_combat_utils.py b/tests/test_combat_utils.py index 561ce19..2b09e07 100644 --- a/tests/test_combat_utils.py +++ b/tests/test_combat_utils.py @@ -1,3 +1,8 @@ +import os +import sys + +sys.path.append(os.getcwd()) + import pytest import numpy as np diff --git a/tests/test_common_utils.py b/tests/test_common_utils.py index 376a7ab..c028156 100644 --- a/tests/test_common_utils.py +++ b/tests/test_common_utils.py @@ -1,3 +1,8 @@ +import os +import sys + +sys.path.append(os.getcwd()) + import pytest import numpy as np From c3dab7f3da242c08b1e272c1703b05e3fb126174 Mon Sep 17 00:00:00 2001 From: aryo Date: Wed, 29 Sep 2021 16:02:09 +0200 Subject: [PATCH 09/12] separate unit tests for pycombat method-based and object-based --- tests/test_pycombat_method.py | 93 +++++++++++++++++++ ...st_pycombat.py => test_pycombat_object.py} | 5 + 2 files changed, 98 insertions(+) create mode 100644 tests/test_pycombat_method.py rename tests/{test_pycombat.py => test_pycombat_object.py} (98%) diff --git a/tests/test_pycombat_method.py b/tests/test_pycombat_method.py new file mode 100644 index 0000000..b998a1a --- /dev/null +++ b/tests/test_pycombat_method.py @@ -0,0 +1,93 @@ +# ----------------------------------------------------------------------------- +# Copyright (C) 2019-2020 A. Behdenna, A. Nordor, J. Haziza and A. Gema + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# For more information, please contact Abdelkader Behdenna / + +# file pycombat.py +# author A. Behdenna, J. Haziza, A. Gema, A. Nordor +# date Sept 2020 +# ----------------------------------------------------------------------------- + + +# this file is only used for unit testing +# We import the function that will be tested one by one, incrementally +# Generates a report about the functions tested + +import os +import sys + +sys.path.append(os.getcwd()) + +import numpy as np +import pandas as pd + +########## +# import function used for unit testing +from combat import pycombat + +########## +print("\n#### Unit Testing for pyComBat ####\n") + +########## +# Define constants for unit testing + +batch = np.asarray([1, 1, 1, 2, 2, 3, 3, 3, 3]) +matrix = np.transpose( + [ + np.random.normal(size=1000, loc=3, scale=1), + np.random.normal(size=1000, loc=3, scale=1), + np.random.normal(size=1000, loc=3, scale=1), + np.random.normal(size=1000, loc=2, scale=0.6), + np.random.normal(size=1000, loc=2, scale=0.6), + np.random.normal(size=1000, loc=4, scale=1), + np.random.normal(size=1000, loc=4, scale=1), + np.random.normal(size=1000, loc=4, scale=1), + np.random.normal(size=1000, loc=4, scale=1), + ] +) + +matrix = pd.DataFrame( + data=matrix, + columns=["sample_" + str(i + 1) for i in range(9)], + index=["gene_" + str(i + 1) for i in range(1000)], +) + +print("Matrix and batch generated.") + +matrix_adjusted = pycombat(matrix, batch) + +print("Adjusted matrix generated.") + +########## +# local tests before unit testing + + +def test_means(): + print(f"mean matrix: {np.mean(matrix)}") + print(f"mean matrix (adjusted): {np.mean(matrix_adjusted)}") + print("**********") + print(f"var matrix: {np.var(matrix)}") + print(f"var matrix (adjusted): {np.var(matrix_adjusted)}") + + +# general tests on pyComBat +def test_pycombat(): + assert np.shape(matrix) == np.shape(matrix_adjusted) + assert ( + abs(np.mean(matrix.values) - np.mean(matrix_adjusted.values)) + <= np.mean(matrix.values) * 0.05 + ) + assert np.var(matrix_adjusted.values) <= np.var(matrix.values) diff --git a/tests/test_pycombat.py b/tests/test_pycombat_object.py similarity index 98% rename from tests/test_pycombat.py rename to tests/test_pycombat_object.py index fe238e8..e95f78b 100644 --- a/tests/test_pycombat.py +++ b/tests/test_pycombat_object.py @@ -26,6 +26,11 @@ # We import the function that will be tested one by one, incrementally # Generates a report about the functions tested +import os +import sys + +sys.path.append(os.getcwd()) + import numpy as np import pandas as pd From c8a7b0f2574ae991c206cad84df06ce71545ef24 Mon Sep 17 00:00:00 2001 From: aryo Date: Wed, 29 Sep 2021 16:13:33 +0200 Subject: [PATCH 10/12] fix minor styling issue --- combat/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/combat/__init__.py b/combat/__init__.py index 0aabe1f..5d29a8c 100644 --- a/combat/__init__.py +++ b/combat/__init__.py @@ -1,2 +1 @@ -from .pycombat import PyCombat -from .pycombat import pycombat \ No newline at end of file +from .pycombat import PyCombat, pycombat \ No newline at end of file From 5a873ab2464fe3267895a62b3a351a7bd372eab9 Mon Sep 17 00:00:00 2001 From: aryo Date: Wed, 29 Sep 2021 16:14:03 +0200 Subject: [PATCH 11/12] Black reformatting --- combat/__init__.py | 2 +- combat/pycombat.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/combat/__init__.py b/combat/__init__.py index 5d29a8c..f4bbe06 100644 --- a/combat/__init__.py +++ b/combat/__init__.py @@ -1 +1 @@ -from .pycombat import PyCombat, pycombat \ No newline at end of file +from .pycombat import PyCombat, pycombat diff --git a/combat/pycombat.py b/combat/pycombat.py index e84ef0d..0c2dcad 100644 --- a/combat/pycombat.py +++ b/combat/pycombat.py @@ -242,7 +242,7 @@ def pycombat( mean_only: bool = False, ref_batch: int = None, precision: float = None, - **kwargs + **kwargs, ): return PyCombat().fit_transform( data, @@ -252,5 +252,5 @@ def pycombat( prior_plots=prior_plots, mean_only=mean_only, ref_batch=ref_batch, - precision=precision - ) \ No newline at end of file + precision=precision, + ) From c6a697a533e62c385b9bdd0f35b76c621035f776 Mon Sep 17 00:00:00 2001 From: aryo Date: Wed, 29 Sep 2021 18:17:36 +0200 Subject: [PATCH 12/12] Replicating changes from PR #18 --- combat/utils/combat_utils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/combat/utils/combat_utils.py b/combat/utils/combat_utils.py index 265de07..cab48f6 100644 --- a/combat/utils/combat_utils.py +++ b/combat/utils/combat_utils.py @@ -208,16 +208,18 @@ def int_eprior( # only if precision parameter informed # increase the precision of the computing (if negative exponential too close to 0) mp.dps = precision - buf_exp = list(map(mp.exp, np.negative(sum2) / (temp_2d))) - buf_pow = list(map(partial(mp.power, y=n / 2), 1 / (np.pi * temp_2d))) + buf_exp = np.array(list(map(mp.exp, np.negative(sum2)/(temp_2d)))) + buf_pow = np.array(list(map(partial(mp.power, y=n/2), 1/(np.pi*temp_2d)))) LH = buf_pow * buf_exp # likelihood # /end{handling high precision computing} LH = np.nan_to_num(LH) # corrects NaNs in likelihood - if np.sum(LH) == 0 and test_approximation == 0: # correction for LH full of 0.0 + if np.sum(LH) == 0 and test_approximation == 0: test_approximation = 1 # this message won't appear again print( "###\nValues too small, approximation applied to avoid division by 0.\nPrecision mode can correct this problem, but increases computation time.\n###" ) + + if np.sum(LH) == 0: # correction for LH full of 0.0 LH[LH == 0] = np.exp(-745) g_star.append(np.sum(g * LH) / np.sum(LH)) d_star.append(np.sum(d * LH) / np.sum(LH)) @@ -432,7 +434,7 @@ def treat_batches(batch: list) -> Tuple[int, List[int], List[int], int]: batches = [] # list of lists, contains the list of position for each batch for i in range(n_batch): batches.append(np.where(batch == np.unique(batch)[i])[0]) - batches = np.asarray(batches) + batches = np.asarray(batches, dtype=np.int32) n_batches = list(map(len, batches)) if 1 in n_batches: # mean_only = True # no variance if only one sample in a batch - mean_only has to be used