diff --git a/.gitignore b/.gitignore index b6e47617..d4e65d24 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,12 @@ dmypy.json # Pyre type checker .pyre/ + +# Pycharm +.idea/* + +# Development directory +__dev + +# Binary files +libs/ccc/sklearn/metrics diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..55002b4a --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,20 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-toml + - id: check-added-large-files + - id: mixed-line-ending + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.8.5 # Ruff version. + hooks: + - id: ruff # Run the linter. + types_or: [python, pyi] + args: [--fix] + - id: ruff-format # Run the formatter. + types_or: [python, pyi] diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 00000000..6dffd85a --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,35 @@ +# Read the Docs configuration file for Sphinx projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.12" + # You can also specify other tool versions: + # nodejs: "20" + # rust: "1.70" + # golang: "1.20" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/source/conf.py + # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs + # builder: "dirhtml" + # Fail on all warnings to avoid broken references + # fail_on_warning: true + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, 
declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +# python: +# install: +# - requirements: docs/requirements.txt \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000..a215e3db --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,112 @@
+cmake_minimum_required(VERSION 3.15...3.26)
+project(${SKBUILD_PROJECT_NAME} LANGUAGES CUDA CXX)
+
+# Root directory of the project's own sources (Python package and CUDA extension)
+set(PROJECT_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/libs")
+# Set extension name and source directory
+set(CUDA_EXT_MODULE_NAME ccc_cuda_ext)
+set(CUDA_EXT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/libs/${CUDA_EXT_MODULE_NAME}")
+
+# Set C++ standard
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Set output directories. This has to happen before any target is created:
+# these variables only initialize the properties of targets defined afterwards,
+# and the working directory of the tests below is derived from them.
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
+
+# Point the Python lookup at the active conda environment, if there is one.
+# The interpreter path must be set before it is used to query the version.
+if(DEFINED ENV{CONDA_PREFIX} AND EXISTS "$ENV{CONDA_PREFIX}/bin/python")
+  set(Python_EXECUTABLE "$ENV{CONDA_PREFIX}/bin/python")
+  execute_process(
+    COMMAND "${Python_EXECUTABLE}" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}')"
+    OUTPUT_VARIABLE PYTHON_VERSION_FULL
+    RESULT_VARIABLE PYTHON_VERSION_RESULT
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+  )
+  if(NOT PYTHON_VERSION_RESULT EQUAL 0)
+    message(FATAL_ERROR "Could not query the Python version using ${Python_EXECUTABLE}")
+  endif()
+  # Include and library paths are versioned with major.minor only
+  # (e.g. include/python3.10), never with the micro version.
+  string(REGEX MATCH "^[0-9]+\\.[0-9]+" PYTHON_VERSION_MAJOR_MINOR "${PYTHON_VERSION_FULL}")
+  set(PYTHON_INCLUDE_DIR "$ENV{CONDA_PREFIX}/include/python${PYTHON_VERSION_MAJOR_MINOR}")
+  set(PYTHON_LIBRARY "$ENV{CONDA_PREFIX}/lib/libpython${PYTHON_VERSION_MAJOR_MINOR}.so")
+endif()
+
+# Locate Python (interpreter and development files) and pybind11
+find_package(Python REQUIRED COMPONENTS Interpreter Development)
+set(PYBIND11_NEWPYTHON ON)
+find_package(pybind11 CONFIG REQUIRED)
+
+# Tests are built by default; pass -DCCC_GPU_BUILD_TESTS=OFF to skip them
+# (and the GoogleTest download), e.g. when only the extension is needed.
+option(CCC_GPU_BUILD_TESTS "Build the C++/CUDA unit tests" ON)
+
+# Register one CTest test per GoogleTest source file found under TEST_DIR.
+#
+# Arguments:
+#   TEST_DIR - directory searched recursively for *_test.cpp, *_tests.cpp
+#              and test_*.cpp files.
+#
+# Each file is compiled together with the extension's metrics.cu into an
+# executable named after the file (without its extension).
+function(add_tests_from_directory TEST_DIR)
+  # Find all test files in the directory. CONFIGURE_DEPENDS re-checks the
+  # glob at build time, so newly added tests are picked up without a manual
+  # reconfigure.
+  file(GLOB_RECURSE TEST_FILES CONFIGURE_DEPENDS
+    "${TEST_DIR}/*_test.cpp"  # Files ending with _test.cpp
+    "${TEST_DIR}/*_tests.cpp" # Files ending with _tests.cpp
+    "${TEST_DIR}/test_*.cpp"  # Files starting with test_
+  )
+
+  # Loop through each test file
+  foreach(TEST_FILE IN LISTS TEST_FILES)
+    # Get the filename without extension
+    get_filename_component(TEST_NAME "${TEST_FILE}" NAME_WE)
+
+    # Create an executable for this test
+    add_executable(${TEST_NAME} "${TEST_FILE}" "${CUDA_EXT_DIR}/metrics.cu")
+
+    # Link against gtest and the Python/pybind11 libraries
+    target_link_libraries(${TEST_NAME} PRIVATE
+      GTest::gtest_main
+      GTest::gtest
+      pybind11::headers
+      pybind11::embed
+      Python::Python
+    )
+
+    # Add the test to CTest
+    add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
+
+    # Set test properties
+    set_tests_properties(${TEST_NAME} PROPERTIES
+      TIMEOUT 10 # Timeout in seconds
+      WORKING_DIRECTORY "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}"
+    )
+  endforeach()
+endfunction()
+
+if(CCC_GPU_BUILD_TESTS)
+  # Download and configure Google Test
+  include(FetchContent)
+  FetchContent_Declare(
+    googletest
+    GIT_REPOSITORY https://github.com/google/googletest.git
+    GIT_TAG v1.15.2 # Adjust version as needed
+  )
+  FetchContent_MakeAvailable(googletest)
+
+  enable_testing()
+  add_tests_from_directory("${CMAKE_CURRENT_SOURCE_DIR}/tests")
+endif()
+
+# Set up binding then do compilation and installation
+pybind11_add_module(${CUDA_EXT_MODULE_NAME} "${CUDA_EXT_DIR}/binder.cu" "${CUDA_EXT_DIR}/metrics.cu")
+
+install(TARGETS ${CUDA_EXT_MODULE_NAME} LIBRARY DESTINATION .)
diff --git a/README.md b/README.md index f0685fbf..0466eab8 100644 --- a/README.md +++ b/README.md @@ -1,321 +1,46 @@ -# Clustermatch Correlation Coefficient (CCC) +# Clustermatch Correlation Coefficient GPU (CCC-GPU) -[![Code tests](https://github.com/greenelab/ccc/actions/workflows/pytest.yaml/badge.svg)](https://github.com/greenelab/ccc/actions/workflows/pytest.yaml) -[![codecov](https://codecov.io/gh/greenelab/ccc/branch/main/graph/badge.svg?token=QNK6O3Y1VF)](https://codecov.io/gh/greenelab/ccc) -[![bioRxiv Manuscript](https://img.shields.io/badge/manuscript-bioRxiv-blue.svg)](https://doi.org/10.1101/2022.06.15.496326) -[![HTML Manuscript](https://img.shields.io/badge/manuscript-HTML-blue.svg)](https://greenelab.github.io/ccc-manuscript/) +## Development +[Scikit-build](https://scikit-build-core.readthedocs.io/en/latest/getting_started.html) is used to build the C++ CUDA extension module and its tests. -## Overview - -The Clustermatch Correlation Coefficient (CCC) is a highly-efficient, next-generation not-only-linear correlation coefficient that can work on numerical and categorical data types. -This repository contains the code of CCC and instructions to install and use it. -It also has all the scripts/notebooks to run the analyses associated with the [manuscript](https://github.com/greenelab/ccc-manuscript), where we applied CCC on gene expression data. - -## Installation - -CCC is available as a PyPI (Python) package (`ccc-coef`). We tested CCC in Python 3.9+, but it should work on prior 3.x versions. 
-You can quickly test it by creating a conda environment and then install it with `pip`: - -```bash -# ipython and pandas are used in the following examples, but they are not needed for CCC to work -conda create -y -n ccc-env python=3.9 ipython pandas -conda activate ccc-env -pip install ccc-coef +### How to set up the development environment +At the root of the repository, run: ``` - -## Usage - -Run `ipython` in your terminal: -```bash -$ ipython -Python 3.10.4 (main, Mar 31 2022, 08:41:55) [GCC 7.5.0] -Type 'copyright', 'credits' or 'license' for more information -IPython 8.3.0 -- An enhanced Interactive Python. Type '?' for help. - -In [1]: +conda env create -f environment-cuda.yml ``` -When computing the correlation coefficient on a pair of features, CCC supports `numpy.array` or `pandas.Series`. -Missing values (`NaN`) are not currently supported, so you have to either remove or impute them. -Below there is an example with numerical data (you can copy/paste the entire lines below including `In [...]`): - -```python -In [1]: import numpy as np -In [2]: import pandas as pd -In [3]: from ccc.coef import ccc - -In [4]: random_feature1 = np.random.rand(1000) -In [5]: random_feature2 = np.random.rand(1000) -In [6]: ccc(random_feature1, random_feature2) -Out[6]: 0.0018815884476534295 - -In [7]: random_feature1 = pd.Series(random_feature1) -In [8]: random_feature2 = pd.Series(random_feature2) -In [9]: ccc(random_feature1, random_feature2) -Out[9]: 0.0018815884476534295 +### How to activate the development environment +At the root of the repository, run: ``` - -CCC always returns a value between zero (no relationship) and one (perfect relationship). -[As we show in the manuscript](https://greenelab.github.io/ccc-manuscript/#the-ccc-reveals-linear-and-nonlinear-patterns-in-human-transcriptomic-data), the distribution of CCC values is much more skewed than other coefficients like Pearson's or Spearman's. -A comparison between these coefficients should account for that. 
- -You can also mix numerical and categorical data: - -```python -In [10]: categories = np.array(["blue", "red", "green", "yellow"]) -In [11]: categorical_random_feature1 = np.random.choice(categories, size=1000) -In [12]: categorical_random_feature2 = np.random.choice(categories, size=1000) -In [13]: categorical_random_feature2[:10] -Out[13]: -array(['yellow', 'red', 'red', 'yellow', 'blue', 'blue', 'red', 'yellow', - 'green', 'blue'], dtype='NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 00000000..937f3163 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,41 @@ +# Configuration file for the Sphinx documentation builder. 
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = 'ccc-gpu' +copyright = '2025, Milton Pividori, Haoyu Zhang, Kevin Fotso' +author = 'Milton Pividori, Haoyu Zhang, Kevin Fotso' + + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +templates_path = ['_templates'] +exclude_patterns = [] + +extensions = [ + 'sphinx.ext.duration', + 'sphinx.ext.doctest', + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.intersphinx', +] + +intersphinx_mapping = { + 'python': ('https://docs.python.org/3/', None), + 'sphinx': ('https://www.sphinx-doc.org/en/master/', None), +} +intersphinx_disabled_domains = ['std'] + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = 'haiku' +html_static_path = ['_static'] + +# -- Options for EPUB output +epub_show_urls = 'footnote' diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 00000000..76a18c54 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,17 @@ +.. ccc-gpu documentation master file, created by + sphinx-quickstart on Thu Jan 9 15:14:14 2025. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +ccc-gpu documentation +===================== + +Add your content using ``reStructuredText`` syntax. See the +`reStructuredText `_ +documentation for details. + + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + diff --git a/environment/README.md b/environment/README.md index 474fa738..8755c44b 100644 --- a/environment/README.md +++ b/environment/README.md @@ -83,7 +83,7 @@ cd environment/ 1. Create a conda environment and install main packages: ```bash -conda env create --name ccc --file environment.yml +conda env create --name ccc --file environment-cuda.yml conda run -n ccc --no-capture-output bash scripts/install_other_packages.sh ``` @@ -131,7 +131,7 @@ bash scripts/install_other_packages.sh 1. Export conda environment: ```bash -conda env export --name ccc --file environment.yml +conda env export --name ccc --file environment-cuda.yml ``` 1. Modify `environment.yml` and leave only manually installed packages (not their dependencies). diff --git a/environment/environment-cuda.yml b/environment/environment-cuda.yml new file mode 100644 index 00000000..3ff03b10 --- /dev/null +++ b/environment/environment-cuda.yml @@ -0,0 +1,23 @@ +name: ccc-gpu +channels: + - rapidsai + - conda-forge + - nvidia +dependencies: + - rapids=24.08 + - cuda-version>=12.0,<=12.5 + - cupy=13.* + - pip + - python=3.10 + - minepy + - pip: + - sphinx==8.* + - numba==0.60.* + - pytest==8.* + - pybind11==2.* + - ipython==8.* + - seaborn==0.13.* + - upsetplot==0.9.* + - numpy==2.* + - numba==0.60.* + \ No newline at end of file diff --git a/environment/environment.yml b/environment/environment.yml index 7c3738a4..4ad6ea0e 100644 --- a/environment/environment.yml +++ b/environment/environment.yml @@ -3,14 +3,16 @@ channels: - conda-forge - defaults dependencies: + - cudatoolkit=11.2.* + - cupy=13.2.* - ipython=7.* - ipywidgets - jupyterlab=3.3.* - jupytext=1.11.* - matplotlib=3.4.* - minepy=1.2.* - - numba=0.53.* - - numpy=1.21.* + - numba=0.60.* + - numpy=1.25.* - openpyxl=3.0.* - pandas=1.3.* - papermill=2.3.* @@ -27,7 +29,7 @@ dependencies: - r-svglite=2.* - rpy2=3.4.* - scikit-learn=0.24.* - - scipy=1.7.* + - scipy=1.9.* - seaborn=0.11.* - 
svgutils=0.3.* - tabulate=0.8.* diff --git a/libs/ccc/__init__.py b/libs/ccc/__init__.py index 36211dc8..f98e1d65 100644 --- a/libs/ccc/__init__.py +++ b/libs/ccc/__init__.py @@ -1,2 +1,4 @@ +from __future__ import annotations + # Remember to change also setup.py with the version here __version__ = "0.2.2" diff --git a/libs/ccc/coef/impl.py b/libs/ccc/coef/impl.py index 18532990..440692dc 100644 --- a/libs/ccc/coef/impl.py +++ b/libs/ccc/coef/impl.py @@ -816,4 +816,4 @@ def ccc( if pvalue_n_perms is not None and pvalue_n_perms > 0: return cm_values, cm_pvalues else: - return cm_values + return cm_values \ No newline at end of file diff --git a/libs/ccc/coef/impl_gpu.py b/libs/ccc/coef/impl_gpu.py new file mode 100644 index 00000000..18532990 --- /dev/null +++ b/libs/ccc/coef/impl_gpu.py @@ -0,0 +1,819 @@ +""" +Contains function that implement the Clustermatch Correlation Coefficient (CCC). +""" +from __future__ import annotations + +import os +from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor +from typing import Iterable, Union + +import numpy as np +from numpy.typing import NDArray +from numba import njit +from numba.typed import List + +from ccc.pytorch.core import unravel_index_2d +from ccc.sklearn.metrics import adjusted_rand_index as ari +from ccc.scipy.stats import rank +from ccc.utils import chunker, DummyExecutor + + +@njit(cache=True, nogil=True) +def get_perc_from_k(k: int) -> list[float]: + """ + It returns the percentiles (from 0.0 to 1.0) that separate the data into k + clusters. For example, if k=2, it returns [0.5]; if k=4, it returns [0.25, + 0.50, 0.75]. + + Args: + k: number of clusters. If less than 2, the function returns an empty + list. + + Returns: + A list of percentiles (from 0.0 to 1.0). 
+ """ + return [(1.0 / k) * i for i in range(1, k)] + + +@njit(cache=True, nogil=True) +def run_quantile_clustering(data: NDArray, k: int) -> NDArray[np.int16]: + """ + Performs a simple quantile clustering on one dimensional data (1d). Quantile + clustering is defined as the procedure that forms clusters in 1d data by + separating objects using quantiles (for instance, if the median is used, two + clusters are generated with objects separated by the median). In the case + data contains all the same values (zero variance), this implementation can + return less clusters than specified with k. + + Args: + data: a 1d numpy array with numerical values. + k: the number of clusters to split the data into. + + Returns: + A 1d array with the data partition. + """ + data_sorted = np.argsort(data, kind="quicksort") + data_rank = rank(data, data_sorted) + data_perc = data_rank / len(data) + + percentiles = [0.0] + get_perc_from_k(k) + [1.0] + + cut_points = np.searchsorted(data_perc[data_sorted], percentiles, side="right") + + current_cluster = 0 + part = np.zeros(data.shape, dtype=np.int16) - 1 + + for i in range(len(cut_points) - 1): + lim1 = cut_points[i] + lim2 = cut_points[i + 1] + + part[data_sorted[lim1:lim2]] = current_cluster + current_cluster += 1 + + return part + + +@njit(cache=True, nogil=True) +def get_range_n_clusters( + n_features: int, internal_n_clusters: Iterable[int] = None +) -> NDArray[np.uint8]: + """ + Given the number of features it returns a tuple of k values to cluster those + features into. By default, it generates a tuple of k values from 2 to + int(np.round(np.sqrt(n_features))) (inclusive). For example, for 25 features, + it will generate this tuple: (2, 3, 4, 5). + + Args: + n_features: a positive number representing the number of features that + will be clustered into different groups/clusters. + internal_n_clusters: it allows to force a different list of clusters. It + must be a list of integers. 
Repeated or invalid values will be dropped, + such as values lesser than 2 (a singleton partition is not allowed). + + Returns: + A numpy array with integer values representing numbers of clusters. + """ + + if internal_n_clusters is not None: + # remove k values that are invalid + clusters_range_list = list( + set([int(x) for x in internal_n_clusters if 1 < x < n_features]) + ) + else: + # default behavior if no internal_n_clusters is given: return range from + # 2 to sqrt(n_features) + n_sqrt = int(np.round(np.sqrt(n_features))) + n_sqrt = min((n_sqrt, 10)) + clusters_range_list = list(range(2, n_sqrt + 1)) + + return np.array(clusters_range_list, dtype=np.uint16) + + +@njit(cache=True, nogil=True) +def get_parts( + data: NDArray, range_n_clusters: tuple[int], data_is_numerical: bool = True +) -> NDArray[np.int16]: + """ + Given a 1d data array, it computes a partition for each k value in the given + range of clusters. If partitions with only one cluster are returned (singletons), + then the returned array will have negative values. + + Args: + data: a 1d data vector. It is assumed that there are no nans. + range_n_clusters: a tuple with the number of clusters. + data_is_numerical: indicates whether data is numerical (True) or categorical (False) + + Returns: + A numpy array with shape (number of clusters, data rows) with + partitions of data. + + Partitions could have negative values in some scenarios, with different + meanings: -1 is used for categorical data, where only one partition is generated + and the rest (-1) are marked as "empty". -2 is used when singletons have been + detected (partitions with one cluster), usually because of problems with the + input data (it has all the same values, for example). 
+ """ + parts = np.zeros((len(range_n_clusters), data.shape[0]), dtype=np.int16) - 1 + + if data_is_numerical: + for idx in range(len(range_n_clusters)): + k = range_n_clusters[idx] + parts[idx] = run_quantile_clustering(data, k) + + # remove singletons by putting a -2 as values + partitions_ks = np.array([len(np.unique(p)) for p in parts]) + parts[partitions_ks == 1, :] = -2 + else: + # if the data is categorical, then the encoded feature is already the partition + # only the first partition is filled, the rest will be -1 (missing) + parts[0] = data.astype(np.int16) + + return parts + + +def get_feature_parts(params): + """ + Given a list of parameters, it returns the partitions for each feature. The goal + of this function is to parallelize the partitioning step (get_parts function). + + Args: + params: a list of tuples with three elements: 1) a tuple with the feature + index, the cluster index and the number of clusters (k), 2) the data for the + feature, and 3) a boolean indicating whether the feature is numerical or not. + + Returns: + A 2d array with the partitions (rows) for the selected features and number of + clusters. 
+ """ + n_objects = params[0][1].shape[0] + parts = np.zeros((len(params), n_objects), dtype=np.int16) - 1 + + # iterate over a list of tuples that indicate a feature-k pair + for p_idx, p in enumerate(params): + # the first element is a tuple with the feature index, the cluster index and the + # number of clusters (k) + info = p[0] + # f_idx = info[0] + c_idx = info[1] + c = info[2] + range_n_clusters = np.array([c], dtype=np.uint16) + + # the second element is the data for the feature + data = p[1] + + # the third element is a boolean indicating whether the feature is numerical + numerical_data_type = p[2] + + # if the feature is categorical, then only the first partition is filled + if not numerical_data_type and c_idx > 0: + continue + + parts[p_idx] = get_parts(data, range_n_clusters, numerical_data_type) + + return parts + + +def cdist_parts_basic(x: NDArray, y: NDArray) -> NDArray[float]: + """ + It implements the same functionality in scipy.spatial.distance.cdist but + for clustering partitions, and instead of a distance it returns the adjusted + Rand index (ARI). In other words, it mimics this function call: + + cdist(x, y, metric=ari) + + Only partitions with positive labels (> 0) are compared. This means that + partitions marked as "singleton" or "empty" (categorical data) are not + compared. This has the effect of leaving an ARI of 0.0 (zero). + + Args: + x: a 2d array with m_x clustering partitions in rows and n objects in + columns. + y: a 2d array with m_y clustering partitions in rows and n objects in + columns. + + Returns: + A 2d array with m_x rows and m_y columns and the ARI between each + partition pair. Each ij entry is equal to ari(x[i], y[j]) for each i + and j. 
+ """ + res = np.zeros((x.shape[0], y.shape[0])) + + for i in range(res.shape[0]): + if x[i, 0] < 0: + continue + + for j in range(res.shape[1]): + if y[j, 0] < 0: + continue + + res[i, j] = ari(x[i], y[j]) + + return res + + +def cdist_parts_parallel( + x: NDArray, y: NDArray, executor: ThreadPoolExecutor +) -> NDArray[float]: + """ + It parallelizes cdist_parts_basic function. + + Args: + x: same as in cdist_parts_basic + y: same as in cdist_parts_basic + executor: a pool executor where jobs will be submitted. + + Results: + Same as in cdist_parts_basic. + """ + res = np.zeros((x.shape[0], y.shape[0])) + + inputs = get_chunks(res.shape[0], executor._max_workers, 1) + + tasks = {executor.submit(cdist_parts_basic, x[idxs], y): idxs for idxs in inputs} + for t in as_completed(tasks): + idx = tasks[t] + res[idx, :] = t.result() + + return res + + +@njit(cache=True, nogil=True) +def get_coords_from_index(n_obj: int, idx: int) -> tuple[int]: + """ + Given the number of objects and an index, it returns the row/column + position of the pairwise matrix. For example, if there are n_obj objects + (such as genes), a condensed 1d array can be created with pairwise + comparisons between genes, as well as a squared symmetric matrix. This + function receives the number of objects and the index of the condensed + array, and returns the coordiates of the squared symmetric matrix. + + Args: + n_obj: the number of objects. + idx: the index of the condensed pairwise array across all n_obj objects. + + Returns + A tuple (i, j) with the coordinates of the squared symmetric matrix + equivalent to the condensed array. 
+ """ + b = 1 - 2 * n_obj + x = np.floor((-b - np.sqrt(b**2 - 8 * idx)) / 2) + y = idx + x * (b + x + 2) / 2 + 1 + return int(x), int(y) + + +def get_chunks( + iterable: Union[int, Iterable], n_threads: int, ratio: float = 1 +) -> Iterable[Iterable[int]]: + """ + It splits elements in an iterable in chunks according to the number of + CPU cores available for parallel processing. + + Args: + iterable: an iterable to be split in chunks. If it is an integer, it + will split the iterable given by np.arange(iterable). + n_threads: number of threads available for parallelization. + ratio: a ratio that allows to increase the number of splits given + n_threads. For example, with ratio=1, the function will just split + the iterable in n_threads chunks. If ratio is larger than 1, then + it will split in n_threads * ratio chunks. + + Results: + Another iterable with chunks according to the arguments given. For + example, if iterable is [0, 1, 2, 3, 4, 5] and n_threads is 2, it will + return [[0, 1, 2], [3, 4, 5]]. + """ + if isinstance(iterable, int): + iterable = np.arange(iterable) + + n = len(iterable) + expected_n_chunks = n_threads * ratio + + res = list(chunker(iterable, int(np.ceil(n / expected_n_chunks)))) + + while len(res) < expected_n_chunks <= n: + # look for an element in res that can be split in two + idx = 0 + while len(res[idx]) == 1: + idx = idx + 1 + + new_chunk = get_chunks(res[idx], 2) + res[idx] = new_chunk[0] + res.insert(idx + 1, new_chunk[1]) + + return res + + +def get_feature_type_and_encode(feature_data: NDArray) -> tuple[NDArray, bool]: + """ + Given the data of one feature as a 1d numpy array (it could also be a pandas.Series), + it returns the same data if it is numerical (float, signed or unsigned integer) or an + encoded version if it is categorical (each category value has a unique integer starting from + zero). + + Args: + feature_data: a 1d array with data. + + Returns: + A tuple with two elements: + 1. 
the feature data: same as input if numerical, encoded version if not numerical. + 2. A boolean indicating whether the feature data is numerical or not. + """ + data_type_is_numerical = feature_data.dtype.kind in ("f", "i", "u") + if data_type_is_numerical: + return feature_data, data_type_is_numerical + + # here np.unique with return_inverse encodes categorical values into numerical ones + return np.unique(feature_data, return_inverse=True)[1], data_type_is_numerical + + +def compute_ccc(obj_parts_i: NDArray, obj_parts_j: NDArray, cdist_func): + """ + Given a set of partitions for two features, it computes the CCC coefficient. + + Args: + obj_parts_i: a 2d array with partitions for one feature. Each row is a + partition, and each column is an object. + obj_parts_j: a 2d array with partitions for another feature. Each row is + a partition, and each column is an object. + cdist_func: a function that computes the distance between partitions. It + can be either cdist_parts_basic or cdist_parts_parallel. + + Returns: + A tuple with two elements: 1) the CCC coefficient, and 2) the indexes + of the partitions that maximized the coefficient. + """ + comp_values = cdist_func( + obj_parts_i, + obj_parts_j, + ) + max_flat_idx = comp_values.argmax() + max_idx = unravel_index_2d(max_flat_idx, comp_values.shape) + + return max(comp_values[max_idx], 0.0), max_idx + + +def compute_ccc_perms(params) -> NDArray[float]: + """ + Similar to compute_ccc (with same parameters), but it computes the CCC coefficient + by permuting the partitions of one of the features n_perms times. + + Args: + params: a tuple with four elements: 1) the index of the permutations, 2) the + partitions of one of the features, 3) the partitions of the other feature, + and 4) the number of permutations to perform. + + Returns: + The CCC coefficient values using the permuted partitions of one of the features. 
+ """ + # since this function can be parallelized across different processes, make sure + # the random number generator is initialized with a different seed for each process + rng = np.random.default_rng() + + _, obj_parts_i, obj_parts_j, n_perms = params + + n_objects = obj_parts_i.shape[1] + ccc_perm_values = np.full(n_perms, np.nan, dtype=float) + + for idx in range(n_perms): + perm_idx = rng.permutation(n_objects) + + # generate a random permutation of the partitions of one + # variable/feature + obj_parts_j_permuted = np.full_like(obj_parts_j, np.nan) + for it in range(obj_parts_j.shape[0]): + obj_parts_j_permuted[it] = obj_parts_j[it][perm_idx] + + # compute the CCC using the permuted partitions + ccc_perm_values[idx] = compute_ccc( + obj_parts_i, obj_parts_j_permuted, cdist_parts_basic + )[0] + + return ccc_perm_values + + +def compute_coef(params): + """ + Given a list of indexes representing each a pair of + objects/rows/genes, it computes the CCC coefficient for + each of them. This function is supposed to be used to parallelize + processing. + + Args: + params: a tuple with eight elements: 1) the indexes of the features + to compare, 2) the number of features, 3) the partitions for each + feature, 4) the number of permutations to compute the p-value, 5) + the number of threads to use for parallelization, 6) the ratio + between the number of chunks and the number of threads, 7) the + executor to use for cdist parallelization, and 8) the executor to use + for parallelization of permutations. + + Returns: + Returns a tuple with three arrays. The first array has the CCC + coefficients, the second array has the indexes of the partitions that + maximized the coefficient, and the third array has the p-values. 
+ """ + ( + idx_list, + n_features, + parts, + pvalue_n_perms, + default_n_threads, + n_chunks_threads_ratio, + cdist_executor, + executor, + ) = params + + cdist_func = cdist_parts_basic + if cdist_executor is not False: + + def cdist_func(x, y): + return cdist_parts_parallel(x, y, cdist_executor) + + n_idxs = len(idx_list) + max_ari_list = np.full(n_idxs, np.nan, dtype=float) + max_part_idx_list = np.zeros((n_idxs, 2), dtype=np.uint64) + pvalues = np.full(n_idxs, np.nan, dtype=float) + + for idx, data_idx in enumerate(idx_list): + i, j = get_coords_from_index(n_features, data_idx) + + # get partitions for the pair of objects + obji_parts, objj_parts = parts[i], parts[j] + + # compute ari only if partitions are not marked as "missing" + # (negative values), which is assigned when partitions have + # one cluster (usually when all data in the feature has the same + # value). + if obji_parts[0, 0] == -2 or objj_parts[0, 0] == -2: + continue + + # compare all partitions of one object to the all the partitions + # of the other object, and get the maximium ARI + max_ari_list[idx], max_part_idx_list[idx] = compute_ccc( + obji_parts, objj_parts, cdist_func + ) + + # compute p-value if requested + if pvalue_n_perms is not None and pvalue_n_perms > 0: + # with ThreadPoolExecutor(max_workers=pvalue_n_jobs) as executor_perms: + # select the variable that generated more partitions as the one + # to permute + obj_parts_sel_i = obji_parts + obj_parts_sel_j = objj_parts + if (obji_parts[:, 0] >= 0).sum() > (objj_parts[:, 0] >= 0).sum(): + obj_parts_sel_i = objj_parts + obj_parts_sel_j = obji_parts + + p_ccc_values = np.full(pvalue_n_perms, np.nan, dtype=float) + p_inputs = get_chunks( + pvalue_n_perms, default_n_threads, n_chunks_threads_ratio + ) + p_inputs = [ + ( + i, + obj_parts_sel_i, + obj_parts_sel_j, + len(i), + ) + for i in p_inputs + ] + + for params, p_ccc_val in zip( + p_inputs, + executor.map( + compute_ccc_perms, + p_inputs, + ), + ): + p_idx = params[0] + + 
p_ccc_values[p_idx] = p_ccc_val + + # compute p-value + pvalues[idx] = (np.sum(p_ccc_values >= max_ari_list[idx]) + 1) / ( + pvalue_n_perms + 1 + ) + + return max_ari_list, max_part_idx_list, pvalues + + +def get_n_workers(n_jobs: int | None) -> int: + """ + Helper function to get the number of workers for parallel processing. + + Args: + n_jobs: value specified by the main ccc function. + Returns: + The number of workers to use for parallel processing + """ + n_cpu_cores = os.cpu_count() + if n_cpu_cores is None: + raise ValueError("Could not determine the number of CPU cores. Please specify a positive value of n_jobs") + + n_workers = n_cpu_cores + if n_jobs is None: + return n_workers + + n_workers = os.cpu_count() + n_jobs if n_jobs < 0 else n_jobs + + if n_workers < 1: + raise ValueError(f"The number of threads/processes to use must be greater than 0. Got {n_workers}." + "Please check the n_jobs argument provided") + + return n_workers + + +def ccc( + x: NDArray, + y: NDArray = None, + internal_n_clusters: Union[int, Iterable[int]] = None, + return_parts: bool = False, + n_chunks_threads_ratio: int = 1, + n_jobs: int = 1, + pvalue_n_perms: int = None, + partitioning_executor: str = "thread", +) -> tuple[NDArray[float], NDArray[float], NDArray[np.uint64], NDArray[np.int16]]: + """ + This is the main function that computes the Clustermatch Correlation + Coefficient (CCC) between two arrays. The implementation supports numerical + and categorical data. + + Args: + x: 1d or 2d numerical array with the data. NaN are not supported. + If it is 2d, then the coefficient is computed for each pair of rows + (in case x is a numpy.array) or each pair of columns (pandas.DataFrame). + y: an optional 1d numerical array. If x is 1d and y is given, it computes + the coefficient between x and y. 
+ internal_n_clusters: this parameter can be an integer (the maximum number + of clusters used to split x and y, starting from k=2) or a list of + integer values (a custom list of k values). + return_parts: if True, for each object pair, it returns the partitions + that maximized the coefficient. + n_chunks_threads_ratio: allows to modify how pairwise comparisons are + split across different threads. It's given as the ratio parameter of + function get_chunks. + n_jobs: number of CPU cores/threads to use for parallelization. The value + None will use all available cores (`os.cpu_count()`), and negative + values will use `os.cpu_count() + n_jobs` (exception will be raised + if this expression yields a result less than 1). Default is 1. + pvalue_n_perms: if given, it computes the p-value of the + coefficient using the given number of permutations. + partitioning_executor: Executor type used for partitioning the data. It + can be either "thread" (default) or "process". If "thread", it will use + ThreadPoolExecutor for parallelization, which uses less memory. If + "process", it will use ProcessPoolExecutor, which might be faster. If + anything else, it will not parallelize the partitioning step. + + + Returns: + If returns_parts is True, then it returns a tuple with three values: + 1) the coefficients, 2) the partitions indexes that maximized the coefficient + for each object pair, and 3) the partitions for all objects. + If return_parts is False, only CCC values are returned. + + cm_values: if x is 2d np.array with x.shape[0] > 2, then cm_values is a 1d + condensed array of pairwise coefficients. It has size (n * (n - 1)) / 2, + where n is the number of rows in x. If x and y are given, and they are 1d, + then cm_values is a scalar. The CCC is always between 0 and 1 (inclusive). If + any of the two variables being compared has no variation (all values are the + same), the coefficient is not defined (np.nan). 
If pvalue_n_permutations is + an integer greater than 0, then cm_vlaues is a tuple with two elements: + the first element are the CCC values, and the second element are the p-values + using pvalue_n_permutations permutations. + + max_parts: an array with n * (n - 1)) / 2 rows (one for each object + pair) and two columns. It has the indexes pointing to each object's + partition (parts, see below) that maximized the ARI. If + cm_values[idx] is nan, then max_parts[idx] will be meaningless. + + parts: a 3d array that contains all the internal partitions generated + for each object in data. parts[i] has the partitions for object i, + whereas parts[i,j] has the partition j generated for object i. The + third dimension is the number of columns in x (if 2d) or elements in + x/y (if 1d). For example, if you want to access the pair of + partitions that maximized the CCC given x and y + (a pair of objects), then max_parts[0] and max_parts[1] have the + partition indexes in parts, respectively: parts[0][max_parts[0]] + points to the partition for x, and parts[1][max_parts[1]] points to + the partition for y. Values could be negative in case + singleton cases were found (-1; usually because input data has all the same + value) or for categorical features (-2). 
+ """ + n_objects = None + n_features = None + # this is a boolean array of size n_features with True if the feature is numerical and False otherwise + X_numerical_type = None + if x.ndim == 1 and (y is not None and y.ndim == 1): + # both x and y are 1d arrays + if not x.shape == y.shape: + raise ValueError("x and y need to be of the same size") + n_objects = x.shape[0] + n_features = 2 + + X = np.zeros((n_features, n_objects)) + X_numerical_type = np.full((n_features,), True, dtype=bool) + + X[0, :], X_numerical_type[0] = get_feature_type_and_encode(x) + X[1, :], X_numerical_type[1] = get_feature_type_and_encode(y) + elif x.ndim == 2 and y is None: + # x is a 2d array; two things could happen: 1) this is an numpy array, + # in that case, features are in rows, objects are in columns; 2) or this is a + # pandas dataframe, which is the opposite (features in columns and objects in rows), + # plus we have the features data type (numerical, categorical, etc) + + if isinstance(x, np.ndarray): + if not get_feature_type_and_encode(x[0, :])[1]: + raise ValueError("If data is a 2d numpy array, it has to be numerical. 
Use pandas.DataFrame if " + "you need to mix features with different data types") + n_objects = x.shape[1] + n_features = x.shape[0] + + X = x + X_numerical_type = np.full((n_features,), True, dtype=bool) + elif hasattr(x, "to_numpy"): + # Here I assume that if x has the attribute "to_numpy" is of type pandas.DataFrame + # Using isinstance(x, pandas.DataFrame) would be more appropriate, but I dont want to + # have pandas as a dependency just for that + n_objects = x.shape[0] + n_features = x.shape[1] + + X = np.zeros((n_features, n_objects)) + X_numerical_type = np.full((n_features,), True, dtype=bool) + + for f_idx in range(n_features): + X[f_idx, :], X_numerical_type[f_idx] = get_feature_type_and_encode( + x.iloc[:, f_idx] + ) + else: + raise ValueError("Wrong combination of parameters x and y") + + # get number of cores to use + n_workers = get_n_workers(n_jobs) + + if internal_n_clusters is not None: + _tmp_list = List() + + if isinstance(internal_n_clusters, int): + # this interprets internal_n_clusters as the maximum k + internal_n_clusters = range(2, internal_n_clusters + 1) + + for x in internal_n_clusters: + _tmp_list.append(x) + internal_n_clusters = _tmp_list + + # get matrix of partitions for each object pair + range_n_clusters = get_range_n_clusters(n_objects, internal_n_clusters) + + if range_n_clusters.shape[0] == 0: + raise ValueError(f"Data has too few objects: {n_objects}") + + # store a set of partitions per row (object) in X as a multidimensional + # array, where the second dimension is the number of partitions per object. 
+ parts = ( + np.zeros((n_features, range_n_clusters.shape[0], n_objects), dtype=np.int16) - 1 + ) + + # cm_values stores the CCC coefficients + n_features_comp = (n_features * (n_features - 1)) // 2 + cm_values = np.full(n_features_comp, np.nan) + cm_pvalues = np.full(n_features_comp, np.nan) + + # for each object pair being compared, max_parts has the indexes of the + # partitions that maximimized the ARI + max_parts = np.zeros((n_features_comp, 2), dtype=np.uint64) + + with ( + ThreadPoolExecutor(max_workers=n_workers) as executor, + ProcessPoolExecutor(max_workers=n_workers) as pexecutor, + ): + map_func = map + if n_workers > 1: + if partitioning_executor == "thread": + map_func = executor.map + elif partitioning_executor == "process": + map_func = pexecutor.map + + # pre-compute the internal partitions for each object in parallel + + # first, create a list with features-k pairs that will be used to parallelize + # the partitioning step + inputs = get_chunks( + [ + (f_idx, c_idx, c) + for f_idx in range(n_features) + for c_idx, c in enumerate(range_n_clusters) + ], + n_workers, + n_chunks_threads_ratio, + ) + + # then, flatten the list of features-k pairs into a list that is divided into + # chunks that will be used to parallelize the partitioning step. + inputs = [ + [ + ( + feature_k_pair, + X[feature_k_pair[0]], + X_numerical_type[feature_k_pair[0]], + ) + for feature_k_pair in chunk + ] + for chunk in inputs + ] + + for params, ps in zip(inputs, map_func(get_feature_parts, inputs)): + # get the set of feature indexes and cluster indexes + f_idxs = [p[0][0] for p in params] + c_idxs = [p[0][1] for p in params] + + # update the partitions for each feature-k pair + parts[f_idxs, c_idxs] = ps + + # Below, there are two layers of parallelism: 1) parallel execution + # across feature pairs and 2) the cdist_parts_parallel function, which + # also runs several threads to compare partitions using ari. 
In 2) we + # need to disable parallelization in case len(cm_values) > 1 (that is, + # we have several feature pairs to compare), because parallelization is + # already performed at this level. Otherwise, more threads than + # specified by the user are started. + map_func = map + cdist_executor = False + inner_executor = DummyExecutor() + + if n_workers > 1: + if n_features_comp == 1: + map_func = map + cdist_executor = executor + inner_executor = pexecutor + + else: + map_func = pexecutor.map + + # iterate over all chunks of object pairs and compute the coefficient + inputs = get_chunks(n_features_comp, n_workers, n_chunks_threads_ratio) + inputs = [ + ( + i, + n_features, + parts, + pvalue_n_perms, + n_workers, + n_chunks_threads_ratio, + cdist_executor, + inner_executor, + ) + for i in inputs + ] + + for params, (max_ari_list, max_part_idx_list, pvalues) in zip( + inputs, map_func(compute_coef, inputs) + ): + f_idx = params[0] + + cm_values[f_idx] = max_ari_list + max_parts[f_idx, :] = max_part_idx_list + cm_pvalues[f_idx] = pvalues + + # return an array of values or a single scalar, depending on the input data + if cm_values.shape[0] == 1: + if return_parts: + if pvalue_n_perms is not None and pvalue_n_perms > 0: + return (cm_values[0], cm_pvalues[0]), max_parts[0], parts + else: + return cm_values[0], max_parts[0], parts + else: + if pvalue_n_perms is not None and pvalue_n_perms > 0: + return cm_values[0], cm_pvalues[0] + else: + return cm_values[0] + + if return_parts: + if pvalue_n_perms is not None and pvalue_n_perms > 0: + return (cm_values, cm_pvalues), max_parts, parts + else: + return cm_values, max_parts, parts + else: + if pvalue_n_perms is not None and pvalue_n_perms > 0: + return cm_values, cm_pvalues + else: + return cm_values diff --git a/libs/ccc/coef/impl_gpu_old.py b/libs/ccc/coef/impl_gpu_old.py new file mode 100644 index 00000000..ba14a75e --- /dev/null +++ b/libs/ccc/coef/impl_gpu_old.py @@ -0,0 +1,736 @@ +""" +This module contains the CUDA 
def get_perc_from_k(k: int) -> NDArray[np.float64]:
    """
    It returns the percentiles (from 0.0 to 1.0) that separate the data into k
    clusters. For example, if k=2, it returns [0.5]; if k=4, it returns [0.25,
    0.50, 0.75].

    This function has no side effects; in particular it does not touch
    NumPy's global print options.

    Args:
        k: number of clusters. If less than 2, the function returns an empty
            array.

    Returns:
        A float64 numpy array with k - 1 percentiles (from 0.0 to 1.0).
    """
    if k < 2:
        return np.array([], dtype="float64")
    # k - 1 evenly spaced cut points strictly inside (0, 1)
    return np.linspace(1 / k, 1 - 1 / k, k - 1, dtype="float64")
def get_feature_type_and_encode(feature_data: NDArray) -> tuple[NDArray, bool]:
    """
    Classify one feature as numerical or categorical and, when categorical,
    replace its values with integer codes.

    Args:
        feature_data: a 1d array with the data of one feature (any object
            exposing a numpy-style ``dtype``).

    Returns:
        A tuple with two elements:
          1. the input object itself when its dtype kind is float, signed
             integer or unsigned integer; otherwise the integer codes
             produced by ``np.unique(..., return_inverse=True)``.
          2. True if the feature is numerical, False otherwise.
    """
    numerical_kinds = {"f", "i", "u"}

    if feature_data.dtype.kind not in numerical_kinds:
        # the inverse index returned by np.unique assigns each distinct
        # category an integer code
        _, codes = np.unique(feature_data, return_inverse=True)
        return codes, False

    return feature_data, True
+ + Args: + n_items: a positive number representing the number of features that + will be clustered into different groups/clusters. + internal_n_clusters: it allows to force a different list of clusters. It + must be a list of integers. Repeated or invalid values will be dropped, + such as values lesser than 2 (a singleton partition is not allowed). + + Returns: + A numpy array with integer values representing numbers of clusters. + """ + + if internal_n_clusters: + # remove k values that are invalid + clusters_range_list = list( + set([int(x) for x in internal_n_clusters if 1 < x < n_items]) + ) + else: + # default behavior if no internal_n_clusters is given: return range from + # 2 to sqrt(n_items) + n_sqrt = int(np.round(np.sqrt(n_items))) + n_sqrt = min((n_sqrt, 10)) + clusters_range_list = list(range(2, n_sqrt + 1)) + + return np.array(clusters_range_list, dtype=np.uint16) + + +# # Todo: restore the original implementation +# @cuda.jit(device=True) +# def get_coords_from_index(n_obj: int, idx: int) -> tuple[int]: +# """ +# Given the number of objects and an index, it returns the row/column +# position of the pairwise matrix. For example, if there are n_obj objects +# (such as genes), a condensed 1d array can be created with pairwise +# comparisons between genes, as well as a squared symmetric matrix. This +# function receives the number of objects and the index of the condensed +# array, and returns the coordinates of the squared symmetric matrix. +# Args: +# n_obj: the number of objects. +# idx: the index of the condensed pairwise array across all n_obj objects. +# Returns +# A tuple (i, j) with the coordinates of the squared symmetric matrix +# equivalent to the condensed array. 
def get_parts(X: NDArray,
              range_n_clusters: NDArray[np.uint8],
              data_is_numerical: bool = True
              ) -> tuple[cp.ndarray, cp.ndarray]:
    """
    Compute the quantile-based partitions of each feature using CuPy.

    Parameters:
        X: Input data array of shape (n_features, n_objects), or a 1d array
            with the objects of a single feature.
        range_n_clusters: Array with the numbers of clusters (k values); one
            partition is generated per feature and k.
        data_is_numerical: If True, each feature is split at the quantiles
            given by get_range_n_percentages. If False, X is taken as an
            already-encoded partition and copied into the first partition
            slot of each feature.

    Returns:
        A tuple (d_parts, d_unique_elem_counts) of CuPy arrays:
          - d_parts: int16 array of shape (n_features, n_clusters, n_objects).
            Slots that were never filled hold -1; partitions that ended up
            with a single cluster are overwritten with -2.
          - d_unique_elem_counts: int16 array of shape
            (n_features, n_clusters) with the number of distinct labels of
            each partition (-1 where no partition was computed).
    """
    n_clusters = range_n_clusters.shape[0]
    if X.ndim == 1:
        # a single feature given as a flat vector
        n_features, n_objects = 1, X.shape[0]
    else:
        n_features, n_objects = X.shape[0], X.shape[1]

    # Initialise with the -1 "missing" sentinel. cp.empty(...) - 1 would
    # leave arbitrary values in every slot that is not written below.
    d_parts = cp.full((n_features, n_clusters, n_objects), -1, dtype=np.int16)
    d_unique_elem_counts = cp.full((n_features, n_clusters), -1, dtype=np.int16)

    if data_is_numerical:
        d_X = cp.asarray(X)

        # get_range_n_percentages pads rows of smaller k with NaN up to the
        # width needed by the largest k; keep only the real cut points and
        # upload them once per k instead of once per (feature, k).
        h_percentages = get_range_n_percentages(range_n_clusters)
        d_quantile_levels = [
            cp.asarray(row[~np.isnan(row)], dtype=float) for row in h_percentages
        ]

        for f_idx in range(n_features):
            objects = d_X[f_idx, :] if d_X.ndim == 2 else d_X
            for c_idx in range(n_clusters):
                bins = cp.quantile(objects, d_quantile_levels[c_idx])
                partition = cp.digitize(objects, bins, right=True)
                d_parts[f_idx, c_idx, :] = partition
                d_unique_elem_counts[f_idx, c_idx] = len(cp.unique(partition))

        # Mark singleton partitions (one distinct label) with -2, reusing the
        # counts gathered above instead of recomputing them.
        d_parts[d_unique_elem_counts == 1] = -2
    else:
        # If the data is categorical, the encoded feature is already the
        # partition: only the first slot is filled, the rest stay at -1.
        # Todo: fix this to handle categorical data
        d_parts[:, 0] = cp.asarray(X.astype(cp.int16))

    cp.cuda.runtime.deviceSynchronize()
    return d_parts, d_unique_elem_counts
def get_chunks(
    iterable: Union[int, Iterable], n_threads: int, ratio: float = 1
) -> Iterable[Iterable[int]]:
    """
    Divide the elements of an iterable into chunks sized for parallel
    processing.

    Args:
        iterable: the elements to split. An integer n is interpreted as
            np.arange(n).
        n_threads: number of threads available for parallelization.
        ratio: multiplier applied to n_threads to obtain the desired number
            of chunks (n_threads * ratio).

    Results:
        A list of chunks. For example, if iterable is [0, 1, 2, 3, 4, 5] and
        n_threads is 2, it will return [[0, 1, 2], [3, 4, 5]].
    """
    if isinstance(iterable, int):
        iterable = np.arange(iterable)

    total = len(iterable)
    target_n_chunks = n_threads * ratio
    chunk_size = int(np.ceil(total / target_n_chunks))

    chunks = list(chunker(iterable, chunk_size))

    # The first pass can produce fewer chunks than requested; keep halving
    # the first chunk that still has more than one element.
    while len(chunks) < target_n_chunks and target_n_chunks <= total:
        pos = next(k for k, chunk in enumerate(chunks) if len(chunk) != 1)
        halves = get_chunks(chunks[pos], 2)
        chunks[pos] = halves[0]
        chunks.insert(pos + 1, halves[1])

    return chunks
def ccc(
    x: NDArray,
    y: NDArray = None,
    internal_n_clusters: Union[int, Iterable[int]] = None,
    return_parts: bool = False,
    n_chunks_threads_ratio: int = 1,
    n_jobs: int = 1,
) -> tuple[NDArray[float], NDArray[np.uint64], NDArray[np.int16]]:
    """
    This is the main function that computes the Clustermatch Correlation
    Coefficient (CCC) between two arrays. Partitions are computed on the GPU
    with CuPy (get_parts); the pairwise comparisons run on CPU threads.

    Args:
        x: an 1d or 2d numerical array with the data. NaN are not supported.
            If it is 2d, then the coefficient is computed for each pair of rows
            (in case x is a numpy.array) or each pair of columns (pandas.DataFrame).
        y: an optional 1d numerical array. If x is 1d and y is given, it computes
            the coefficient between x and y.
        internal_n_clusters: this parameter can be an integer (the maximum number
            of clusters used to split x and y, starting from k=2) or a list of
            integer values (a custom list of k values).
        return_parts: if True, for each object pair, it returns the partitions
            that maximized the coefficient.
        n_chunks_threads_ratio: allows to modify how pairwise comparisons are
            split across different threads. It's given as the ratio parameter of
            function get_chunks.
        n_jobs: number of CPU threads to use for parallelization. The value
            None will use all available cores (`os.cpu_count()`), and negative
            values will use `os.cpu_count() + n_jobs`. Default is 1.

    Returns:
        If return_parts is False, only the CCC values (cm_values) are returned.
        If return_parts is True, it returns a tuple with three values:
        1) cm_values, 2) max_parts and 3) parts.

        cm_values: if x is 2d, a 1d condensed array of pairwise coefficients
            of size (n * (n - 1)) / 2, where n is the number of features. If
            only one pair was compared (for example, x and y were given as 1d
            arrays), a single scalar. Entries stay np.nan for pairs whose
            partitions are marked as singletons.

        max_parts: an array with (n * (n - 1)) / 2 rows (one for each object
            pair) and two columns with the indexes of each object's partition
            (in parts) that maximized the ARI. If cm_values[idx] is nan, then
            max_parts[idx] is meaningless. For a single pair, one row.

        parts: the 3d host array of partitions returned by get_parts:
            parts[i, j] is the j-th partition generated for object i. A value
            of -2 marks partitions with a single cluster (usually because
            all the input values are the same).

    Raises:
        ValueError: on an unsupported combination of x and y, on a
            non-numerical 2d numpy array, when the data has too few objects,
            or when n_jobs resolves to less than one thread.
    """
    n_objects = None
    n_features = None
    # boolean array of size n_features: True if the feature is numerical
    X_numerical_type = None
    if x.ndim == 1 and (y is not None and y.ndim == 1):
        # both x and y are 1d arrays
        if not x.shape == y.shape:
            raise ValueError("x and y need to be of the same size")
        n_objects = x.shape[0]
        n_features = 2
        # Create a matrix to store both x and y
        X = np.zeros((n_features, n_objects))
        X_numerical_type = np.full((n_features,), True, dtype=bool)

        X[0, :], X_numerical_type[0] = get_feature_type_and_encode(x)
        X[1, :], X_numerical_type[1] = get_feature_type_and_encode(y)
    elif x.ndim == 2 and y is None:
        # x is a 2d array: either a numpy array (features in rows, objects in
        # columns) or a pandas dataframe (features in columns, objects in rows)
        if isinstance(x, np.ndarray):
            # explicit check instead of assert so it survives `python -O`
            if not get_feature_type_and_encode(x[0, :])[1]:
                raise ValueError(
                    "If data is a 2d numpy array, it has to be numerical. Use pandas.DataFrame if "
                    "you need to mix features with different data types"
                )
            n_objects = x.shape[1]
            n_features = x.shape[0]

            X = x
            X_numerical_type = np.full((n_features,), True, dtype=bool)
        elif hasattr(x, "to_numpy"):
            # Anything exposing "to_numpy" is assumed to be a pandas.DataFrame;
            # isinstance is avoided to not depend on pandas just for that
            n_objects = x.shape[0]
            n_features = x.shape[1]

            X = np.zeros((n_features, n_objects))
            X_numerical_type = np.full((n_features,), True, dtype=bool)

            for f_idx in range(n_features):
                X[f_idx, :], X_numerical_type[f_idx] = get_feature_type_and_encode(
                    x.iloc[:, f_idx]
                )
        else:
            raise ValueError("Wrong combination of parameters x and y")
    else:
        raise ValueError("Wrong combination of parameters x and y")

    # Resolve the number of CPU threads from n_jobs
    n_cpu_cores = os.cpu_count() or 1
    if n_jobs is None:
        n_workers = n_cpu_cores
    elif n_jobs < 0:
        n_workers = n_cpu_cores + n_jobs
    else:
        n_workers = n_jobs
    if n_workers < 1:
        raise ValueError(
            f"The number of threads to use must be greater than 0. Got {n_workers}. "
            "Please check the n_jobs argument provided"
        )

    # 1. Partitions computation

    # Converts internal_n_clusters to a list of integers if it's provided.
    internal_n_clusters = convert_n_clusters(internal_n_clusters)

    # Get the k values used to partition each feature
    range_n_clusters = get_range_n_clusters(n_objects, internal_n_clusters)

    if range_n_clusters.shape[0] == 0:
        raise ValueError(f"Data has too few objects: {n_objects}")

    # cm_values stores the CCC coefficients
    n_features_comp = (n_features * (n_features - 1)) // 2
    cm_values = np.full(n_features_comp, np.nan)

    # for each object pair being compared, max_parts has the indexes of the
    # partitions that maximized the ARI
    max_parts = np.zeros((n_features_comp, 2), dtype=np.uint64)

    # Compute partitions for each feature using CuPy.
    # NOTE(review): X_numerical_type is not forwarded, so get_parts runs with
    # its default data_is_numerical=True and encoded categorical features are
    # split by quantiles like numerical ones.
    d_parts, _ = get_parts(X, range_n_clusters)

    # 2. CCC coefficient computation (CPU multi-threading baseline)
    parts = cp.asnumpy(d_parts)

    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        # There are two possible layers of parallelism: 1) across feature
        # pairs and 2) inside cdist_parts_parallel. Only one is enabled at a
        # time so that no more threads than requested are started: 2) is used
        # when there is a single pair, 1) otherwise.
        if n_features_comp == 1:
            map_func = map

            def cdist_func(x, y):
                return cdist_parts_parallel(x, y, executor)

        else:
            map_func = executor.map
            cdist_func = cdist_parts_basic

        def compute_coef(idx_list):
            """
            Given a list of condensed indexes, each representing a pair of
            objects/rows/genes, it computes the CCC coefficient for each of
            them. Used as the unit of work for parallel processing.

            Args:
                idx_list: a list of indexes (integers), each of them
                    representing a pair of objects.

            Returns:
                A tuple (max_ari_list, max_part_idx_list) with the same
                meaning as cm_values and max_parts, for this subset of pairs.
            """
            n_idxs = len(idx_list)
            max_ari_list = np.full(n_idxs, np.nan, dtype=float)
            max_part_idx_list = np.zeros((n_idxs, 2), dtype=np.uint64)

            for idx, data_idx in enumerate(idx_list):
                i, j = get_coords_from_index(n_features, data_idx)

                # get partitions for the pair of objects
                obji_parts, objj_parts = parts[i], parts[j]

                # skip pairs where a feature is marked as singleton (-2);
                # the coefficient stays np.nan
                if obji_parts[0, 0] == -2 or objj_parts[0, 0] == -2:
                    continue

                # compare all partitions of one object to all the partitions
                # of the other object, and get the maximum ARI
                comp_values = cdist_func(
                    obji_parts,
                    objj_parts,
                )
                max_flat_idx = comp_values.argmax()

                max_idx = unravel_index_2d(max_flat_idx, comp_values.shape)
                max_part_idx_list[idx] = max_idx
                # negative ARI values are clipped to zero
                max_ari_list[idx] = np.max((comp_values[max_idx], 0.0))

            return max_ari_list, max_part_idx_list

        # iterate over all chunks of object pairs and compute the coefficient
        inputs = get_chunks(n_features_comp, n_workers, n_chunks_threads_ratio)

        for idx, (max_ari_list, max_part_idx_list) in zip(
            inputs, map_func(compute_coef, inputs)
        ):
            cm_values[idx] = max_ari_list
            max_parts[idx, :] = max_part_idx_list

    # return an array of values or a single scalar, depending on the input data
    if cm_values.shape[0] == 1:
        if return_parts:
            return cm_values[0], max_parts[0], parts
        else:
            return cm_values[0]

    if return_parts:
        return cm_values, max_parts, parts
    else:
        return cm_values
parallelize get_parst +# 1.1 gpu percentile computation +# 1.1 gpu data points binning +# can be a kernel for-loop to compute parts on different percentile \ No newline at end of file diff --git a/libs/ccc/pyproject.toml b/libs/ccc/pyproject.toml new file mode 100644 index 00000000..e20fc448 --- /dev/null +++ b/libs/ccc/pyproject.toml @@ -0,0 +1,88 @@ +[build-system] +requires = ["scikit-build-core>=0.10", "pybind11"] +build-backend = "scikit_build_core.build" + + +[project] +name = "ccc-coef" +version = "0.0.1" +description="A minimal example package (with pybind11)" +readme = "README.md" +authors = [ + { name = "My Name", email = "me@email.com" }, +] +requires-python = ">=3.9" +classifiers = [ + "Development Status :: 4 - Beta", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] + +[project.optional-dependencies] +test = ["pytest"] + + +[tool.scikit-build] +wheel.expand-macos-universal-tags = true +minimum-version = "build-system.requires" + + +[tool.pytest.ini_options] +minversion = "8.0" +addopts = ["-ra", "--showlocals", "--strict-markers", "--strict-config"] +xfail_strict = true +log_cli_level = "INFO" +filterwarnings = [ + "error", + "ignore::pytest.PytestCacheWarning", +] +testpaths = ["tests"] + + +[tool.cibuildwheel] +build-frontend = "build[uv]" +test-command = "pytest {project}/tests" +test-extras = ["test"] + +[tool.cibuildwheel.pyodide] +build-frontend = {name = "build", args = ["--exports", "whole_archive"]} + +[tool.ruff.lint] +extend-select = [ + "B", # flake8-bugbear + "I", # isort + "ARG", # flake8-unused-arguments + "C4", # flake8-comprehensions + "EM", # flake8-errmsg + "ICN", # 
flake8-import-conventions + "G", # flake8-logging-format + "PGH", # pygrep-hooks + "PIE", # flake8-pie + "PL", # pylint + "PT", # flake8-pytest-style + "PTH", # flake8-use-pathlib + "RET", # flake8-return + "RUF", # Ruff-specific + "SIM", # flake8-simplify + "T20", # flake8-print + "UP", # pyupgrade + "YTT", # flake8-2020 + "EXE", # flake8-executable + "NPY", # NumPy specific rules + "PD", # pandas-vet +] +ignore = [ + "PLR09", # Too many X + "PLR2004", # Magic comparison +] +isort.required-imports = ["from __future__ import annotations"] + +[tool.ruff.lint.per-file-ignores] +"tests/**" = ["T20"] \ No newline at end of file diff --git a/libs/ccc/sklearn/metrics.py b/libs/ccc/sklearn/metrics.py index 387b31bb..e045f077 100644 --- a/libs/ccc/sklearn/metrics.py +++ b/libs/ccc/sklearn/metrics.py @@ -69,7 +69,7 @@ def get_contingency_matrix(part0: np.ndarray, part1: np.ndarray) -> np.ndarray: return cont_mat -@njit(cache=True, nogil=True) +# @njit(cache=True, nogil=True) def get_pair_confusion_matrix(part0: np.ndarray, part1: np.ndarray) -> np.ndarray: """ Returns the pair confusion matrix from two clustering partitions. 
It is an @@ -93,9 +93,15 @@ def get_pair_confusion_matrix(part0: np.ndarray, part1: np.ndarray) -> np.ndarra # Computation using the contingency data contingency = get_contingency_matrix(part0, part1) - n_c = np.ravel(contingency.sum(axis=1)) - n_k = np.ravel(contingency.sum(axis=0)) + # print(f"py contingency:\n {contingency}") + sum1 = contingency.sum(axis=1) + sum0 = contingency.sum(axis=0) + n_c = np.ravel(sum1) + # print(f"py sum_row: {n_c}") + n_k = np.ravel(sum0) + # print(f"py sum_col: {n_k}") sum_squares = (contingency**2).sum() + # print(f"py sum_squares: {sum_squares}") C = np.empty((2, 2), dtype=np.int64) C[1, 1] = sum_squares - n_samples C[0, 1] = contingency.dot(n_k).sum() - sum_squares diff --git a/libs/ccc/sklearn/metrics_gpu.py b/libs/ccc/sklearn/metrics_gpu.py new file mode 100644 index 00000000..c460ef32 --- /dev/null +++ b/libs/ccc/sklearn/metrics_gpu.py @@ -0,0 +1,347 @@ +import numpy as np +import cupy as cp +from numba import njit +from numba import cuda +import rmm + + +d_get_confusion_matrix_str = """ +/** + * @brief CUDA device function to compute the pair confusion matrix + * @param[in] contingency Pointer to the contingency matrix + * @param[in] sum_rows Pointer to the sum of rows in the contingency matrix + * @param[in] sum_cols Pointer to the sum of columns in the contingency matrix + * @param[in] n_objs Number of objects in each partition + * @param[in] k Number of clusters (assuming k is the max of clusters in part0 and part1) + * @param[out] C Pointer to the output pair confusion matrix (2x2) + */ +__device__ void get_pair_confusion_matrix( + const int* __restrict__ contingency, + int * sum_rows, + int * sum_cols, + const int n_objs, + const int k, + int* C +) { + // Initialize sum_rows and sum_cols + for (int i = threadIdx.x; i < k; i += blockDim.x) { + sum_rows[i] = 0; + sum_cols[i] = 0; + } + __syncthreads(); + + // Compute sum_rows and sum_cols + for (int i = threadIdx.x; i < k * k; i += blockDim.x) { + int row = i / k; + int 
col = i % k; + int val = contingency[i]; + atomicAdd(&sum_cols[col], val); + atomicAdd(&sum_rows[row], val); + } + __syncthreads(); + + // Compute sum_squares + int sum_squares; + if (threadIdx.x == 0) { + sum_squares = 0; + for (int i = 0; i < k * k; ++i) { + sum_squares += (contingency[i]) * contingency[i]; + } + } + __syncthreads(); + + // Compute C[1,1], C[0,1], C[1,0], and C[0,0] + if (threadIdx.x == 0) { + C[3] = sum_squares - n_objs; // C[1,1] + + int temp = 0; + for (int i = 0; i < k; ++i) { + for (int j = 0; j < k; ++j) { + temp += (contingency[i * k + j]) * sum_cols[j]; + } + } + C[1] = temp - sum_squares; // C[0,1] + + temp = 0; + for (int i = 0; i < k; ++i) { + for (int j = 0; j < k; ++j) { + temp += (contingency[j * k + i]) * sum_rows[j]; + } + } + C[2] = temp - sum_squares; // C[1,0] + + C[0] = n_objs * n_objs - C[1] - C[2] - sum_squares; // C[0,0] + + + // compute ARI + int tn = static_cast(C[0]); + int fp = static_cast(C[1]); + int fn = static_cast(C[2]); + int tp = static_cast(C[3]); + float ari = 0.0; + if (fn == 0 && fp ==0) { + ari = 1.0; + } else { + ari = 2.0 * (tp * tn - fn * fp) / ((tp + fn) * (fn + tn) + (tp + fp) * (fp + tn)); + } + } + __syncthreads(); +} + +""" + +d_get_contingency_matrix_str = """ +/** + * @brief Compute the contingency matrix for two partitions using shared memory + * @param[in] part0 Pointer to the first partition array + * @param[in] part1 Pointer to the second partition array + * @param[in] n Number of elements in each partition array + * @param[out] shared_cont_mat Pointer to shared memory for storing the contingency matrix + * @param[in] k Maximum number of clusters (size of contingency matrix is k x k) + */ +__device__ void get_contingency_matrix(int* part0, int* part1, int n, int* shared_cont_mat, int k) { + int tid = threadIdx.x; + int bid = blockIdx.x; + int num_threads = blockDim.x; + int num_blocks = gridDim.x; + + // Initialize shared memory + for (int i = tid; i < k * k; i += num_threads) { + 
shared_cont_mat[i] = 0; + } + __syncthreads(); + + // Process elements + for (int i = tid; i < n; i += num_threads) { + int row = part0[i]; + int col = part1[i]; + + if (row < k && col < k) { + atomicAdd(&shared_cont_mat[row * k + col], 1); + } + } + __syncthreads(); +} + +""" + +d_unravel_index_str = """ +/** + * @brief Unravel a flat index to the corresponding 2D indicis + * @param[in] flat_idx The flat index to unravel + * @param[in] num_cols Number of columns in the 2D array + * @param[out] row Pointer to the row index + * @param[out] col Pointer to the column index + */ +extern "C" __device__ __host__ inline void unravel_index(int flat_idx, int num_cols, int* row, int* col) { + *row = flat_idx / num_cols; // Compute row index + *col = flat_idx % num_cols; // Compute column index +} + +""" + +d_get_coords_from_index_str = """ +#include +extern "C" __device__ __host__ inline void get_coords_from_index(int n_obj, int idx, int* x, int* y) { + // Calculate 'b' based on the input n_obj + int b = 1 - 2 * n_obj; + // Calculate 'x' using the quadratic formula part + float discriminant = b * b - 8 * idx; + float x_float = floor((-b - sqrt(discriminant)) / 2); + // Assign the integer part of 'x' + *x = static_cast(x_float); + // Calculate 'y' based on 'x' and the index + *y = static_cast(idx + (*x) * (b + (*x) + 2) / 2 + 1); +} + +""" + +k_ari_str = """ +/** + * @brief Main ARI kernel. 
Now only compare a pair of ARIs + * @param n_parts Number of partitions of each feature + * @param n_objs Number of objects in each partitions + * @param n_part_mat_elems Number of elements in the square partition matrix + * @param n_elems_per_feat Number of elements for each feature, i.e., part[i].x * part[i].y + * @param parts 3D Array of partitions with shape of (n_features, n_parts, n_objs) + * @param n_aris Number of ARIs to compute + * @param k The max value of cluster number + 1 + * @param out Output array of ARIs + * @param part_pairs Output array of part pairs to be compared by ARI + */ +extern "C" __global__ void ari(int *parts, + const int n_aris, + const int n_features, + const int n_parts, + const int n_objs, + const int n_elems_per_feat, + const int n_part_mat_elems, + const int k, + float *out, + int *part_pairs = nullptr) +{ + /* + * Step 1: Each thead, unravel flat indices and load the corresponding data into shared memory + */ + int global_tid = blockIdx.x * blockDim.x + threadIdx.x; + // each block is responsible for one ARI computation + int ari_block_idx = blockIdx.x; + + // print parts for debugging + + + // obtain the corresponding parts and unique counts + int feature_comp_flat_idx = ari_block_idx / n_part_mat_elems; // flat comparison pair index for two features + int part_pair_flat_idx = ari_block_idx % n_part_mat_elems; // flat comparison pair index for two partitions of one feature pair + int i, j; + + // unravel the feature indices + get_coords_from_index(n_features, feature_comp_flat_idx, &i, &j); + assert(i < n_features && j < n_features); + assert(i >= 0 && j >= 0); + + // unravel the partition indices + int m, n; + unravel_index(part_pair_flat_idx, n_parts, &m, &n); + // if (global_tid == 0) + + // Make pointers to select the parts and unique counts for the feature pair + // Todo: Use int4*? 
+ int *t_data_part0 = parts + i * n_elems_per_feat + m * n_objs; // t_ for thread + int *t_data_part1 = parts + j * n_elems_per_feat + n * n_objs; + + // Load gmem data into smem by using different threads + extern __shared__ int shared_mem[]; + int *s_part0 = shared_mem; + int *s_part1 = shared_mem + n_objs; + + // Loop over the data using the block-stride pattern + for (int i = threadIdx.x; i < n_objs; i += blockDim.x) + { + s_part0[i] = t_data_part0[i]; + s_part1[i] = t_data_part1[i]; + } + __syncthreads(); + + // Copy data to global memory if part_pairs is specified + if (part_pairs != nullptr) + { + int *out_part0 = part_pairs + ari_block_idx * (2 * n_objs); + int *out_part1 = out_part0 + n_objs; + + for (int i = threadIdx.x; i < n_objs; i += blockDim.x) + { + out_part0[i] = s_part0[i]; + out_part1[i] = s_part1[i]; + } + } + + /* + * Step 2: Compute contingency matrix within the block + */ + // shared mem address for the contingency matrix + int *s_contingency = shared_mem + 2 * n_objs; + get_contingency_matrix(t_data_part0, t_data_part1, n_objs, s_contingency, k); + + /* + * Step 3: Construct pair confusion matrix + */ + // shared mem address for the pair confusion matrix + int *s_sum_rows = s_contingency + k * k; + int *s_sum_cols = s_sum_rows + k; + int *s_pair_confusion_matrix = s_sum_cols + k; + get_pair_confusion_matrix(s_contingency, s_sum_rows, s_sum_cols, n_objs, k, s_pair_confusion_matrix); + /* + * Step 4: Compute ARI and write to global memory + */ + if (threadIdx.x == 0) { + int tn = static_cast(s_pair_confusion_matrix[0]); + int fp = static_cast(s_pair_confusion_matrix[1]); + int fn = static_cast(s_pair_confusion_matrix[2]); + int tp = static_cast(s_pair_confusion_matrix[3]); + float ari = 0.0; + if (fn == 0 && fp == 0) { + ari = 1.0; + } else { + ari = 2.0 * (tp * tn - fn * fp) / ((tp + fn) * (fn + tn) + (tp + fp) * (fp + tn)); + } + out[ari_block_idx] = ari; + } + __syncthreads(); + +} + +""" + + +def get_kernel(): + """ + Kernel to compute the 
air between two partitions indexed from the 3D input array parts. + + The first thread of each logical part vs part ari matrix is responsible to reduce the matrix to the max ari. + See the document for illustrations. + + raw kernel args: + parts: 3D device array with cluster assignments for x features, y partitions, and z objects. + uniqs: 2D device array with the number of unique elements for feature x and partition y. + n_aris: Number of ARI computations to perform. + n_parts: Number of partitions of a feature, i.e., len(n_range_clusters) to compare. + out: Pointer to the pre-allocated 1D device output array with length of number of features to compare. + """ + + cuda_code = d_get_coords_from_index_str + k_ari_str + + kernel = cp.RawKernel(code=cuda_code, backend="nvcc").get_function("ari") + return kernel + + +def ari_dim2(feature_parts: cp.ndarray, + n_partitions: int, + n_features_comp: int, + out: cp.ndarray, + unique_element_counts: cp.ndarray): + """ + Function to compute the ARI between partitions on the GPU. This function is responsible for launching the kernel + in different streams for each pair of partitions. + + Args: + feature_parts: 3D device array with cluster assignments for x features, y partitions, and z objects. + Example initialization for this array: d_parts = cp.empty((nx, ny, nz), dtype=np.int16) - 1 + + n_partitions: Number of partitions of a feature to compare. + + n_features_comp: Pre-computed number of features to compare. + + out: Pointer to the pre-allocated 1D device output array with length of n_features_comp. + + unique_element_counts: 2D device array with the number of unique elements for feature x and partition y. + """ + + # Can use non-blocking CPU scheduling or CUDA dynamic parallelism to launch the kernel for each pair of partitions. + + # Get metadata + n_features, n_parts, n_objs = feature_parts.shape + + # Each kernel launch will be responsible for computing the ARI between two partitions. 
+ n_part_mat_elems = n_partitions * n_partitions + # Each thread + n_ari_pairs = n_partitions * n_part_mat_elems + cm_values = cp.full(n_features_comp, cp.nan) + # Todo: how many ari pairs? n_range_cluster? + threads_per_block = 1 + blocks_per_grid = (n_ari_pairs + threads_per_block - 1) // threads_per_block + + ari_kernel = get_kernel() + # Todo: use different streams? + # Allocate output arrays for parts (debugging) + out_parts0 = cp.empty(n_objs, dtype=np.int32) + out_parts1 = cp.empty(n_objs, dtype=np.int32) + shared_mem_size = 2 * n_objs + + # Launch the kernel, using one block per ARI + ari_kernel(grid=(blocks_per_grid,), + block=(threads_per_block,), + shared_mem=shared_mem_size, + args=(feature_parts, unique_element_counts, n_features_comp, n_part_mat_elems, out)) + + raise NotImplementedError("Not implemented yet") diff --git a/libs/ccc_cuda_ext/CMakeLists.txt b/libs/ccc_cuda_ext/CMakeLists.txt new file mode 100644 index 00000000..babb604c --- /dev/null +++ b/libs/ccc_cuda_ext/CMakeLists.txt @@ -0,0 +1,48 @@ +# CMakeLists.txt + +cmake_minimum_required(VERSION 3.18) +project(CudaAriProject LANGUAGES CUDA CXX) + +# Set Python Standard +# Get Python version dynamically +execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}')" + OUTPUT_VARIABLE PYTHON_VERSION_FULL + OUTPUT_STRIP_TRAILING_WHITESPACE +) +# Set the paths using the detected version +set(Python_EXECUTABLE $ENV{CONDA_PREFIX}/bin/python) +set(PYTHON_INCLUDE_DIR $ENV{CONDA_PREFIX}/include/python${PYTHON_VERSION_FULL}) +set(PYTHON_LIBRARY $ENV{CONDA_PREFIX}/lib/libpython${PYTHON_VERSION_FULL}.so) + +# Add gtest as a dependency +include(FetchContent) +FetchContent_Declare( + googletest + URL https://github.com/google/googletest/archive/5ed21863955149a5a877a53d7d5045b6919090ed.zip +) +include(GoogleTest) + +# Set the C++ standard +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) 
+set(PYBIND11_NEWPYTHON ON) +find_package(Python REQUIRED Development) +find_package(pybind11 CONFIG REQUIRED) + +# # Add the CUDA library +# add_library(cudaAriLib STATIC metrics.cu) # Add the CUDA source file +# set_target_properties(cudaAriLib PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + +# For Windows: Prevent overriding the parent project's compiler/linker settings +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(googletest) + +# Testing +enable_testing() + +add_executable(testCudaAri tests/test_kernel.cpp metrics.cu) +target_link_libraries(testCudaAri PUBLIC GTest::gtest_main GTest::gtest pybind11::embed Python::Python) +gtest_discover_tests(testCudaAri) + +# pybind11_add_module(testCudaAri tests/test_kernel.cpp) diff --git a/libs/ccc_cuda_ext/Readme.md b/libs/ccc_cuda_ext/Readme.md new file mode 100644 index 00000000..776c8d52 --- /dev/null +++ b/libs/ccc_cuda_ext/Readme.md @@ -0,0 +1,26 @@ +## How to build the CUDA module and its tests + +``` +# cd to libs/ccc_cuda_ext +cmake -S . -B build +cmake --build build +ctest --test-dir build --output-on-failure +``` + +## How to build and install this CUDA module +``` +conda activate ccc-rapid +pip install . + +# This will build the c++ module and install it in the current environment +``` + +## How to run C++ tests in tests/cuda_ext +The CMakeLists.txt file in the root directory will pick up the tests in tests/cuda_ext and build them. + +``` +for test in build/test_ari{,_py,_random}; do + echo "Running $test..." 
+ ./$test +done +``` \ No newline at end of file diff --git a/libs/ccc_cuda_ext/binder.cu b/libs/ccc_cuda_ext/binder.cu new file mode 100644 index 00000000..048aa69a --- /dev/null +++ b/libs/ccc_cuda_ext/binder.cu @@ -0,0 +1,15 @@ + +#include +#include + +#include "metrics.cuh" + +namespace py = pybind11; + +using namespace pybind11::literals; + +PYBIND11_MODULE(ccc_cuda_ext, m) { + m.doc() = "CUDA extension module for CCC"; + m.def("ari_int32", &ari, "CUDA version of Adjusted Rand Index (ARI) calculation", + "parts"_a, "n_features"_a, "n_parts"_a, "n_objs"_a); +} diff --git a/libs/ccc_cuda_ext/cub.cu b/libs/ccc_cuda_ext/cub.cu new file mode 100644 index 00000000..da19f3df --- /dev/null +++ b/libs/ccc_cuda_ext/cub.cu @@ -0,0 +1,79 @@ +#include +#include + +// Define block and chunk sizes +const int BLOCK_SIZE = 256; +const int ITEMS_PER_THREAD = 4; +// Size of the shared memory buffer (chunk size) +const int SHARED_MEMORY_SIZE = BLOCK_SIZE * ITEMS_PER_THREAD; + +template +__global__ void streamProcessingKernel( + const T* input, + T* output, + const int totalElements +) { + // Shared memory buffer for the current chunk + __shared__ T sharedBuffer[SHARED_MEMORY_SIZE]; + + // Thread-local storage for loading elements + T threadData[ITEMS_PER_THREAD]; + + // Only one block should execute this kernel + if (blockIdx.x > 0) return; + + // Calculate number of chunks needed + const int numChunks = (totalElements + SHARED_MEMORY_SIZE - 1) / SHARED_MEMORY_SIZE; + + // Temporary storage for CUB operations + __shared__ typename cub::BlockLoad::TempStorage loadTemp; + + // Process data chunk by chunk + for (int chunk = 0; chunk < numChunks; chunk++) { + // Calculate offset and valid items for this chunk + const int chunkOffset = chunk * SHARED_MEMORY_SIZE; + const int validItems = min(SHARED_MEMORY_SIZE, totalElements - chunkOffset); + + // Load chunk from global memory + cub::BlockLoad(loadTemp).Load( + input + chunkOffset, + threadData, + validItems, + (T)0 // Default value 
for out-of-bounds items + ); + + // Process thread-local data (example: multiply by 2) + #pragma unroll + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + threadData[i] *= 2; + } + + // Store processed data to shared memory + int threadOffset = threadIdx.x * ITEMS_PER_THREAD; + #pragma unroll + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + if (threadOffset + i < validItems) { + sharedBuffer[threadOffset + i] = threadData[i]; + } + } + + __syncthreads(); + + // Additional processing on shared memory data if needed + // For example, you could do a reduction or other block-wide operations here + + // Store results back to global memory + for (int i = threadIdx.x; i < validItems; i += BLOCK_SIZE) { + output[chunkOffset + i] = sharedBuffer[i]; + } + + __syncthreads(); // Ensure all threads are done before loading next chunk + } +} + +// Host function to launch the kernel +template +void processLargeDataInOneBlock(const T* input, T* output, int totalElements) { + // Launch single block + streamProcessingKernel<<<1, BLOCK_SIZE>>>(input, output, totalElements); +} \ No newline at end of file diff --git a/libs/ccc_cuda_ext/metrics.cu b/libs/ccc_cuda_ext/metrics.cu new file mode 100644 index 00000000..f9ed7c61 --- /dev/null +++ b/libs/ccc_cuda_ext/metrics.cu @@ -0,0 +1,575 @@ +#include +#include + +#include +#include +#include +#include +#include + + +#include +#include +#include +#include "metrics.cuh" + +namespace py = pybind11; + +/** + * Future optimizations + * 1. use narrower data types + * 2. optimized on locality + * 3. 
use warp-level reduction + */ + + +// Todo: Add CudaCheckError +#define gpuErrorCheck(ans, abort) \ +{ \ + gpuAssert((ans), __FILE__, __LINE__, abort); \ +} +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) +{ + if (code != cudaSuccess) + { + fprintf(stderr, "assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) + { + exit(code); + } + } +} +// // call like this +// gpuErrorCheck(cudaMalloc(...)); // if fails, print message and continue +// gpuErrorCheck(cudaMalloc(...), true); // if fails, print message and abort + + +bool check_shared_memory_size(const size_t s_mem_size) +{ + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, 0); + const auto max_shared_mem = prop.sharedMemPerBlock; + return s_mem_size <= max_shared_mem; +} + + +/** + * @brief Unravel a flat index to the corresponding 2D indicis + * @param[in] flat_idx The flat index to unravel + * @param[in] num_cols Number of columns in the 2D array + * @param[out] row Pointer to the row index + * @param[out] col Pointer to the column index + */ +__device__ __host__ inline void unravel_index(int flat_idx, int num_cols, int *row, int *col) +{ + // change int to uint32_t + *row = flat_idx / num_cols; // Compute row index + *col = flat_idx % num_cols; // Compute column index +} + +/** + * @brief Given the number of objects and an index, this function calculates + * the coordinates in a symmetric matrix from a flat index. + * For example, if there are n_obj objects (such as genes), a condensed + * 1D array can be created with pairwise comparisons between these + * objects, which corresponds to a symmetric 2D matrix. This function + * calculates the 2D coordinates (x, y) in the symmetric matrix that + * corresponds to the given flat index. + * + * @param[in] n_obj The total number of objects (i.e., the size of one dimension + * of the square symmetric matrix). + * @param[in] idx The flat index from the condensed pairwise array. 
+ * @param[out] x Pointer to the calculated row coordinate in the symmetric matrix. + * @param[out] y Pointer to the calculated column coordinate in the symmetric matrix. + */ +__device__ __host__ inline void get_coords_from_index(int n_obj, int idx, int *x, int *y) +{ + // Calculate 'b' based on the input n_obj + int b = 1 - 2 * n_obj; + // Calculate 'x' using the quadratic formula part + float discriminant = b * b - 8 * idx; + float x_float = floor((-b - sqrt(discriminant)) / 2); + // Assign the integer part of 'x' + *x = static_cast(x_float); + // Calculate 'y' based on 'x' and the index + *y = static_cast(idx + (*x) * (b + (*x) + 2) / 2 + 1); +} + +/** + * @brief Compute the contingency matrix for two partitions using shared memory + * @param[in] part0 Pointer to the first partition array, global memory + * @param[in] part1 Pointer to the second partition array, global memory + * @param[in] n Number of elements in each partition array + * @param[out] shared_cont_mat Pointer to shared memory for storing the contingency matrix + * @param[in] k Maximum number of clusters (size of contingency matrix is k x k) + */ +__device__ void get_contingency_matrix(int *part0, int *part1, int n, int *shared_cont_mat, int k) +{ + int tid = threadIdx.x; + int num_threads = blockDim.x; + int size = k * k; + + // Initialize shared memory + for (int i = tid; i < size; i += num_threads) + { + shared_cont_mat[i] = 0; + } + __syncthreads(); + + // Process elements with bounds checking + for (int i = tid; i < n; i += num_threads) + { + int row = part0[i]; + int col = part1[i]; + + // Add bounds checking + if (row >= 0 && row < k && col >= 0 && col < k) + { + atomicAdd(&shared_cont_mat[row * k + col], 1); + } + } + __syncthreads(); +} + + +/** + * @brief Compute the contingency matrix for two partitions using shared memory, by loading global memory data in batch + * to process large input, i.e., when the input size is larger than the shared memory size + * @param[in] part0 Pointer to 
the first partition array int the global memory + * @param[in] part1 Pointer to the second partition array in the global memory + * @param[in] nSamples Number of elements in each partition array + * @param[in] k Maximum number of clusters (size of contingency matrix is k x k) + * @param[out] shared_cont_mat Pointer to shared memory for storing the contingency matrix + */ +// Todo: Add template for kernel configuration +template +__device__ void get_contingency_matrix_batch(const T* part0, const T* part1, const int n_objs, const int k, T* shared_cont_mat) +{ + // Define block and chunk sizes + const int BLOCK_SIZE = 256; + const int ITEMS_PER_THREAD = 4; + // Size of the shared memory buffer (chunk size) + const int SHARED_MEMORY_SIZE = 2 * BLOCK_SIZE * ITEMS_PER_THREAD; + + int tid = threadIdx.x; + int num_threads = blockDim.x; + const auto cont_mat_size = k * k; + + // Shared memory buffer for the current chunk + __shared__ T sharedBuffer[SHARED_MEMORY_SIZE]; + // Thread-local storage for loading elements + T threadData_part0[ITEMS_PER_THREAD]; + T threadData_part1[ITEMS_PER_THREAD]; + + // Calculate number of chunks needed + const int numChunks = (n_objs + SHARED_MEMORY_SIZE - 1) / SHARED_MEMORY_SIZE; + // Temporary storage for CUB operations + // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + using BlockLoad = cub::BlockLoad; + // Allocate shared memory for BlockLoad + __shared__ typename BlockLoad::TempStorage temp_storage_part0; + __shared__ typename BlockLoad::TempStorage temp_storage_part1; + + // Initialize shared memory for the contingency matrix + for (int i = tid; i < cont_mat_size; i += num_threads) + { + shared_cont_mat[i] = 0; + } + __syncthreads(); + + // Process data chunk by chunk + for (int chunk = 0; chunk < numChunks; chunk++) { + // Calculate offset and valid items for this chunk + const int chunkOffset = chunk * SHARED_MEMORY_SIZE; + const int validItems = min(SHARED_MEMORY_SIZE, n_objs - chunkOffset); + + // 
Load chunk from global memory + cub::BlockLoad(temp_storage_part0).Load( + part0 + chunkOffset, + threadData_part0, + validItems, + (T)0 // Default value for out-of-bounds items + ); + + cub::BlockLoad(temp_storage_part1).Load( + part1 + chunkOffset, + threadData_part1, + validItems, + (T)0 // Default value for out-of-bounds items + ); + + // Process thread-local data + #pragma unroll + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + // threadData[i] *= 2; + const T p0_label = part0[i]; + const T p1_label = part1[i]; + // Add bounds checking + if (p0_label >= 0 && p0_label < k && p1_label >= 0 && p1_label < k) + { + atomicAdd(&shared_cont_mat[p0_label * k + p1_label], 1); + } + } + + // // Store processed data to shared memory + // int threadOffset = threadIdx.x * ITEMS_PER_THREAD; + // #pragma unroll + // for (int i = 0; i < ITEMS_PER_THREAD; i++) { + // if (threadOffset + i < validItems) { + // sharedBuffer[threadOffset + i] = threadData[i]; + // } + // } + + // __syncthreads(); + + // // Additional processing on shared memory data if needed + // // For example, you could do a reduction or other block-wide operations here + + // // Store results back to global memory + // for (int i = threadIdx.x; i < validItems; i += BLOCK_SIZE) { + // output[chunkOffset + i] = sharedBuffer[i]; + // } + + // __syncthreads(); // Ensure all threads are done before loading next chunk + } + + // Process elements with bounds checking + // for (int i = tid; i < n_samples; i += num_threads) + // { + // int row = part0[i]; + // int col = part1[i]; + + // // Add bounds checking + // if (row >= 0 && row < k && col >= 0 && col < k) + // { + // atomicAdd(&shared_cont_mat[row * k + col], 1); + // } + // } + // __syncthreads(); +} + +/** + * @brief CUDA device function to compute the pair confusion matrix + * @param[in] contingency Pointer to the contingency matrix + * @param[in] sum_rows Pointer to the sum of rows in the contingency matrix + * @param[in] sum_cols Pointer to the sum of columns 
in the contingency matrix + * @param[in] n_objs Number of objects in each partition + * @param[in] k Number of clusters (assuming k is the max of clusters in part0 and part1) + * @param[out] C Pointer to the output pair confusion matrix (2x2) + */ +__device__ void get_pair_confusion_matrix( + const int *__restrict__ contingency, + int *sum_rows, + int *sum_cols, + const int n_objs, + const int k, + int *C) +{ + // Initialize sum_rows and sum_cols + for (int i = threadIdx.x; i < k; i += blockDim.x) + { + sum_rows[i] = 0; + sum_cols[i] = 0; + } + __syncthreads(); + + // Compute sum_rows and sum_cols + for (int i = threadIdx.x; i < k * k; i += blockDim.x) + { + int row = i / k; + int col = i % k; + int val = contingency[i]; + atomicAdd(&sum_cols[col], val); + atomicAdd(&sum_rows[row], val); + } + __syncthreads(); + + // Compute sum_squares + int sum_squares; + if (threadIdx.x == 0) + { + sum_squares = 0; + for (int i = 0; i < k * k; ++i) + { + sum_squares += (contingency[i]) * contingency[i]; + } + } + __syncthreads(); + // printf("sum_squares: %d\n", sum_squares); + + // Compute C[1,1], C[0,1], C[1,0], and C[0,0] + if (threadIdx.x == 0) + { + C[3] = sum_squares - n_objs; // C[1,1] + + int temp = 0; + for (int i = 0; i < k; ++i) + { + for (int j = 0; j < k; ++j) + { + temp += (contingency[i * k + j]) * sum_cols[j]; + } + } + C[1] = temp - sum_squares; // C[0,1] + + temp = 0; + for (int i = 0; i < k; ++i) + { + for (int j = 0; j < k; ++j) + { + temp += (contingency[j * k + i]) * sum_rows[j]; + } + } + C[2] = temp - sum_squares; // C[1,0] + + C[0] = n_objs * n_objs - C[1] - C[2] - sum_squares; // C[0,0] + } +} + +/** + * @brief Main ARI kernel. 
Now only compare a pair of ARIs
 *
 * One thread block computes one ARI: blockIdx.x is the flat index of a
 * (feature pair, partition pair) combination and also the index written in out.
 * The launch must provide (k * k + 2 * k + 4) * sizeof(int) bytes of dynamic
 * shared memory: contingency matrix, row sums, column sums and the 2x2 pair
 * confusion matrix, in that order.
 *
 * @param parts 3D Array of partitions with shape of (n_features, n_parts, n_objs)
 * @param n_aris Number of ARIs to compute; blocks with blockIdx.x >= n_aris do nothing
 * @param n_features Number of features
 * @param n_parts Number of partitions of each feature
 * @param n_objs Number of objects in each partition
 * @param n_elems_per_feat Number of elements for each feature, i.e., n_parts * n_objs
 * @param n_part_mat_elems Number of elements in the square partition matrix, i.e., n_parts * n_parts
 * @param k The max value of cluster number + 1
 * @param out Output array of ARIs (n_aris elements)
 */
extern "C"
__global__ void ari(int *parts,
                    const int n_aris,
                    const int n_features,
                    const int n_parts,
                    const int n_objs,
                    const int n_elems_per_feat,
                    const int n_part_mat_elems,
                    const int k,
                    float *out
                    )
{
    // Each block is responsible for one ARI computation. The whole block leaves
    // together, so no barrier below can be reached by only part of a block.
    const int ari_block_idx = blockIdx.x;
    if (ari_block_idx >= n_aris)
    {
        return;
    }

    /*
     * Step 0: Compute shared memory addresses
     */
    extern __shared__ int shared_mem[];
    int *s_contingency = shared_mem;               // k * k elements
    int *s_sum_rows = s_contingency + (k * k);     // k elements
    int *s_sum_cols = s_sum_rows + k;              // k elements
    int *s_pair_confusion_matrix = s_sum_cols + k; // 4 elements

    /*
     * Step 1: Each thread unravels the flat block index into the feature pair
     * (i, j) and the partition pair (m, n) to compare
     */
    const int feature_comp_flat_idx = ari_block_idx / n_part_mat_elems; // flat comparison pair index for two features
    const int part_pair_flat_idx = ari_block_idx % n_part_mat_elems;    // flat comparison pair index for two partitions of one feature pair
    int i, j;
    get_coords_from_index(n_features, feature_comp_flat_idx, &i, &j);
    assert(i < n_features && j < n_features);
    assert(i >= 0 && j >= 0);
    int m, n;
    unravel_index(part_pair_flat_idx, n_parts, &m, &n);

    // The two partitions are read directly from global memory (they are not
    // staged in shared memory). Offsets are widened to size_t so that large
    // inputs do not overflow int arithmetic.
    int *t_data_part0 = parts + static_cast<size_t>(i) * n_elems_per_feat + static_cast<size_t>(m) * n_objs;
    int *t_data_part1 = parts + static_cast<size_t>(j) * n_elems_per_feat + static_cast<size_t>(n) * n_objs;

    /*
     * Step 2: Compute contingency matrix within the block
     */
    get_contingency_matrix(t_data_part0, t_data_part1, n_objs, s_contingency, k);

    /*
     * Step 3: Construct pair confusion matrix
     */
    get_pair_confusion_matrix(s_contingency, s_sum_rows, s_sum_cols, n_objs, k, s_pair_confusion_matrix);

    /*
     * Step 4: Compute ARI and write to global memory
     */
    if (threadIdx.x == 0)
    {
        // Promote to double before multiplying: the products of pair counts
        // overflow 32-bit int long before the counts themselves do.
        const double tn = s_pair_confusion_matrix[0];
        const double fp = s_pair_confusion_matrix[1];
        const double fn = s_pair_confusion_matrix[2];
        const double tp = s_pair_confusion_matrix[3];
        float ari_value = 0.0f;
        if (fn == 0 && fp == 0)
        {
            // Identical clusterings
            ari_value = 1.0f;
        }
        else
        {
            ari_value = static_cast<float>(
                2.0 * (tp * tn - fn * fp) / ((tp + fn) * (fn + tn) + (tp + fp) * (fp + tn)));
        }
        out[ari_block_idx] = ari_value;
    }
    __syncthreads();
}

/**
 * @brief Internal lower-level ARI computation
 * @param parts pointer to
the 3D Array of partitions with shape of (n_features, n_parts, n_objs)
 * @param n_features Number of features (first dimension of parts)
 * @param n_parts Number of partitions per feature (second dimension of parts)
 * @param n_objs Number of objects per partition (third dimension of parts)
 * @throws std::invalid_argument if "parts" is invalid
 * @throws std::runtime_error if the required shared memory exceeds device limits
 *         or the kernel launch fails
 * @return std::vector ARI values for each pair of partitions; empty when there is
 *         only one feature (no feature pair to compare)
 */
template <typename T>
auto ari_core(const T* parts,
              const size_t n_features,
              const size_t n_parts,
              const size_t n_objs) -> std::vector<float> {
    /*
     * Notes for future bug fixing and optimization
     */
    // 1. GPU memory is not enough to store the partitions -> split the partitions into smaller chunks and do stream processing

    // Input validation
    if (!parts || n_features == 0 || n_parts == 0 || n_objs == 0) {
        throw std::invalid_argument("Invalid input parameters");
    }

    /*
     * Pre-computation
     */
    // Todo: dynamically query types
    using parts_dtype = T;
    using out_dtype = float;
    // One ARI per (feature pair, partition pair) combination
    const size_t n_feature_comp = n_features * (n_features - 1) / 2;
    const size_t n_aris = n_feature_comp * n_parts * n_parts;
    // A single feature yields no pair to compare; return early because a grid
    // of zero blocks is not a valid launch configuration.
    if (n_aris == 0) {
        return {};
    }

    /*
     * Memory Allocation
     */
    // Device buffers; the input partitions are copied to the device here
    thrust::device_vector<parts_dtype> d_parts(parts, parts + n_features * n_parts * n_objs);
    thrust::device_vector<out_dtype> d_out(n_aris);

    // Set up CUDA kernel configuration
    const int block_size = 256; // Todo: query device for max threads per block, older devices only support 512 threads per 1D block
    // Each block is responsible for one ARI computation
    const size_t grid_size = n_aris;

    // Compute k, the maximum value in d_parts + 1, used for shared memory allocation
    const auto k = thrust::reduce(d_parts.begin(), d_parts.end(), -1, thrust::maximum<parts_dtype>()) + 1;

    // Dynamic shared memory must mirror the layout the kernel carves out of
    // shared_mem[] (an int array): k * k + k + k + 4 ints.
    // FIXME: Partition pair size should be fixed. Stream processing should be used for large input
    // NOTE: The partitions themselves stay in global memory for now
    const size_t n_labels = static_cast<size_t>(k);
    size_t s_mem_size = 0;
    s_mem_size += n_labels * n_labels * sizeof(int); // For contingency matrix
    s_mem_size += 2 * n_labels * sizeof(int);        // For the row/column sum arrays (k elements each)
    s_mem_size += 4 * sizeof(int);                   // For the 2 x 2 confusion matrix

    // Check if shared memory size exceeds device limits
    if (!check_shared_memory_size(s_mem_size)) {
        throw std::runtime_error("Required shared memory exceeds device limits");
    }

    /*
     * Launch the kernel
     */
    ari<<<grid_size, block_size, s_mem_size>>>(
        thrust::raw_pointer_cast(d_parts.data()),
        static_cast<int>(n_aris),
        static_cast<int>(n_features),
        static_cast<int>(n_parts),
        static_cast<int>(n_objs),
        static_cast<int>(n_parts * n_objs),
        static_cast<int>(n_parts * n_parts),
        k,
        thrust::raw_pointer_cast(d_out.data()));

    // Surface launch-configuration errors instead of silently returning garbage
    const cudaError_t launch_status = cudaGetLastError();
    if (launch_status != cudaSuccess) {
        throw std::runtime_error(cudaGetErrorString(launch_status));
    }

    // Copy the results straight into the returned vector
    std::vector<out_dtype> res(n_aris);
    thrust::copy(d_out.begin(), d_out.end(), res.begin());

    // Device memory is released by the thrust::device_vector destructors

    // Return the ARI values
    return res;
}

/**
 * @brief API exposed to Python for computing ARI using CUDA upon a 3D Numpy NDArray of partitions
 * @param parts 3D Numpy.NDArray of partitions with shape of (n_features, n_parts, n_objs)
 * @throws std::invalid_argument if "parts" is invalid
 * @return std::vector ARI values for each pair of partitions
 */
template
auto ari(const py::array_t& parts,
         const size_t n_features,
         const size_t n_parts,
         const size_t n_objs) -> std::vector {
    // Edge cases:
    // 1. GPU memory is not enough to store the partitions -> split the partitions into smaller chunks and do stream processing

    // Input processing
    // Request a buffer descriptor from Python
    py::buffer_info buffer = parts.request();

    // Some basic validation checks ...
+ if (buffer.format != py::format_descriptor::format()) + throw std::runtime_error("Incompatible format: expected an int array!"); + + if (buffer.ndim != 3) + throw std::runtime_error("Incompatible buffer dimension!"); + + // Apply resources + auto result = py::array_t(buffer.size); + + // Obtain numpy.ndarray data pointer + const auto parts_ptr = static_cast(buffer.ptr); + + return ari_core(parts_ptr, n_features, n_parts, n_objs); +} + + +// Below is the explicit instantiation of the ari template function. +// +// Generally people would write the implementation of template classes and functions in the header file. However, we +// separate the implementation into a .cpp file to make things clearer. In order to make the compiler know the +// implementation of the template functions, we need to explicitly instantiate them here, so that they can be picked up +// by the linker. + +template auto ari(const py::array_t& parts, const size_t n_features, const size_t n_parts, const size_t n_objs) -> std::vector; +template auto ari_core(const int* parts, const size_t n_features, const size_t n_parts, const size_t n_objs) -> std::vector; diff --git a/libs/ccc_cuda_ext/metrics.cuh b/libs/ccc_cuda_ext/metrics.cuh new file mode 100644 index 00000000..b03eab9f --- /dev/null +++ b/libs/ccc_cuda_ext/metrics.cuh @@ -0,0 +1,20 @@ +#pragma once + +#include +#include + + +namespace py = pybind11; + +template +auto ari(const py::array_t& parts, + const size_t n_features, + const size_t n_parts, + const size_t n_objs) -> std::vector; + +// Used for internal c++ testing +template +auto ari_core(const T* parts, + const size_t n_features, + const size_t n_parts, + const size_t n_objs) -> std::vector; diff --git a/libs/ccc_cuda_ext/tests/hello_test.cc b/libs/ccc_cuda_ext/tests/hello_test.cc new file mode 100644 index 00000000..5a57e138 --- /dev/null +++ b/libs/ccc_cuda_ext/tests/hello_test.cc @@ -0,0 +1,9 @@ +#include + +// Demonstrate some basic assertions. 
+TEST(HelloTest, BasicAssertions) { + // Expect two strings not to be equal. + EXPECT_STRNE("hello", "world"); + // Expect equality. + EXPECT_EQ(7 * 6, 42); +} diff --git a/libs/ccc_cuda_ext/tests/test_binder.py b/libs/ccc_cuda_ext/tests/test_binder.py new file mode 100644 index 00000000..b4a42687 --- /dev/null +++ b/libs/ccc_cuda_ext/tests/test_binder.py @@ -0,0 +1,12 @@ +import cuda_ccc +import inspect +import numpy as np + + +parts = np.array([[[0, 1, 2]], [[0, 1, 2]], [[0, 1, 2]]], dtype=np.int32, order="C") +print(parts.ndim) +n_features = 3 +n_parts = 1 +n_samples = 3 +r = cuda_ccc.ari(parts, n_samples, n_features, n_parts) +print(r) diff --git a/libs/ccc_cuda_ext/tests/test_kernel.cpp b/libs/ccc_cuda_ext/tests/test_kernel.cpp new file mode 100644 index 00000000..6e907ead --- /dev/null +++ b/libs/ccc_cuda_ext/tests/test_kernel.cpp @@ -0,0 +1,30 @@ +#include +#include +// #include "../metrics.cuh" + +namespace py = pybind11; + +TEST(AriTest, SimpleCase) { + // Create input data + std::vector data = { + 0, 0, 1, 2, // First partition + 0, 0, 1, 1 // Second partition + }; + + // Create shape and strides for 3D array (n_features=2, n_parts=1, n_objs=4) + std::vector shape = {2, 1, 4}; + // std::vector strides = {4 * sizeof(int), // stride for features + // 4 * sizeof(int), // stride for partitions + // sizeof(int)}; // stride for objects + + // // Create numpy array from data + // py::array_t parts(shape, strides, data.data()); + py::array_t arr({ 3, 5 }); + + // Call the ari function + // std::vector result = ari(parts, 2, 1, 4); + + // Check result + // ASSERT_EQ(result.size(), 1); // Should only have one ARI value + // EXPECT_NEAR(result[0], 0.57f, 1e-2); // Compare with expected value within tolerance +} \ No newline at end of file diff --git a/libs/ccc_cuda_ext/tests/test_partition_pairing.cpp b/libs/ccc_cuda_ext/tests/test_partition_pairing.cpp new file mode 100644 index 00000000..5e6b75b9 --- /dev/null +++ 
b/libs/ccc_cuda_ext/tests/test_partition_pairing.cpp @@ -0,0 +1,196 @@ +#include +#include +#include +#include +#include +#include "metrics.cuh" + +// Helper function to generate pairwise combinations (implement this according to your needs) +std::vector, std::vector>> generate_pairwise_combinations(const std::vector>> &arr) +{ + std::vector, std::vector>> pairs; + size_t num_slices = arr.size(); // Number of 2D arrays in the 3D vector + for (size_t i = 0; i < num_slices; ++i) + { + for (size_t j = i + 1; j < num_slices; ++j) + { // Only consider pairs in different slices + for (const auto &row_i : arr[i]) + { // Each row in slice i + for (const auto &row_j : arr[j]) + { // Pairs with each row in slice j + pairs.emplace_back(row_i, row_j); + } + } + } + } + return pairs; +} + +void test_ari_parts_selection() +{ + // Define test input + std::vector>> parts = { + {{0, 1, 2, 3}, + {0, 2, 3, 4}, + {0, 3, 4, 5}}, + {{1, 1, 2, 3}, + {1, 2, 3, 4}, + {1, 3, 4, 5}}, + {{2, 1, 2, 3}, + {2, 2, 3, 4}, + {2, 3, 4, 5}}}; + + const int k = 6; // specified by the call to ccc , part number from [0...9] + + // std::vector>> parts = { + // {{4, 1, 3, 5, 2, 0, 6, 3, 1, 4}, + // {0, 2, 6, 4, 5, 3, 1, 0, 6, 2}, + // {1, 5, 3, 2, 4, 0, 6, 1, 5, 3}}, + + // // {{3, 6, 0, 2, 1, 5, 4, 3, 6, 0}, + // // {5, 1, 4, 0, 3, 6, 2, 1, 5, 4}, + // // {2, 3, 6, 1, 0, 5, 4, 3, 6, 2}}, + + // {{1, 4, 5, 3, 6, 0, 2, 5, 4, 1}, + // {0, 6, 2, 5, 1, 3, 4, 6, 0, 2}, + // {4, 1, 3, 6, 5, 0, 2, 4, 1, 3}} + // }; + + // const int k = 7; // specified by the call to ccc , max(parts) + 1 + + // std::vector part_maxes = {3, 4, 5, 3, 4, 5, 3, 4, 5}; + // auto sz_part_maxes = sizeof(part_maxes) / sizeof(part_maxes[0]); + + // Get dimensions + int n_features = parts.size(); + int n_parts = parts[0].size(); + int n_objs = parts[0][0].size(); + int n_feature_comp = n_features * (n_features - 1) / 2; + int n_aris = n_feature_comp * n_parts * n_parts; + std::cout << "n_features: " << n_features << ", n_parts: " << 
n_parts << ", n_objs: " << n_objs << std::endl + << "n_feature_comps: " << n_feature_comp << ", n_aris: " << n_aris << std::endl; + + // Allocate host memory for C-style array + int *h_parts = new int[n_features * n_parts * n_objs]; + + // Copy data from vector to C-style array + for (int i = 0; i < n_features; ++i) + { + for (int j = 0; j < n_parts; ++j) + { + for (int k = 0; k < n_objs; ++k) + { + h_parts[i * (n_parts * n_objs) + j * n_objs + k] = parts[i][j][k]; + } + } + } + + // Set up CUDA kernel configuration + int block_size = 2; + // Each block is responsible for one ARI computation + int grid_size = n_aris; + // Compute shared memory size + size_t s_mem_size = n_objs * 2 * sizeof(int); // For the partition pair to be compared + s_mem_size += 2 * k * sizeof(int); // For the internal sum arrays + s_mem_size += 4 * sizeof(int); // For the 2 x 2 confusion matrix + + // Allocate device memory + int *d_parts, *d_parts_pairs; + float *d_out; + cudaMalloc(&d_parts, n_features * n_parts * n_objs * sizeof(int)); + cudaMalloc(&d_out, n_aris * sizeof(float)); + cudaMalloc(&d_parts_pairs, n_aris * 2 * n_objs * sizeof(int)); + + // Copy data to device + cudaMemcpy(d_parts, h_parts, n_features * n_parts * n_objs * sizeof(int), cudaMemcpyHostToDevice); + + // Launch kernel + ari<<>>( + d_parts, + n_aris, + n_features, + n_parts, + n_objs, + n_parts * n_objs, + n_parts * n_parts, + k, + d_out, + d_parts_pairs); + + // Synchronize device + cudaDeviceSynchronize(); + + // Copy results back to host + int *h_parts_pairs = new int[n_aris * 2 * n_objs]; + cudaMemcpy(h_parts_pairs, d_parts_pairs, n_aris * 2 * n_objs * sizeof(int), cudaMemcpyDeviceToHost); + + // Print results + std::cout << "Parts pairs: " << std::endl; + for (int i = 0; i < n_aris; ++i) + { + std::cout << "Pair:" << i << std::endl; + for (int j = 0; j < 2; ++j) + { + for (int k = 0; k < n_objs; ++k) + { + std::cout << *(h_parts_pairs + i * 2 * n_objs + j * n_objs + k) << ", "; + } + std::cout << std::endl; + } 
+ std::cout << std::endl; + } + std::cout << std::endl; + + // Assert equality on the parts pairs + bool all_equal = true; + auto pairs = generate_pairwise_combinations(parts); + int n_pairs = pairs.size(); + for (int i = 0; i < n_pairs; ++i) + { + for (int j = 0; j < 2; ++j) + { + const std::vector ¤t_vector = (j == 0) ? pairs[i].first : pairs[i].second; + for (int k = 0; k < n_objs; ++k) + { + int flattened_index = i * 2 * n_objs + j * n_objs + k; + if (h_parts_pairs[flattened_index] != current_vector[k]) + { + all_equal = false; + std::cout << "Mismatch at i=" << i << ", j=" << j << ", k=" << k << std::endl; + std::cout << "Expected: " << current_vector[k] << ", Got: " << h_parts_pairs[flattened_index] << std::endl; + } + } + } + } + + if (all_equal) + { + std::cout << "Test passed: All elements match." << std::endl; + } + else + { + std::cout << "Test failed: Mismatches found." << std::endl; + } + + // Print ARI results + float *h_out = new float[n_aris]; + cudaMemcpy(h_out, d_out, n_aris * sizeof(float), cudaMemcpyDeviceToHost); + std::cout << "ARI results: " << std::endl; + for (int i = 0; i < n_aris; ++i) + { + printf("%f, ", h_out[i]); + } + std::cout << std::endl; + + // Clean up + cudaFree(d_parts); + cudaFree(d_out); + cudaFree(d_parts_pairs); + delete[] h_parts_pairs; +} + +int main() +{ + test_ari_parts_selection(); + return 0; +} \ No newline at end of file diff --git a/nbs/others/10_gpu_ari_profiling/00_cpu_version_ref/00_cpu_1_thread.ipynb b/nbs/others/10_gpu_ari_profiling/00_cpu_version_ref/00_cpu_1_thread.ipynb new file mode 100644 index 00000000..d8d54875 --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/00_cpu_version_ref/00_cpu_1_thread.ipynb @@ -0,0 +1,1450 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "36c65ddb-8246-4b0c-a231-68a373acc2cf", + "metadata": { + "papermill": { + "duration": 0.095646, + "end_time": "2021-12-02T04:34:06.384775", + "exception": false, + "start_time": "2021-12-02T04:34:06.289129", + "status": 
"completed" + }, + "tags": [] + }, + "source": [ + "# Description" + ] + }, + { + "cell_type": "markdown", + "id": "337633a8-d03e-4509-b89d-f8daee598958", + "metadata": { + "papermill": { + "duration": 0.091407, + "end_time": "2021-12-02T04:34:06.566258", + "exception": false, + "start_time": "2021-12-02T04:34:06.474851", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Multi-threading version of code in `09`" + ] + }, + { + "cell_type": "markdown", + "id": "63c2b099-cc44-4fe2-93d1-40336e0a8466", + "metadata": { + "papermill": { + "duration": 0.09056, + "end_time": "2021-12-02T04:34:06.747753", + "exception": false, + "start_time": "2021-12-02T04:34:06.657193", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Remove pycache dir" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "960c4ff0-2a73-4eaa-97d6-3269102233eb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:06.942330Z", + "iopub.status.busy": "2021-12-02T04:34:06.941822Z", + "iopub.status.idle": "2021-12-02T04:34:07.534005Z", + "shell.execute_reply": "2021-12-02T04:34:07.531916Z" + }, + "papermill": { + "duration": 0.696141, + "end_time": "2021-12-02T04:34:07.534420", + "exception": false, + "start_time": "2021-12-02T04:34:06.838279", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "!echo ${CODE_DIR}" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "e18db58a-316a-445b-a376-8b2ec18e08d8", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:07.733067Z", + "iopub.status.busy": "2021-12-02T04:34:07.732593Z", + "iopub.status.idle": "2021-12-02T04:34:08.445221Z", + "shell.execute_reply": "2021-12-02T04:34:08.443302Z" + }, + "papermill": { + "duration": 0.817887, + "end_time": "2021-12-02T04:34:08.445598", + "exception": false, + "start_time": "2021-12-02T04:34:07.627711", + 
"status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "b54099b0-a990-4bbd-bcbd-e206eb0f0f0e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:08.668700Z", + "iopub.status.busy": "2021-12-02T04:34:08.668226Z", + "iopub.status.idle": "2021-12-02T04:34:09.290588Z", + "shell.execute_reply": "2021-12-02T04:34:09.288752Z" + }, + "papermill": { + "duration": 0.719545, + "end_time": "2021-12-02T04:34:09.290958", + "exception": false, + "start_time": "2021-12-02T04:34:08.571413", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -prune -exec rm -rf {} \\;" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "7a9a8098-8160-46bf-8d83-bc398cbe2382", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:09.513928Z", + "iopub.status.busy": "2021-12-02T04:34:09.513465Z", + "iopub.status.idle": "2021-12-02T04:34:10.120602Z", + "shell.execute_reply": "2021-12-02T04:34:10.118684Z" + }, + "papermill": { + "duration": 0.704384, + "end_time": "2021-12-02T04:34:10.120972", + "exception": false, + "start_time": "2021-12-02T04:34:09.416588", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "markdown", + "id": "c2251313-41ac-46fd-a845-0f209689ecf6", + "metadata": { + "papermill": { + "duration": 0.093051, + "end_time": "2021-12-02T04:34:10.338738", + "exception": false, + "start_time": "2021-12-02T04:34:10.245687", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Modules" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "987ef5f1-be49-4a6c-a4f4-b24a0a2094cb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:10.528319Z", 
+ "iopub.status.busy": "2021-12-02T04:34:10.527833Z", + "iopub.status.idle": "2021-12-02T04:34:10.845421Z", + "shell.execute_reply": "2021-12-02T04:34:10.845020Z" + }, + "papermill": { + "duration": 0.414249, + "end_time": "2021-12-02T04:34:10.845518", + "exception": false, + "start_time": "2021-12-02T04:34:10.431269", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "from ccc.coef import ccc" + ] + }, + { + "cell_type": "markdown", + "id": "24399ccb-d33d-4bad-9baf-638c9c56feb2", + "metadata": { + "papermill": { + "duration": 0.095359, + "end_time": "2021-12-02T04:34:11.037941", + "exception": false, + "start_time": "2021-12-02T04:34:10.942582", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Settings" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "c609cefa-f513-4cf8-9573-367744e31c5f", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.224902Z", + "iopub.status.busy": "2021-12-02T04:34:11.224429Z", + "iopub.status.idle": "2021-12-02T04:34:11.226352Z", + "shell.execute_reply": "2021-12-02T04:34:11.226694Z" + }, + "papermill": { + "duration": 0.097459, + "end_time": "2021-12-02T04:34:11.226809", + "exception": false, + "start_time": "2021-12-02T04:34:11.129350", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_REPS = 10" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "c0341a42-b8de-419f-ab37-1e4fee9dde75", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.416790Z", + "iopub.status.busy": "2021-12-02T04:34:11.416353Z", + "iopub.status.idle": "2021-12-02T04:34:11.418488Z", + "shell.execute_reply": "2021-12-02T04:34:11.418030Z" + }, + "papermill": { + "duration": 0.098372, + "end_time": "2021-12-02T04:34:11.418578", + "exception": false, + "start_time": "2021-12-02T04:34:11.320206", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + 
"source": [ + "np.random.seed(0)" + ] + }, + { + "cell_type": "markdown", + "id": "6fd3067b-a4f7-475e-9575-20246934537d", + "metadata": { + "papermill": { + "duration": 0.092004, + "end_time": "2021-12-02T04:34:11.602824", + "exception": false, + "start_time": "2021-12-02T04:34:11.510820", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "02e0507c-43ff-4693-8a3b-8ccd8f23168c", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.790398Z", + "iopub.status.busy": "2021-12-02T04:34:11.789933Z", + "iopub.status.idle": "2021-12-02T04:34:20.454299Z", + "shell.execute_reply": "2021-12-02T04:34:20.453925Z" + }, + "papermill": { + "duration": 8.760307, + "end_time": "2021-12-02T04:34:20.454396", + "exception": false, + "start_time": "2021-12-02T04:34:11.694089", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.15625" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let numba compile all the code before profiling\n", + "ccc(np.random.rand(10), np.random.rand(10))" + ] + }, + { + "cell_type": "markdown", + "id": "8549179d-1517-4a40-9a51-b95dc02d0fcc", + "metadata": { + "papermill": { + "duration": 0.092955, + "end_time": "2021-12-02T04:34:20.640996", + "exception": false, + "start_time": "2021-12-02T04:34:20.548041", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` small" + ] + }, + { + "cell_type": "markdown", + "id": "13ba811b", + "metadata": { + "papermill": { + "duration": 0.092926, + "end_time": "2021-12-02T04:34:20.826776", + "exception": false, + "start_time": "2021-12-02T04:34:20.733850", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50`" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "68064f0b", + "metadata": { + "execution": { + 
"iopub.execute_input": "2021-12-02T04:34:21.026855Z", + "iopub.status.busy": "2021-12-02T04:34:21.026243Z", + "iopub.status.idle": "2021-12-02T04:34:21.028562Z", + "shell.execute_reply": "2021-12-02T04:34:21.027971Z" + }, + "papermill": { + "duration": 0.107158, + "end_time": "2021-12-02T04:34:21.028689", + "exception": false, + "start_time": "2021-12-02T04:34:20.921531", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "6f2ff213-d6b7-458f-acbf-8c73dd497a2a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.225799Z", + "iopub.status.busy": "2021-12-02T04:34:21.225350Z", + "iopub.status.idle": "2021-12-02T04:34:21.226800Z", + "shell.execute_reply": "2021-12-02T04:34:21.227141Z" + }, + "papermill": { + "duration": 0.09836, + "end_time": "2021-12-02T04:34:21.227254", + "exception": false, + "start_time": "2021-12-02T04:34:21.128894", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "d63012be-4fc7-4fba-bccd-ad155905d1d6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.418916Z", + "iopub.status.busy": "2021-12-02T04:34:21.418477Z", + "iopub.status.idle": "2021-12-02T04:34:21.420269Z", + "shell.execute_reply": "2021-12-02T04:34:21.420631Z" + }, + "papermill": { + "duration": 0.098786, + "end_time": "2021-12-02T04:34:21.420745", + "exception": false, + "start_time": "2021-12-02T04:34:21.321959", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def func(n_reps=N_REPS):\n", + " for i in range(n_reps):\n", + " ccc(x, y, n_jobs=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "c0abc65a-2f3c-4476-9c2a-f8d7753b75e6", + "metadata": { + "execution": { + "iopub.execute_input": 
"2021-12-02T04:34:21.614745Z", + "iopub.status.busy": "2021-12-02T04:34:21.614223Z", + "iopub.status.idle": "2021-12-02T04:34:33.925655Z", + "shell.execute_reply": "2021-12-02T04:34:33.925209Z" + }, + "papermill": { + "duration": 12.410211, + "end_time": "2021-12-02T04:34:33.925764", + "exception": false, + "start_time": "2021-12-02T04:34:21.515553", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8.49 ms ± 94.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "e80a7b02-310f-4bd9-90d5-2a41186db39e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.125850Z", + "iopub.status.busy": "2021-12-02T04:34:34.125263Z", + "iopub.status.idle": "2021-12-02T04:34:34.148529Z", + "shell.execute_reply": "2021-12-02T04:34:34.148886Z" + }, + "papermill": { + "duration": 0.124845, + "end_time": "2021-12-02T04:34:34.149006", + "exception": false, + "start_time": "2021-12-02T04:34:34.024161", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_small_50.txt'. 
\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 6144 function calls in 0.012 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 114 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.012 0.012 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.012 0.012 :1()\n", + " 1 0.000 0.000 0.012 0.012 2661685993.py:1(func)\n", + " 10 0.000 0.000 0.012 0.001 impl.py:307(ccc)\n", + " 10 0.000 0.000 0.008 0.001 impl.py:492(compute_coef)\n", + " 140 0.000 0.000 0.008 0.000 threading.py:280(wait)\n", + " 550 0.008 0.000 0.008 0.000 {method 'acquire' of '_thread.lock' objects}\n", + " 10 0.000 0.000 0.008 0.001 impl.py:485(cdist_func)\n", + " 10 0.000 0.000 0.007 0.001 impl.py:192(cdist_parts_parallel)\n", + " 70 0.000 0.000 0.007 0.000 threading.py:563(wait)\n", + " 70 0.000 0.000 0.006 0.000 _base.py:201(as_completed)\n", + " 70 0.000 0.000 0.002 0.000 thread.py:161(submit)\n", + " 70 0.000 0.000 0.002 0.000 thread.py:180(_adjust_thread_count)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.001 0.000 threading.py:880(start)\n", + " 70 0.000 0.000 0.001 0.000 _base.py:418(result)\n", + " 20 0.000 0.000 0.001 0.000 _base.py:602(result_iterator)\n", + " 10 0.000 0.000 0.001 0.000 impl.py:210()\n", + " 10 0.000 0.000 0.000 0.000 _base.py:636(__exit__)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_small_50.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "2548440c", + "metadata": { + "papermill": { + "duration": 0.094866, + "end_time": "2021-12-02T04:34:34.338010", + "exception": false, + "start_time": "2021-12-02T04:34:34.243144", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100`" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": 
"ddb768d7-9b74-424c-bdb9-9e52b9e5b181", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.530550Z", + "iopub.status.busy": "2021-12-02T04:34:34.530085Z", + "iopub.status.idle": "2021-12-02T04:34:34.531559Z", + "shell.execute_reply": "2021-12-02T04:34:34.531910Z" + }, + "papermill": { + "duration": 0.099465, + "end_time": "2021-12-02T04:34:34.532025", + "exception": false, + "start_time": "2021-12-02T04:34:34.432560", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "161cf922-41f7-4ced-af1f-e3d25b36b200", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.723894Z", + "iopub.status.busy": "2021-12-02T04:34:34.723405Z", + "iopub.status.idle": "2021-12-02T04:34:34.725017Z", + "shell.execute_reply": "2021-12-02T04:34:34.725362Z" + }, + "papermill": { + "duration": 0.099541, + "end_time": "2021-12-02T04:34:34.725479", + "exception": false, + "start_time": "2021-12-02T04:34:34.625938", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "0a7f2b1f-4b87-4dde-a46b-2acec5ac93ba", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:35.120950Z", + "iopub.status.busy": "2021-12-02T04:34:35.120480Z", + "iopub.status.idle": "2021-12-02T04:34:37.984893Z", + "shell.execute_reply": "2021-12-02T04:34:37.985354Z" + }, + "papermill": { + "duration": 2.971389, + "end_time": "2021-12-02T04:34:37.985494", + "exception": false, + "start_time": "2021-12-02T04:34:35.014105", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "18.7 ms ± 133 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "74acbe27-9807-4f26-8b7e-b74d0f745b63", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.190471Z", + "iopub.status.busy": "2021-12-02T04:34:38.189361Z", + "iopub.status.idle": "2021-12-02T04:34:38.227298Z", + "shell.execute_reply": "2021-12-02T04:34:38.227692Z" + }, + "papermill": { + "duration": 0.13744, + "end_time": "2021-12-02T04:34:38.227812", + "exception": false, + "start_time": "2021-12-02T04:34:38.090372", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_small_100.txt'. \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 8334 function calls in 0.020 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 114 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.020 0.020 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.020 0.020 :1()\n", + " 1 0.000 0.000 0.020 0.020 2661685993.py:1(func)\n", + " 10 0.000 0.000 0.020 0.002 impl.py:307(ccc)\n", + " 200 0.000 0.000 0.016 0.000 threading.py:280(wait)\n", + " 10 0.000 0.000 0.016 0.002 impl.py:492(compute_coef)\n", + " 790 0.016 0.000 0.016 0.000 {method 'acquire' of '_thread.lock' objects}\n", + " 10 0.000 0.000 0.015 0.002 impl.py:485(cdist_func)\n", + " 10 0.000 0.000 0.015 0.002 impl.py:192(cdist_parts_parallel)\n", + " 100 0.000 0.000 0.014 0.000 _base.py:201(as_completed)\n", + " 100 0.000 0.000 0.014 0.000 threading.py:563(wait)\n", + " 100 0.000 0.000 0.002 0.000 _base.py:418(result)\n", + " 20 0.000 0.000 0.002 0.000 _base.py:602(result_iterator)\n", + " 100 0.000 0.000 0.002 0.000 thread.py:161(submit)\n", + " 100 0.000 0.000 0.002 0.000 
thread.py:180(_adjust_thread_count)\n", + " 10 0.000 0.000 0.001 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.001 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.001 0.000 impl.py:210()\n", + " 10 0.000 0.000 0.001 0.000 threading.py:880(start)\n", + " 100 0.000 0.000 0.000 0.000 threading.py:411(acquire)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_small_100.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "611ff8e1", + "metadata": { + "papermill": { + "duration": 0.09562, + "end_time": "2021-12-02T04:34:38.420643", + "exception": false, + "start_time": "2021-12-02T04:34:38.325023", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=500`" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "4bcf4b42", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.617318Z", + "iopub.status.busy": "2021-12-02T04:34:38.616836Z", + "iopub.status.idle": "2021-12-02T04:34:38.618469Z", + "shell.execute_reply": "2021-12-02T04:34:38.618817Z" + }, + "papermill": { + "duration": 0.101998, + "end_time": "2021-12-02T04:34:38.618933", + "exception": false, + "start_time": "2021-12-02T04:34:38.516935", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 500" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "0bf2f21e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.813589Z", + "iopub.status.busy": "2021-12-02T04:34:38.813117Z", + "iopub.status.idle": "2021-12-02T04:34:38.815386Z", + "shell.execute_reply": "2021-12-02T04:34:38.815020Z" + }, + "papermill": { + "duration": 0.100616, + "end_time": "2021-12-02T04:34:38.815484", + "exception": false, + "start_time": "2021-12-02T04:34:38.714868", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 86, + "id": "cbde4ce6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:39.216775Z", + "iopub.status.busy": "2021-12-02T04:34:39.216014Z", + "iopub.status.idle": "2021-12-02T04:34:43.223179Z", + "shell.execute_reply": "2021-12-02T04:34:43.223640Z" + }, + "papermill": { + "duration": 4.108094, + "end_time": "2021-12-02T04:34:43.223780", + "exception": false, + "start_time": "2021-12-02T04:34:39.115686", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "33 ms ± 219 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "1250547e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.427169Z", + "iopub.status.busy": "2021-12-02T04:34:43.426487Z", + "iopub.status.idle": "2021-12-02T04:34:43.474288Z", + "shell.execute_reply": "2021-12-02T04:34:43.473832Z" + }, + "papermill": { + "duration": 0.148908, + "end_time": "2021-12-02T04:34:43.474385", + "exception": false, + "start_time": "2021-12-02T04:34:43.325477", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_small_500.txt'. 
\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 8334 function calls in 0.037 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 114 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.037 0.037 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.037 0.037 :1()\n", + " 1 0.000 0.000 0.037 0.037 2661685993.py:1(func)\n", + " 10 0.000 0.000 0.037 0.004 impl.py:307(ccc)\n", + " 200 0.000 0.000 0.032 0.000 threading.py:280(wait)\n", + " 790 0.032 0.000 0.032 0.000 {method 'acquire' of '_thread.lock' objects}\n", + " 10 0.000 0.000 0.026 0.003 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 0.025 0.003 impl.py:485(cdist_func)\n", + " 10 0.000 0.000 0.025 0.003 impl.py:192(cdist_parts_parallel)\n", + " 100 0.000 0.000 0.024 0.000 threading.py:563(wait)\n", + " 100 0.000 0.000 0.024 0.000 _base.py:201(as_completed)\n", + " 100 0.000 0.000 0.008 0.000 _base.py:418(result)\n", + " 20 0.000 0.000 0.008 0.000 _base.py:602(result_iterator)\n", + " 100 0.000 0.000 0.002 0.000 thread.py:161(submit)\n", + " 100 0.000 0.000 0.002 0.000 thread.py:180(_adjust_thread_count)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.001 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.001 0.000 threading.py:880(start)\n", + " 10 0.000 0.000 0.001 0.000 impl.py:210()\n", + " 100 0.000 0.000 0.000 0.000 threading.py:411(acquire)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_small_500.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "6853c300", + "metadata": { + "papermill": { + "duration": 0.09707, + "end_time": "2021-12-02T04:34:43.669776", + "exception": false, + "start_time": "2021-12-02T04:34:43.572706", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=1000`" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "f77e8490", + 
"metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.871037Z", + "iopub.status.busy": "2021-12-02T04:34:43.870573Z", + "iopub.status.idle": "2021-12-02T04:34:43.872099Z", + "shell.execute_reply": "2021-12-02T04:34:43.872442Z" + }, + "papermill": { + "duration": 0.103066, + "end_time": "2021-12-02T04:34:43.872558", + "exception": false, + "start_time": "2021-12-02T04:34:43.769492", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 1000" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "c99f544a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.102128Z", + "iopub.status.busy": "2021-12-02T04:34:44.101540Z", + "iopub.status.idle": "2021-12-02T04:34:44.103376Z", + "shell.execute_reply": "2021-12-02T04:34:44.103817Z" + }, + "papermill": { + "duration": 0.13265, + "end_time": "2021-12-02T04:34:44.103967", + "exception": false, + "start_time": "2021-12-02T04:34:43.971317", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "9721b048", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.515965Z", + "iopub.status.busy": "2021-12-02T04:34:44.515462Z", + "iopub.status.idle": "2021-12-02T04:34:50.907897Z", + "shell.execute_reply": "2021-12-02T04:34:50.908362Z" + }, + "papermill": { + "duration": 6.496377, + "end_time": "2021-12-02T04:34:50.908503", + "exception": false, + "start_time": "2021-12-02T04:34:44.412126", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "53.9 ms ± 347 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "fd0f4dd6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.122723Z", + "iopub.status.busy": "2021-12-02T04:34:51.122244Z", + "iopub.status.idle": "2021-12-02T04:34:51.194219Z", + "shell.execute_reply": "2021-12-02T04:34:51.193813Z" + }, + "papermill": { + "duration": 0.179898, + "end_time": "2021-12-02T04:34:51.194319", + "exception": false, + "start_time": "2021-12-02T04:34:51.014421", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_small_1000.txt'. \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 8334 function calls in 0.057 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 114 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.057 0.057 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.057 0.057 :1()\n", + " 1 0.000 0.000 0.057 0.057 2661685993.py:1(func)\n", + " 10 0.001 0.000 0.057 0.006 impl.py:307(ccc)\n", + " 200 0.000 0.000 0.052 0.000 threading.py:280(wait)\n", + " 790 0.052 0.000 0.052 0.000 {method 'acquire' of '_thread.lock' objects}\n", + " 10 0.000 0.000 0.042 0.004 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 0.042 0.004 impl.py:485(cdist_func)\n", + " 10 0.000 0.000 0.042 0.004 impl.py:192(cdist_parts_parallel)\n", + " 100 0.000 0.000 0.040 0.000 _base.py:201(as_completed)\n", + " 100 0.000 0.000 0.040 0.000 threading.py:563(wait)\n", + " 100 0.000 0.000 0.012 0.000 _base.py:418(result)\n", + " 20 0.000 0.000 0.012 0.001 _base.py:602(result_iterator)\n", + " 100 0.000 0.000 0.002 0.000 thread.py:161(submit)\n", + " 100 0.000 0.000 0.002 0.000 
thread.py:180(_adjust_thread_count)\n", + " 10 0.000 0.000 0.001 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.001 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.001 0.000 impl.py:210()\n", + " 10 0.000 0.000 0.001 0.000 threading.py:880(start)\n", + " 100 0.000 0.000 0.000 0.000 threading.py:411(acquire)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_small_1000.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "f6b9adcf-1da4-4496-b05f-47952fd80d7f", + "metadata": { + "papermill": { + "duration": 0.099861, + "end_time": "2021-12-02T04:34:51.394504", + "exception": false, + "start_time": "2021-12-02T04:34:51.294643", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` large" + ] + }, + { + "cell_type": "markdown", + "id": "8f2e407c", + "metadata": { + "papermill": { + "duration": 0.098767, + "end_time": "2021-12-02T04:34:51.593072", + "exception": false, + "start_time": "2021-12-02T04:34:51.494305", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50000`" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "c522396e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.793258Z", + "iopub.status.busy": "2021-12-02T04:34:51.792385Z", + "iopub.status.idle": "2021-12-02T04:34:51.794710Z", + "shell.execute_reply": "2021-12-02T04:34:51.795054Z" + }, + "papermill": { + "duration": 0.103749, + "end_time": "2021-12-02T04:34:51.795172", + "exception": false, + "start_time": "2021-12-02T04:34:51.691423", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50000" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "a5e536cc", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.996666Z", + "iopub.status.busy": "2021-12-02T04:34:51.996120Z", + "iopub.status.idle": "2021-12-02T04:34:51.999329Z", + "shell.execute_reply": 
"2021-12-02T04:34:51.998896Z" + }, + "papermill": { + "duration": 0.104554, + "end_time": "2021-12-02T04:34:51.999423", + "exception": false, + "start_time": "2021-12-02T04:34:51.894869", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "91470f64", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:52.408864Z", + "iopub.status.busy": "2021-12-02T04:34:52.408310Z", + "iopub.status.idle": "2021-12-02T04:35:28.383597Z", + "shell.execute_reply": "2021-12-02T04:35:28.384010Z" + }, + "papermill": { + "duration": 36.078113, + "end_time": "2021-12-02T04:35:28.384125", + "exception": false, + "start_time": "2021-12-02T04:34:52.306012", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.96 s ± 15.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "id": "4de4e0b0", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:28.594907Z", + "iopub.status.busy": "2021-12-02T04:35:28.594395Z", + "iopub.status.idle": "2021-12-02T04:35:30.850079Z", + "shell.execute_reply": "2021-12-02T04:35:30.850466Z" + }, + "papermill": { + "duration": 2.36055, + "end_time": "2021-12-02T04:35:30.850581", + "exception": false, + "start_time": "2021-12-02T04:35:28.490031", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_large_50000.txt'. 
\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 8334 function calls in 2.979 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 114 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 2.979 2.979 {built-in method builtins.exec}\n", + " 1 0.000 0.000 2.979 2.979 :1()\n", + " 1 0.000 0.000 2.979 2.979 2661685993.py:1(func)\n", + " 10 0.005 0.000 2.979 0.298 impl.py:307(ccc)\n", + " 200 0.001 0.000 2.954 0.015 threading.py:280(wait)\n", + " 790 2.953 0.004 2.953 0.004 {method 'acquire' of '_thread.lock' objects}\n", + " 10 0.001 0.000 2.109 0.211 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 2.107 0.211 impl.py:485(cdist_func)\n", + " 10 0.001 0.000 2.107 0.211 impl.py:192(cdist_parts_parallel)\n", + " 100 0.001 0.000 2.101 0.021 _base.py:201(as_completed)\n", + " 100 0.000 0.000 2.100 0.021 threading.py:563(wait)\n", + " 100 0.000 0.000 0.855 0.009 _base.py:418(result)\n", + " 20 0.000 0.000 0.855 0.043 _base.py:602(result_iterator)\n", + " 50 0.007 0.000 0.007 0.000 {built-in method numpy.zeros}\n", + " 10 0.003 0.000 0.005 0.000 impl.py:210()\n", + " 100 0.000 0.000 0.003 0.000 thread.py:161(submit)\n", + " 100 0.000 0.000 0.002 0.000 thread.py:180(_adjust_thread_count)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.002 0.000 threading.py:880(start)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_large_50000.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "b0c07894", + "metadata": { + "papermill": { + "duration": 0.100749, + "end_time": "2021-12-02T04:35:31.053465", + "exception": false, + "start_time": "2021-12-02T04:35:30.952716", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100000`" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": 
"7fb2ab2e-a6de-412b-9540-d00f8641290e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.262806Z", + "iopub.status.busy": "2021-12-02T04:35:31.262280Z", + "iopub.status.idle": "2021-12-02T04:35:31.264634Z", + "shell.execute_reply": "2021-12-02T04:35:31.264124Z" + }, + "papermill": { + "duration": 0.110468, + "end_time": "2021-12-02T04:35:31.264738", + "exception": false, + "start_time": "2021-12-02T04:35:31.154270", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100000" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "81765e91", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.472530Z", + "iopub.status.busy": "2021-12-02T04:35:31.472083Z", + "iopub.status.idle": "2021-12-02T04:35:31.475862Z", + "shell.execute_reply": "2021-12-02T04:35:31.475424Z" + }, + "papermill": { + "duration": 0.107444, + "end_time": "2021-12-02T04:35:31.476016", + "exception": false, + "start_time": "2021-12-02T04:35:31.368572", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "aca57100", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.892045Z", + "iopub.status.busy": "2021-12-02T04:35:31.891417Z", + "iopub.status.idle": "2021-12-02T04:36:46.681345Z", + "shell.execute_reply": "2021-12-02T04:36:46.681695Z" + }, + "papermill": { + "duration": 74.896315, + "end_time": "2021-12-02T04:36:46.681806", + "exception": false, + "start_time": "2021-12-02T04:35:31.785491", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "6.28 s ± 143 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "b9c25f30", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:36:46.889124Z", + "iopub.status.busy": "2021-12-02T04:36:46.888441Z", + "iopub.status.idle": "2021-12-02T04:36:51.544747Z", + "shell.execute_reply": "2021-12-02T04:36:51.544315Z" + }, + "papermill": { + "duration": 4.761869, + "end_time": "2021-12-02T04:36:51.544839", + "exception": false, + "start_time": "2021-12-02T04:36:46.782970", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_large_100000.txt'. \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 8334 function calls in 6.423 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 114 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 6.423 6.423 {built-in method builtins.exec}\n", + " 1 0.000 0.000 6.423 6.423 :1()\n", + " 1 0.011 0.011 6.423 6.423 2661685993.py:1(func)\n", + " 10 0.007 0.001 6.412 0.641 impl.py:307(ccc)\n", + " 200 0.001 0.000 6.385 0.032 threading.py:280(wait)\n", + " 790 6.384 0.008 6.384 0.008 {method 'acquire' of '_thread.lock' objects}\n", + " 10 0.001 0.000 4.487 0.449 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 4.486 0.449 impl.py:485(cdist_func)\n", + " 10 0.001 0.000 4.486 0.449 impl.py:192(cdist_parts_parallel)\n", + " 100 0.001 0.000 4.480 0.045 _base.py:201(as_completed)\n", + " 100 0.000 0.000 4.479 0.045 threading.py:563(wait)\n", + " 100 0.000 0.000 1.907 0.019 _base.py:418(result)\n", + " 20 0.000 0.000 1.907 0.095 _base.py:602(result_iterator)\n", + " 50 0.008 0.000 0.008 0.000 {built-in method numpy.zeros}\n", + " 10 0.004 0.000 0.005 0.000 impl.py:210()\n", + " 
100 0.000 0.000 0.003 0.000 thread.py:161(submit)\n", + " 100 0.000 0.000 0.002 0.000 thread.py:180(_adjust_thread_count)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.001 0.000 threading.py:880(start)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_large_100000.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "8044128e", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aa76596-f126-4ba8-8bba-4d31225e0e5d", + "metadata": { + "papermill": { + "duration": 0.102208, + "end_time": "2021-12-02T04:36:51.764154", + "exception": false, + "start_time": "2021-12-02T04:36:51.661946", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all,-execution,-papermill,-trusted" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "papermill": { + "default_parameters": {}, + "duration": 167.355469, + "end_time": "2021-12-02T04:36:52.275357", + "environment_variables": {}, + "exception": null, + "input_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.ipynb", + "output_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.run.ipynb", + "parameters": {}, + "start_time": "2021-12-02T04:34:04.919888", + "version": "2.3.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/others/10_gpu_ari_profiling/00_cpu_version_ref/00_cpu_1_thread_no_jit.ipynb b/nbs/others/10_gpu_ari_profiling/00_cpu_version_ref/00_cpu_1_thread_no_jit.ipynb new file mode 100644 index 00000000..81335340 --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/00_cpu_version_ref/00_cpu_1_thread_no_jit.ipynb @@ -0,0 +1,1483 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "36c65ddb-8246-4b0c-a231-68a373acc2cf", + 
"metadata": { + "papermill": { + "duration": 0.095646, + "end_time": "2021-12-02T04:34:06.384775", + "exception": false, + "start_time": "2021-12-02T04:34:06.289129", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Description" + ] + }, + { + "cell_type": "markdown", + "id": "337633a8-d03e-4509-b89d-f8daee598958", + "metadata": { + "papermill": { + "duration": 0.091407, + "end_time": "2021-12-02T04:34:06.566258", + "exception": false, + "start_time": "2021-12-02T04:34:06.474851", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Single-threaded (`n_jobs=1`) CPU reference version of the code in `09`, run with Numba JIT disabled" + ] + }, + { + "cell_type": "markdown", + "id": "84e7fec7", + "metadata": {}, + "source": [ + "## Disable Numba JIT" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "ff9b34c7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: NUMBA_DISABLE_JIT=1\n" + ] + } + ], + "source": [ + "%env NUMBA_DISABLE_JIT=1" + ] + }, + { + "cell_type": "markdown", + "id": "63c2b099-cc44-4fe2-93d1-40336e0a8466", + "metadata": { + "papermill": { + "duration": 0.09056, + "end_time": "2021-12-02T04:34:06.747753", + "exception": false, + "start_time": "2021-12-02T04:34:06.657193", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Remove pycache dir" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "960c4ff0-2a73-4eaa-97d6-3269102233eb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:06.942330Z", + "iopub.status.busy": "2021-12-02T04:34:06.941822Z", + "iopub.status.idle": "2021-12-02T04:34:07.534005Z", + "shell.execute_reply": "2021-12-02T04:34:07.531916Z" + }, + "papermill": { + "duration": 0.696141, + "end_time": "2021-12-02T04:34:07.534420", + "exception": false, + "start_time": "2021-12-02T04:34:06.838279", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + 
"source": [ + "!echo ${CODE_DIR}" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "e18db58a-316a-445b-a376-8b2ec18e08d8", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:07.733067Z", + "iopub.status.busy": "2021-12-02T04:34:07.732593Z", + "iopub.status.idle": "2021-12-02T04:34:08.445221Z", + "shell.execute_reply": "2021-12-02T04:34:08.443302Z" + }, + "papermill": { + "duration": 0.817887, + "end_time": "2021-12-02T04:34:08.445598", + "exception": false, + "start_time": "2021-12-02T04:34:07.627711", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "./libs/ccc/__pycache__\n", + "./libs/ccc/sklearn/__pycache__\n", + "./libs/ccc/scipy/__pycache__\n", + "./libs/ccc/coef/__pycache__\n", + "./libs/ccc/utils/__pycache__\n", + "./libs/ccc/pytorch/__pycache__\n" + ] + } + ], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "b54099b0-a990-4bbd-bcbd-e206eb0f0f0e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:08.668700Z", + "iopub.status.busy": "2021-12-02T04:34:08.668226Z", + "iopub.status.idle": "2021-12-02T04:34:09.290588Z", + "shell.execute_reply": "2021-12-02T04:34:09.288752Z" + }, + "papermill": { + "duration": 0.719545, + "end_time": "2021-12-02T04:34:09.290958", + "exception": false, + "start_time": "2021-12-02T04:34:08.571413", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -prune -exec rm -rf {} \\;" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "7a9a8098-8160-46bf-8d83-bc398cbe2382", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:09.513928Z", + "iopub.status.busy": "2021-12-02T04:34:09.513465Z", + "iopub.status.idle": "2021-12-02T04:34:10.120602Z", + "shell.execute_reply": 
"2021-12-02T04:34:10.118684Z" + }, + "papermill": { + "duration": 0.704384, + "end_time": "2021-12-02T04:34:10.120972", + "exception": false, + "start_time": "2021-12-02T04:34:09.416588", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "markdown", + "id": "c2251313-41ac-46fd-a845-0f209689ecf6", + "metadata": { + "papermill": { + "duration": 0.093051, + "end_time": "2021-12-02T04:34:10.338738", + "exception": false, + "start_time": "2021-12-02T04:34:10.245687", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Modules" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "987ef5f1-be49-4a6c-a4f4-b24a0a2094cb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:10.528319Z", + "iopub.status.busy": "2021-12-02T04:34:10.527833Z", + "iopub.status.idle": "2021-12-02T04:34:10.845421Z", + "shell.execute_reply": "2021-12-02T04:34:10.845020Z" + }, + "papermill": { + "duration": 0.414249, + "end_time": "2021-12-02T04:34:10.845518", + "exception": false, + "start_time": "2021-12-02T04:34:10.431269", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "from ccc.coef import ccc" + ] + }, + { + "cell_type": "markdown", + "id": "24399ccb-d33d-4bad-9baf-638c9c56feb2", + "metadata": { + "papermill": { + "duration": 0.095359, + "end_time": "2021-12-02T04:34:11.037941", + "exception": false, + "start_time": "2021-12-02T04:34:10.942582", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Settings" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "c609cefa-f513-4cf8-9573-367744e31c5f", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.224902Z", + "iopub.status.busy": "2021-12-02T04:34:11.224429Z", + "iopub.status.idle": "2021-12-02T04:34:11.226352Z", + "shell.execute_reply": 
"2021-12-02T04:34:11.226694Z" + }, + "papermill": { + "duration": 0.097459, + "end_time": "2021-12-02T04:34:11.226809", + "exception": false, + "start_time": "2021-12-02T04:34:11.129350", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_REPS = 10" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "c0341a42-b8de-419f-ab37-1e4fee9dde75", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.416790Z", + "iopub.status.busy": "2021-12-02T04:34:11.416353Z", + "iopub.status.idle": "2021-12-02T04:34:11.418488Z", + "shell.execute_reply": "2021-12-02T04:34:11.418030Z" + }, + "papermill": { + "duration": 0.098372, + "end_time": "2021-12-02T04:34:11.418578", + "exception": false, + "start_time": "2021-12-02T04:34:11.320206", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "np.random.seed(0)" + ] + }, + { + "cell_type": "markdown", + "id": "6fd3067b-a4f7-475e-9575-20246934537d", + "metadata": { + "papermill": { + "duration": 0.092004, + "end_time": "2021-12-02T04:34:11.602824", + "exception": false, + "start_time": "2021-12-02T04:34:11.510820", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "02e0507c-43ff-4693-8a3b-8ccd8f23168c", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.790398Z", + "iopub.status.busy": "2021-12-02T04:34:11.789933Z", + "iopub.status.idle": "2021-12-02T04:34:20.454299Z", + "shell.execute_reply": "2021-12-02T04:34:20.453925Z" + }, + "papermill": { + "duration": 8.760307, + "end_time": "2021-12-02T04:34:20.454396", + "exception": false, + "start_time": "2021-12-02T04:34:11.694089", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.15625" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let numba compile all the 
code before profiling\n", + "ccc(np.random.rand(10), np.random.rand(10))" + ] + }, + { + "cell_type": "markdown", + "id": "8549179d-1517-4a40-9a51-b95dc02d0fcc", + "metadata": { + "papermill": { + "duration": 0.092955, + "end_time": "2021-12-02T04:34:20.640996", + "exception": false, + "start_time": "2021-12-02T04:34:20.548041", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` small" + ] + }, + { + "cell_type": "markdown", + "id": "13ba811b", + "metadata": { + "papermill": { + "duration": 0.092926, + "end_time": "2021-12-02T04:34:20.826776", + "exception": false, + "start_time": "2021-12-02T04:34:20.733850", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50`" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "68064f0b", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.026855Z", + "iopub.status.busy": "2021-12-02T04:34:21.026243Z", + "iopub.status.idle": "2021-12-02T04:34:21.028562Z", + "shell.execute_reply": "2021-12-02T04:34:21.027971Z" + }, + "papermill": { + "duration": 0.107158, + "end_time": "2021-12-02T04:34:21.028689", + "exception": false, + "start_time": "2021-12-02T04:34:20.921531", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "6f2ff213-d6b7-458f-acbf-8c73dd497a2a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.225799Z", + "iopub.status.busy": "2021-12-02T04:34:21.225350Z", + "iopub.status.idle": "2021-12-02T04:34:21.226800Z", + "shell.execute_reply": "2021-12-02T04:34:21.227141Z" + }, + "papermill": { + "duration": 0.09836, + "end_time": "2021-12-02T04:34:21.227254", + "exception": false, + "start_time": "2021-12-02T04:34:21.128894", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + 
}, + { + "cell_type": "code", + "execution_count": 46, + "id": "d63012be-4fc7-4fba-bccd-ad155905d1d6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.418916Z", + "iopub.status.busy": "2021-12-02T04:34:21.418477Z", + "iopub.status.idle": "2021-12-02T04:34:21.420269Z", + "shell.execute_reply": "2021-12-02T04:34:21.420631Z" + }, + "papermill": { + "duration": 0.098786, + "end_time": "2021-12-02T04:34:21.420745", + "exception": false, + "start_time": "2021-12-02T04:34:21.321959", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def func(n_reps=N_REPS):\n", + " for i in range(n_reps):\n", + " ccc(x, y, n_jobs=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "c0abc65a-2f3c-4476-9c2a-f8d7753b75e6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.614745Z", + "iopub.status.busy": "2021-12-02T04:34:21.614223Z", + "iopub.status.idle": "2021-12-02T04:34:33.925655Z", + "shell.execute_reply": "2021-12-02T04:34:33.925209Z" + }, + "papermill": { + "duration": 12.410211, + "end_time": "2021-12-02T04:34:33.925764", + "exception": false, + "start_time": "2021-12-02T04:34:21.515553", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "40.1 ms ± 146 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "e80a7b02-310f-4bd9-90d5-2a41186db39e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.125850Z", + "iopub.status.busy": "2021-12-02T04:34:34.125263Z", + "iopub.status.idle": "2021-12-02T04:34:34.148529Z", + "shell.execute_reply": "2021-12-02T04:34:34.148886Z" + }, + "papermill": { + "duration": 0.124845, + "end_time": "2021-12-02T04:34:34.149006", + "exception": false, + "start_time": "2021-12-02T04:34:34.024161", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_small_50.txt'. \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 6344 function calls (6334 primitive calls) in 0.043 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 125 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.043 0.043 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.043 0.043 :1()\n", + " 1 0.000 0.000 0.043 0.043 2661685993.py:1(func)\n", + " 10 0.000 0.000 0.043 0.004 impl.py:307(ccc)\n", + " 140 0.000 0.000 0.039 0.000 threading.py:280(wait)\n", + " 550 0.039 0.000 0.039 0.000 {method 'acquire' of '_thread.lock' objects}\n", + " 10 0.000 0.000 0.034 0.003 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 0.034 0.003 impl.py:485(cdist_func)\n", + " 70 0.000 0.000 0.034 0.000 threading.py:563(wait)\n", + " 10 0.000 0.000 0.034 0.003 impl.py:192(cdist_parts_parallel)\n", + " 70 0.000 0.000 0.033 0.000 _base.py:201(as_completed)\n", + " 70 0.000 0.000 0.006 0.000 _base.py:418(result)\n", + " 20 0.000 0.000 0.006 0.000 _base.py:602(result_iterator)\n", + " 70 0.000 0.000 0.003 0.000 thread.py:161(submit)\n", + " 70 0.000 
0.000 0.002 0.000 thread.py:180(_adjust_thread_count)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.002 0.000 threading.py:880(start)\n", + " 10 0.000 0.000 0.001 0.000 impl.py:210()\n", + " 10 0.000 0.000 0.000 0.000 _base.py:636(__exit__)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_small_50.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "2548440c", + "metadata": { + "papermill": { + "duration": 0.094866, + "end_time": "2021-12-02T04:34:34.338010", + "exception": false, + "start_time": "2021-12-02T04:34:34.243144", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100`" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "ddb768d7-9b74-424c-bdb9-9e52b9e5b181", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.530550Z", + "iopub.status.busy": "2021-12-02T04:34:34.530085Z", + "iopub.status.idle": "2021-12-02T04:34:34.531559Z", + "shell.execute_reply": "2021-12-02T04:34:34.531910Z" + }, + "papermill": { + "duration": 0.099465, + "end_time": "2021-12-02T04:34:34.532025", + "exception": false, + "start_time": "2021-12-02T04:34:34.432560", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "161cf922-41f7-4ced-af1f-e3d25b36b200", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.723894Z", + "iopub.status.busy": "2021-12-02T04:34:34.723405Z", + "iopub.status.idle": "2021-12-02T04:34:34.725017Z", + "shell.execute_reply": "2021-12-02T04:34:34.725362Z" + }, + "papermill": { + "duration": 0.099541, + "end_time": "2021-12-02T04:34:34.725479", + "exception": false, + "start_time": "2021-12-02T04:34:34.625938", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = 
np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "0a7f2b1f-4b87-4dde-a46b-2acec5ac93ba", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:35.120950Z", + "iopub.status.busy": "2021-12-02T04:34:35.120480Z", + "iopub.status.idle": "2021-12-02T04:34:37.984893Z", + "shell.execute_reply": "2021-12-02T04:34:37.985354Z" + }, + "papermill": { + "duration": 2.971389, + "end_time": "2021-12-02T04:34:37.985494", + "exception": false, + "start_time": "2021-12-02T04:34:35.014105", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "121 ms ± 593 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "74acbe27-9807-4f26-8b7e-b74d0f745b63", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.190471Z", + "iopub.status.busy": "2021-12-02T04:34:38.189361Z", + "iopub.status.idle": "2021-12-02T04:34:38.227298Z", + "shell.execute_reply": "2021-12-02T04:34:38.227692Z" + }, + "papermill": { + "duration": 0.13744, + "end_time": "2021-12-02T04:34:38.227812", + "exception": false, + "start_time": "2021-12-02T04:34:38.090372", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_small_100.txt'. 
\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 8510 function calls (8500 primitive calls) in 0.126 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 125 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.126 0.126 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.126 0.126 :1()\n", + " 1 0.000 0.000 0.126 0.126 2661685993.py:1(func)\n", + " 10 0.000 0.000 0.126 0.013 impl.py:307(ccc)\n", + " 199 0.000 0.000 0.120 0.001 threading.py:280(wait)\n", + " 786 0.120 0.000 0.120 0.000 {method 'acquire' of '_thread.lock' objects}\n", + " 10 0.000 0.000 0.113 0.011 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 0.113 0.011 impl.py:485(cdist_func)\n", + " 10 0.000 0.000 0.112 0.011 impl.py:192(cdist_parts_parallel)\n", + " 99 0.000 0.000 0.111 0.001 threading.py:563(wait)\n", + " 100 0.000 0.000 0.110 0.001 _base.py:201(as_completed)\n", + " 100 0.000 0.000 0.010 0.000 _base.py:418(result)\n", + " 20 0.000 0.000 0.010 0.000 _base.py:602(result_iterator)\n", + " 100 0.000 0.000 0.003 0.000 thread.py:161(submit)\n", + " 100 0.000 0.000 0.002 0.000 thread.py:180(_adjust_thread_count)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.001 0.000 threading.py:880(start)\n", + " 10 0.001 0.000 0.001 0.000 impl.py:210()\n", + " 100 0.000 0.000 0.000 0.000 threading.py:411(acquire)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_small_100.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "611ff8e1", + "metadata": { + "papermill": { + "duration": 0.09562, + "end_time": "2021-12-02T04:34:38.420643", + "exception": false, + "start_time": "2021-12-02T04:34:38.325023", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=500`" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": 
"4bcf4b42", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.617318Z", + "iopub.status.busy": "2021-12-02T04:34:38.616836Z", + "iopub.status.idle": "2021-12-02T04:34:38.618469Z", + "shell.execute_reply": "2021-12-02T04:34:38.618817Z" + }, + "papermill": { + "duration": 0.101998, + "end_time": "2021-12-02T04:34:38.618933", + "exception": false, + "start_time": "2021-12-02T04:34:38.516935", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 500" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "0bf2f21e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.813589Z", + "iopub.status.busy": "2021-12-02T04:34:38.813117Z", + "iopub.status.idle": "2021-12-02T04:34:38.815386Z", + "shell.execute_reply": "2021-12-02T04:34:38.815020Z" + }, + "papermill": { + "duration": 0.100616, + "end_time": "2021-12-02T04:34:38.815484", + "exception": false, + "start_time": "2021-12-02T04:34:38.714868", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "cbde4ce6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:39.216775Z", + "iopub.status.busy": "2021-12-02T04:34:39.216014Z", + "iopub.status.idle": "2021-12-02T04:34:43.223179Z", + "shell.execute_reply": "2021-12-02T04:34:43.223640Z" + }, + "papermill": { + "duration": 4.108094, + "end_time": "2021-12-02T04:34:43.223780", + "exception": false, + "start_time": "2021-12-02T04:34:39.115686", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "135 ms ± 532 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "1250547e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.427169Z", + "iopub.status.busy": "2021-12-02T04:34:43.426487Z", + "iopub.status.idle": "2021-12-02T04:34:43.474288Z", + "shell.execute_reply": "2021-12-02T04:34:43.473832Z" + }, + "papermill": { + "duration": 0.148908, + "end_time": "2021-12-02T04:34:43.474385", + "exception": false, + "start_time": "2021-12-02T04:34:43.325477", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_small_500.txt'. \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 8534 function calls (8524 primitive calls) in 0.137 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 125 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.137 0.137 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.137 0.137 :1()\n", + " 1 0.000 0.000 0.137 0.137 2661685993.py:1(func)\n", + " 10 0.001 0.000 0.137 0.014 impl.py:307(ccc)\n", + " 200 0.000 0.000 0.131 0.001 threading.py:280(wait)\n", + " 790 0.131 0.000 0.131 0.000 {method 'acquire' of '_thread.lock' objects}\n", + " 10 0.000 0.000 0.122 0.012 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 0.122 0.012 impl.py:485(cdist_func)\n", + " 10 0.000 0.000 0.122 0.012 impl.py:192(cdist_parts_parallel)\n", + " 100 0.000 0.000 0.120 0.001 threading.py:563(wait)\n", + " 100 0.000 0.000 0.120 0.001 _base.py:201(as_completed)\n", + " 100 0.000 0.000 0.011 0.000 _base.py:418(result)\n", + " 20 0.000 0.000 0.011 0.001 _base.py:602(result_iterator)\n", + " 100 0.000 0.000 0.003 0.000 thread.py:161(submit)\n", + " 100 0.000 0.000 0.002 0.000 
thread.py:180(_adjust_thread_count)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.002 0.000 threading.py:880(start)\n", + " 10 0.000 0.000 0.001 0.000 impl.py:210()\n", + " 100 0.000 0.000 0.000 0.000 threading.py:411(acquire)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_small_500.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "6853c300", + "metadata": { + "papermill": { + "duration": 0.09707, + "end_time": "2021-12-02T04:34:43.669776", + "exception": false, + "start_time": "2021-12-02T04:34:43.572706", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=1000`" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "f77e8490", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.871037Z", + "iopub.status.busy": "2021-12-02T04:34:43.870573Z", + "iopub.status.idle": "2021-12-02T04:34:43.872099Z", + "shell.execute_reply": "2021-12-02T04:34:43.872442Z" + }, + "papermill": { + "duration": 0.103066, + "end_time": "2021-12-02T04:34:43.872558", + "exception": false, + "start_time": "2021-12-02T04:34:43.769492", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 1000" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "c99f544a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.102128Z", + "iopub.status.busy": "2021-12-02T04:34:44.101540Z", + "iopub.status.idle": "2021-12-02T04:34:44.103376Z", + "shell.execute_reply": "2021-12-02T04:34:44.103817Z" + }, + "papermill": { + "duration": 0.13265, + "end_time": "2021-12-02T04:34:44.103967", + "exception": false, + "start_time": "2021-12-02T04:34:43.971317", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 59, + "id": "9721b048", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.515965Z", + "iopub.status.busy": "2021-12-02T04:34:44.515462Z", + "iopub.status.idle": "2021-12-02T04:34:50.907897Z", + "shell.execute_reply": "2021-12-02T04:34:50.908362Z" + }, + "papermill": { + "duration": 6.496377, + "end_time": "2021-12-02T04:34:50.908503", + "exception": false, + "start_time": "2021-12-02T04:34:44.412126", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "154 ms ± 936 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "fd0f4dd6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.122723Z", + "iopub.status.busy": "2021-12-02T04:34:51.122244Z", + "iopub.status.idle": "2021-12-02T04:34:51.194219Z", + "shell.execute_reply": "2021-12-02T04:34:51.193813Z" + }, + "papermill": { + "duration": 0.179898, + "end_time": "2021-12-02T04:34:51.194319", + "exception": false, + "start_time": "2021-12-02T04:34:51.014421", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_small_1000.txt'. 
\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 8534 function calls (8524 primitive calls) in 0.155 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 125 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.155 0.155 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.155 0.155 :1()\n", + " 1 0.000 0.000 0.155 0.155 2661685993.py:1(func)\n", + " 10 0.001 0.000 0.154 0.015 impl.py:307(ccc)\n", + " 200 0.000 0.000 0.148 0.001 threading.py:280(wait)\n", + " 790 0.148 0.000 0.148 0.000 {method 'acquire' of '_thread.lock' objects}\n", + " 10 0.000 0.000 0.138 0.014 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 0.137 0.014 impl.py:485(cdist_func)\n", + " 10 0.000 0.000 0.137 0.014 impl.py:192(cdist_parts_parallel)\n", + " 100 0.000 0.000 0.135 0.001 threading.py:563(wait)\n", + " 100 0.000 0.000 0.135 0.001 _base.py:201(as_completed)\n", + " 100 0.000 0.000 0.013 0.000 _base.py:418(result)\n", + " 20 0.000 0.000 0.013 0.001 _base.py:602(result_iterator)\n", + " 100 0.000 0.000 0.003 0.000 thread.py:161(submit)\n", + " 100 0.000 0.000 0.002 0.000 thread.py:180(_adjust_thread_count)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.002 0.000 threading.py:880(start)\n", + " 10 0.000 0.000 0.001 0.000 impl.py:210()\n", + " 100 0.000 0.000 0.000 0.000 threading.py:411(acquire)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_small_1000.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "f6b9adcf-1da4-4496-b05f-47952fd80d7f", + "metadata": { + "papermill": { + "duration": 0.099861, + "end_time": "2021-12-02T04:34:51.394504", + "exception": false, + "start_time": "2021-12-02T04:34:51.294643", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` large" + ] + }, + { + "cell_type": 
"markdown", + "id": "8f2e407c", + "metadata": { + "papermill": { + "duration": 0.098767, + "end_time": "2021-12-02T04:34:51.593072", + "exception": false, + "start_time": "2021-12-02T04:34:51.494305", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50000`" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "c522396e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.793258Z", + "iopub.status.busy": "2021-12-02T04:34:51.792385Z", + "iopub.status.idle": "2021-12-02T04:34:51.794710Z", + "shell.execute_reply": "2021-12-02T04:34:51.795054Z" + }, + "papermill": { + "duration": 0.103749, + "end_time": "2021-12-02T04:34:51.795172", + "exception": false, + "start_time": "2021-12-02T04:34:51.691423", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50000" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "a5e536cc", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.996666Z", + "iopub.status.busy": "2021-12-02T04:34:51.996120Z", + "iopub.status.idle": "2021-12-02T04:34:51.999329Z", + "shell.execute_reply": "2021-12-02T04:34:51.998896Z" + }, + "papermill": { + "duration": 0.104554, + "end_time": "2021-12-02T04:34:51.999423", + "exception": false, + "start_time": "2021-12-02T04:34:51.894869", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "91470f64", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:52.408864Z", + "iopub.status.busy": "2021-12-02T04:34:52.408310Z", + "iopub.status.idle": "2021-12-02T04:35:28.383597Z", + "shell.execute_reply": "2021-12-02T04:35:28.384010Z" + }, + "papermill": { + "duration": 36.078113, + "end_time": "2021-12-02T04:35:28.384125", + "exception": false, + "start_time": 
"2021-12-02T04:34:52.306012", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.17 s ± 12.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "4de4e0b0", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:28.594907Z", + "iopub.status.busy": "2021-12-02T04:35:28.594395Z", + "iopub.status.idle": "2021-12-02T04:35:30.850079Z", + "shell.execute_reply": "2021-12-02T04:35:30.850466Z" + }, + "papermill": { + "duration": 2.36055, + "end_time": "2021-12-02T04:35:30.850581", + "exception": false, + "start_time": "2021-12-02T04:35:28.490031", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_large_50000.txt'. \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 8534 function calls (8524 primitive calls) in 2.164 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 125 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 2.164 2.164 {built-in method builtins.exec}\n", + " 1 0.000 0.000 2.164 2.164 :1()\n", + " 1 0.000 0.000 2.164 2.164 2661685993.py:1(func)\n", + " 10 0.003 0.000 2.163 0.216 impl.py:307(ccc)\n", + " 200 0.001 0.000 2.139 0.011 threading.py:280(wait)\n", + " 790 2.138 0.003 2.138 0.003 {method 'acquire' of '_thread.lock' objects}\n", + " 10 0.000 0.000 1.479 0.148 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 1.477 0.148 impl.py:485(cdist_func)\n", + " 10 0.001 0.000 1.477 0.148 impl.py:192(cdist_parts_parallel)\n", + " 100 0.001 0.000 1.470 0.015 _base.py:201(as_completed)\n", + " 100 0.000 0.000 1.468 0.015 threading.py:563(wait)\n", + " 100 0.000 
0.000 0.672 0.007 _base.py:418(result)\n", + " 20 0.000 0.000 0.671 0.034 _base.py:602(result_iterator)\n", + " 50 0.007 0.000 0.007 0.000 {built-in method numpy.zeros}\n", + " 10 0.004 0.000 0.006 0.001 impl.py:210()\n", + " 100 0.000 0.000 0.003 0.000 thread.py:161(submit)\n", + " 100 0.000 0.000 0.002 0.000 thread.py:180(_adjust_thread_count)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.001 0.000 threading.py:880(start)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_large_50000.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "b0c07894", + "metadata": { + "papermill": { + "duration": 0.100749, + "end_time": "2021-12-02T04:35:31.053465", + "exception": false, + "start_time": "2021-12-02T04:35:30.952716", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100000`" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "7fb2ab2e-a6de-412b-9540-d00f8641290e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.262806Z", + "iopub.status.busy": "2021-12-02T04:35:31.262280Z", + "iopub.status.idle": "2021-12-02T04:35:31.264634Z", + "shell.execute_reply": "2021-12-02T04:35:31.264124Z" + }, + "papermill": { + "duration": 0.110468, + "end_time": "2021-12-02T04:35:31.264738", + "exception": false, + "start_time": "2021-12-02T04:35:31.154270", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100000" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "81765e91", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.472530Z", + "iopub.status.busy": "2021-12-02T04:35:31.472083Z", + "iopub.status.idle": "2021-12-02T04:35:31.475862Z", + "shell.execute_reply": "2021-12-02T04:35:31.475424Z" + }, + "papermill": { + "duration": 0.107444, + "end_time": "2021-12-02T04:35:31.476016", + "exception": false, + 
"start_time": "2021-12-02T04:35:31.368572", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "aca57100", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.892045Z", + "iopub.status.busy": "2021-12-02T04:35:31.891417Z", + "iopub.status.idle": "2021-12-02T04:36:46.681345Z", + "shell.execute_reply": "2021-12-02T04:36:46.681695Z" + }, + "papermill": { + "duration": 74.896315, + "end_time": "2021-12-02T04:36:46.681806", + "exception": false, + "start_time": "2021-12-02T04:35:31.785491", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4.64 s ± 33 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "b9c25f30", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:36:46.889124Z", + "iopub.status.busy": "2021-12-02T04:36:46.888441Z", + "iopub.status.idle": "2021-12-02T04:36:51.544747Z", + "shell.execute_reply": "2021-12-02T04:36:51.544315Z" + }, + "papermill": { + "duration": 4.761869, + "end_time": "2021-12-02T04:36:51.544839", + "exception": false, + "start_time": "2021-12-02T04:36:46.782970", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_large_100000.txt'. 
\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 8534 function calls (8524 primitive calls) in 4.658 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 125 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 4.658 4.658 {built-in method builtins.exec}\n", + " 1 0.000 0.000 4.658 4.658 :1()\n", + " 1 0.006 0.006 4.658 4.658 2661685993.py:1(func)\n", + " 10 0.004 0.000 4.652 0.465 impl.py:307(ccc)\n", + " 200 0.001 0.000 4.621 0.023 threading.py:280(wait)\n", + " 790 4.620 0.006 4.620 0.006 {method 'acquire' of '_thread.lock' objects}\n", + " 10 0.000 0.000 2.880 0.288 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 2.879 0.288 impl.py:485(cdist_func)\n", + " 10 0.001 0.000 2.879 0.288 impl.py:192(cdist_parts_parallel)\n", + " 100 0.001 0.000 2.869 0.029 _base.py:201(as_completed)\n", + " 100 0.000 0.000 2.868 0.029 threading.py:563(wait)\n", + " 100 0.000 0.000 1.754 0.018 _base.py:418(result)\n", + " 20 0.000 0.000 1.753 0.088 _base.py:602(result_iterator)\n", + " 50 0.011 0.000 0.011 0.000 {built-in method numpy.zeros}\n", + " 10 0.006 0.001 0.007 0.001 impl.py:210()\n", + " 100 0.000 0.000 0.003 0.000 thread.py:161(submit)\n", + " 100 0.000 0.000 0.002 0.000 thread.py:180(_adjust_thread_count)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.001 0.000 threading.py:880(start)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_large_100000.txt\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aa76596-f126-4ba8-8bba-4d31225e0e5d", + "metadata": { + "papermill": { + "duration": 0.102208, + "end_time": "2021-12-02T04:36:51.764154", + "exception": false, + "start_time": "2021-12-02T04:36:51.661946", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + 
"metadata": { + "jupytext": { + "cell_metadata_filter": "all,-execution,-papermill,-trusted" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "papermill": { + "default_parameters": {}, + "duration": 167.355469, + "end_time": "2021-12-02T04:36:52.275357", + "environment_variables": {}, + "exception": null, + "input_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.ipynb", + "output_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.run.ipynb", + "parameters": {}, + "start_time": "2021-12-02T04:34:04.919888", + "version": "2.3.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/others/10_gpu_ari_profiling/00_cpu_version_ref/01_cpu_8_threads.ipynb b/nbs/others/10_gpu_ari_profiling/00_cpu_version_ref/01_cpu_8_threads.ipynb new file mode 100644 index 00000000..12650b86 --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/00_cpu_version_ref/01_cpu_8_threads.ipynb @@ -0,0 +1,1469 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "36c65ddb-8246-4b0c-a231-68a373acc2cf", + "metadata": { + "papermill": { + "duration": 0.095646, + "end_time": "2021-12-02T04:34:06.384775", + "exception": false, + "start_time": "2021-12-02T04:34:06.289129", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Description" + ] + }, + { + "cell_type": "markdown", + "id": "337633a8-d03e-4509-b89d-f8daee598958", + "metadata": { + "papermill": { + "duration": 0.091407, + "end_time": "2021-12-02T04:34:06.566258", + "exception": false, + "start_time": "2021-12-02T04:34:06.474851", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Multi-threading version of code in `09`" + ] + }, + { + "cell_type": "markdown", + "id": "63c2b099-cc44-4fe2-93d1-40336e0a8466", + "metadata": { + "papermill": { + "duration": 0.09056, + "end_time": "2021-12-02T04:34:06.747753", + "exception": false, + "start_time": "2021-12-02T04:34:06.657193", + 
"status": "completed" + }, + "tags": [] + }, + "source": [ + "# Remove pycache dir" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "960c4ff0-2a73-4eaa-97d6-3269102233eb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:06.942330Z", + "iopub.status.busy": "2021-12-02T04:34:06.941822Z", + "iopub.status.idle": "2021-12-02T04:34:07.534005Z", + "shell.execute_reply": "2021-12-02T04:34:07.531916Z" + }, + "papermill": { + "duration": 0.696141, + "end_time": "2021-12-02T04:34:07.534420", + "exception": false, + "start_time": "2021-12-02T04:34:06.838279", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "!echo ${CODE_DIR}" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e18db58a-316a-445b-a376-8b2ec18e08d8", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:07.733067Z", + "iopub.status.busy": "2021-12-02T04:34:07.732593Z", + "iopub.status.idle": "2021-12-02T04:34:08.445221Z", + "shell.execute_reply": "2021-12-02T04:34:08.443302Z" + }, + "papermill": { + "duration": 0.817887, + "end_time": "2021-12-02T04:34:08.445598", + "exception": false, + "start_time": "2021-12-02T04:34:07.627711", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "./libs/ccc/__pycache__\n", + "./libs/ccc/sklearn/__pycache__\n", + "./libs/ccc/scipy/__pycache__\n", + "./libs/ccc/coef/__pycache__\n", + "./libs/ccc/utils/__pycache__\n", + "./libs/ccc/pytorch/__pycache__\n" + ] + } + ], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b54099b0-a990-4bbd-bcbd-e206eb0f0f0e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:08.668700Z", + "iopub.status.busy": "2021-12-02T04:34:08.668226Z", + "iopub.status.idle": 
"2021-12-02T04:34:09.290588Z", + "shell.execute_reply": "2021-12-02T04:34:09.288752Z" + }, + "papermill": { + "duration": 0.719545, + "end_time": "2021-12-02T04:34:09.290958", + "exception": false, + "start_time": "2021-12-02T04:34:08.571413", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -prune -exec rm -rf {} \\;" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7a9a8098-8160-46bf-8d83-bc398cbe2382", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:09.513928Z", + "iopub.status.busy": "2021-12-02T04:34:09.513465Z", + "iopub.status.idle": "2021-12-02T04:34:10.120602Z", + "shell.execute_reply": "2021-12-02T04:34:10.118684Z" + }, + "papermill": { + "duration": 0.704384, + "end_time": "2021-12-02T04:34:10.120972", + "exception": false, + "start_time": "2021-12-02T04:34:09.416588", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "markdown", + "id": "c2251313-41ac-46fd-a845-0f209689ecf6", + "metadata": { + "papermill": { + "duration": 0.093051, + "end_time": "2021-12-02T04:34:10.338738", + "exception": false, + "start_time": "2021-12-02T04:34:10.245687", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Modules" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "987ef5f1-be49-4a6c-a4f4-b24a0a2094cb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:10.528319Z", + "iopub.status.busy": "2021-12-02T04:34:10.527833Z", + "iopub.status.idle": "2021-12-02T04:34:10.845421Z", + "shell.execute_reply": "2021-12-02T04:34:10.845020Z" + }, + "papermill": { + "duration": 0.414249, + "end_time": "2021-12-02T04:34:10.845518", + "exception": false, + "start_time": "2021-12-02T04:34:10.431269", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import 
numpy as np\n", + "\n", + "from ccc.coef import ccc" + ] + }, + { + "cell_type": "markdown", + "id": "24399ccb-d33d-4bad-9baf-638c9c56feb2", + "metadata": { + "papermill": { + "duration": 0.095359, + "end_time": "2021-12-02T04:34:11.037941", + "exception": false, + "start_time": "2021-12-02T04:34:10.942582", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Settings" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c609cefa-f513-4cf8-9573-367744e31c5f", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.224902Z", + "iopub.status.busy": "2021-12-02T04:34:11.224429Z", + "iopub.status.idle": "2021-12-02T04:34:11.226352Z", + "shell.execute_reply": "2021-12-02T04:34:11.226694Z" + }, + "papermill": { + "duration": 0.097459, + "end_time": "2021-12-02T04:34:11.226809", + "exception": false, + "start_time": "2021-12-02T04:34:11.129350", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_REPS = 10" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c0341a42-b8de-419f-ab37-1e4fee9dde75", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.416790Z", + "iopub.status.busy": "2021-12-02T04:34:11.416353Z", + "iopub.status.idle": "2021-12-02T04:34:11.418488Z", + "shell.execute_reply": "2021-12-02T04:34:11.418030Z" + }, + "papermill": { + "duration": 0.098372, + "end_time": "2021-12-02T04:34:11.418578", + "exception": false, + "start_time": "2021-12-02T04:34:11.320206", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "np.random.seed(0)" + ] + }, + { + "cell_type": "markdown", + "id": "6fd3067b-a4f7-475e-9575-20246934537d", + "metadata": { + "papermill": { + "duration": 0.092004, + "end_time": "2021-12-02T04:34:11.602824", + "exception": false, + "start_time": "2021-12-02T04:34:11.510820", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + 
"execution_count": 8, + "id": "02e0507c-43ff-4693-8a3b-8ccd8f23168c", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.790398Z", + "iopub.status.busy": "2021-12-02T04:34:11.789933Z", + "iopub.status.idle": "2021-12-02T04:34:20.454299Z", + "shell.execute_reply": "2021-12-02T04:34:20.453925Z" + }, + "papermill": { + "duration": 8.760307, + "end_time": "2021-12-02T04:34:20.454396", + "exception": false, + "start_time": "2021-12-02T04:34:11.694089", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.15625" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let numba compile all the code before profiling\n", + "ccc(np.random.rand(10), np.random.rand(10))" + ] + }, + { + "cell_type": "markdown", + "id": "8549179d-1517-4a40-9a51-b95dc02d0fcc", + "metadata": { + "papermill": { + "duration": 0.092955, + "end_time": "2021-12-02T04:34:20.640996", + "exception": false, + "start_time": "2021-12-02T04:34:20.548041", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` small" + ] + }, + { + "cell_type": "markdown", + "id": "13ba811b", + "metadata": { + "papermill": { + "duration": 0.092926, + "end_time": "2021-12-02T04:34:20.826776", + "exception": false, + "start_time": "2021-12-02T04:34:20.733850", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50`" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "68064f0b", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.026855Z", + "iopub.status.busy": "2021-12-02T04:34:21.026243Z", + "iopub.status.idle": "2021-12-02T04:34:21.028562Z", + "shell.execute_reply": "2021-12-02T04:34:21.027971Z" + }, + "papermill": { + "duration": 0.107158, + "end_time": "2021-12-02T04:34:21.028689", + "exception": false, + "start_time": "2021-12-02T04:34:20.921531", + "status": "completed" + }, + "tags": [] + 
}, + "outputs": [], + "source": [ + "N_SAMPLES = 50" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6f2ff213-d6b7-458f-acbf-8c73dd497a2a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.225799Z", + "iopub.status.busy": "2021-12-02T04:34:21.225350Z", + "iopub.status.idle": "2021-12-02T04:34:21.226800Z", + "shell.execute_reply": "2021-12-02T04:34:21.227141Z" + }, + "papermill": { + "duration": 0.09836, + "end_time": "2021-12-02T04:34:21.227254", + "exception": false, + "start_time": "2021-12-02T04:34:21.128894", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d63012be-4fc7-4fba-bccd-ad155905d1d6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.418916Z", + "iopub.status.busy": "2021-12-02T04:34:21.418477Z", + "iopub.status.idle": "2021-12-02T04:34:21.420269Z", + "shell.execute_reply": "2021-12-02T04:34:21.420631Z" + }, + "papermill": { + "duration": 0.098786, + "end_time": "2021-12-02T04:34:21.420745", + "exception": false, + "start_time": "2021-12-02T04:34:21.321959", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def func(n_reps=N_REPS):\n", + " for i in range(n_reps):\n", + " ccc(x, y, n_jobs=8)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c0abc65a-2f3c-4476-9c2a-f8d7753b75e6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.614745Z", + "iopub.status.busy": "2021-12-02T04:34:21.614223Z", + "iopub.status.idle": "2021-12-02T04:34:33.925655Z", + "shell.execute_reply": "2021-12-02T04:34:33.925209Z" + }, + "papermill": { + "duration": 12.410211, + "end_time": "2021-12-02T04:34:33.925764", + "exception": false, + "start_time": "2021-12-02T04:34:21.515553", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "14.6 ms ± 40 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e80a7b02-310f-4bd9-90d5-2a41186db39e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.125850Z", + "iopub.status.busy": "2021-12-02T04:34:34.125263Z", + "iopub.status.idle": "2021-12-02T04:34:34.148529Z", + "shell.execute_reply": "2021-12-02T04:34:34.148886Z" + }, + "papermill": { + "duration": 0.124845, + "end_time": "2021-12-02T04:34:34.149006", + "exception": false, + "start_time": "2021-12-02T04:34:34.024161", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_small_50.txt'. \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 8195 function calls in 0.018 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 114 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.018 0.018 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.018 0.018 :1()\n", + " 1 0.000 0.000 0.018 0.018 158102722.py:1(func)\n", + " 10 0.000 0.000 0.018 0.002 impl.py:307(ccc)\n", + " 10 0.000 0.000 0.013 0.001 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 0.012 0.001 impl.py:485(cdist_func)\n", + " 10 0.000 0.000 0.012 0.001 impl.py:192(cdist_parts_parallel)\n", + " 659 0.011 0.000 0.011 0.000 {method 'acquire' of '_thread.lock' objects}\n", + " 154 0.000 0.000 0.011 0.000 threading.py:280(wait)\n", + " 90 0.000 0.000 0.010 0.000 threading.py:563(wait)\n", + " 80 0.000 0.000 0.010 0.000 thread.py:161(submit)\n", + " 80 0.000 0.000 0.009 0.000 thread.py:180(_adjust_thread_count)\n", + " 57 0.000 0.000 0.008 0.000 
threading.py:880(start)\n", + " 10 0.000 0.000 0.008 0.001 impl.py:210()\n", + " 70 0.000 0.000 0.004 0.000 _base.py:201(as_completed)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.001 0.000 _base.py:636(__exit__)\n", + " 10 0.000 0.000 0.001 0.000 thread.py:216(shutdown)\n", + " 57 0.001 0.000 0.001 0.000 {built-in method _thread.start_new_thread}" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_small_50.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "2548440c", + "metadata": { + "papermill": { + "duration": 0.094866, + "end_time": "2021-12-02T04:34:34.338010", + "exception": false, + "start_time": "2021-12-02T04:34:34.243144", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100`" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "ddb768d7-9b74-424c-bdb9-9e52b9e5b181", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.530550Z", + "iopub.status.busy": "2021-12-02T04:34:34.530085Z", + "iopub.status.idle": "2021-12-02T04:34:34.531559Z", + "shell.execute_reply": "2021-12-02T04:34:34.531910Z" + }, + "papermill": { + "duration": 0.099465, + "end_time": "2021-12-02T04:34:34.532025", + "exception": false, + "start_time": "2021-12-02T04:34:34.432560", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "161cf922-41f7-4ced-af1f-e3d25b36b200", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.723894Z", + "iopub.status.busy": "2021-12-02T04:34:34.723405Z", + "iopub.status.idle": "2021-12-02T04:34:34.725017Z", + "shell.execute_reply": "2021-12-02T04:34:34.725362Z" + }, + "papermill": { + "duration": 0.099541, + "end_time": "2021-12-02T04:34:34.725479", + "exception": false, + "start_time": "2021-12-02T04:34:34.625938", + "status": 
"completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0a7f2b1f-4b87-4dde-a46b-2acec5ac93ba", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:35.120950Z", + "iopub.status.busy": "2021-12-02T04:34:35.120480Z", + "iopub.status.idle": "2021-12-02T04:34:37.984893Z", + "shell.execute_reply": "2021-12-02T04:34:37.985354Z" + }, + "papermill": { + "duration": 2.971389, + "end_time": "2021-12-02T04:34:37.985494", + "exception": false, + "start_time": "2021-12-02T04:34:35.014105", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24.9 ms ± 190 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "74acbe27-9807-4f26-8b7e-b74d0f745b63", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.190471Z", + "iopub.status.busy": "2021-12-02T04:34:38.189361Z", + "iopub.status.idle": "2021-12-02T04:34:38.227298Z", + "shell.execute_reply": "2021-12-02T04:34:38.227692Z" + }, + "papermill": { + "duration": 0.13744, + "end_time": "2021-12-02T04:34:38.227812", + "exception": false, + "start_time": "2021-12-02T04:34:38.090372", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_small_100.txt'. 
\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 10901 function calls in 0.029 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 120 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.029 0.029 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.029 0.029 :1()\n", + " 1 0.000 0.000 0.029 0.029 158102722.py:1(func)\n", + " 10 0.000 0.000 0.028 0.003 impl.py:307(ccc)\n", + " 10 0.000 0.000 0.023 0.002 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 0.023 0.002 impl.py:485(cdist_func)\n", + " 10 0.001 0.000 0.022 0.002 impl.py:192(cdist_parts_parallel)\n", + " 887 0.019 0.000 0.019 0.000 {method 'acquire' of '_thread.lock' objects}\n", + " 208 0.000 0.000 0.019 0.000 threading.py:280(wait)\n", + " 124 0.000 0.000 0.019 0.000 threading.py:563(wait)\n", + " 110 0.000 0.000 0.015 0.000 thread.py:161(submit)\n", + " 110 0.000 0.000 0.015 0.000 thread.py:180(_adjust_thread_count)\n", + " 10 0.000 0.000 0.013 0.001 impl.py:210()\n", + " 75 0.000 0.000 0.013 0.000 threading.py:880(start)\n", + " 100 0.000 0.000 0.008 0.000 _base.py:201(as_completed)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.002 0.000 _base.py:636(__exit__)\n", + " 10 0.000 0.000 0.002 0.000 thread.py:216(shutdown)\n", + " 75 0.002 0.000 0.002 0.000 {built-in method _thread.start_new_thread}" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_small_100.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "611ff8e1", + "metadata": { + "papermill": { + "duration": 0.09562, + "end_time": "2021-12-02T04:34:38.420643", + "exception": false, + "start_time": "2021-12-02T04:34:38.325023", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=500`" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "4bcf4b42", + 
"metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.617318Z", + "iopub.status.busy": "2021-12-02T04:34:38.616836Z", + "iopub.status.idle": "2021-12-02T04:34:38.618469Z", + "shell.execute_reply": "2021-12-02T04:34:38.618817Z" + }, + "papermill": { + "duration": 0.101998, + "end_time": "2021-12-02T04:34:38.618933", + "exception": false, + "start_time": "2021-12-02T04:34:38.516935", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 500" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "0bf2f21e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.813589Z", + "iopub.status.busy": "2021-12-02T04:34:38.813117Z", + "iopub.status.idle": "2021-12-02T04:34:38.815386Z", + "shell.execute_reply": "2021-12-02T04:34:38.815020Z" + }, + "papermill": { + "duration": 0.100616, + "end_time": "2021-12-02T04:34:38.815484", + "exception": false, + "start_time": "2021-12-02T04:34:38.714868", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "cbde4ce6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:39.216775Z", + "iopub.status.busy": "2021-12-02T04:34:39.216014Z", + "iopub.status.idle": "2021-12-02T04:34:43.223179Z", + "shell.execute_reply": "2021-12-02T04:34:43.223640Z" + }, + "papermill": { + "duration": 4.108094, + "end_time": "2021-12-02T04:34:43.223780", + "exception": false, + "start_time": "2021-12-02T04:34:39.115686", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "29.3 ms ± 233 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "1250547e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.427169Z", + "iopub.status.busy": "2021-12-02T04:34:43.426487Z", + "iopub.status.idle": "2021-12-02T04:34:43.474288Z", + "shell.execute_reply": "2021-12-02T04:34:43.473832Z" + }, + "papermill": { + "duration": 0.148908, + "end_time": "2021-12-02T04:34:43.474385", + "exception": false, + "start_time": "2021-12-02T04:34:43.325477", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_small_500.txt'. \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 11112 function calls in 0.032 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 114 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.032 0.032 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.032 0.032 :1()\n", + " 1 0.000 0.000 0.032 0.032 158102722.py:1(func)\n", + " 10 0.000 0.000 0.031 0.003 impl.py:307(ccc)\n", + " 10 0.000 0.000 0.025 0.002 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 0.024 0.002 impl.py:485(cdist_func)\n", + " 10 0.001 0.000 0.024 0.002 impl.py:192(cdist_parts_parallel)\n", + " 935 0.022 0.000 0.022 0.000 {method 'acquire' of '_thread.lock' objects}\n", + " 224 0.000 0.000 0.022 0.000 threading.py:280(wait)\n", + " 132 0.000 0.000 0.020 0.000 threading.py:563(wait)\n", + " 110 0.000 0.000 0.015 0.000 thread.py:161(submit)\n", + " 110 0.000 0.000 0.014 0.000 thread.py:180(_adjust_thread_count)\n", + " 10 0.001 0.000 0.013 0.001 impl.py:210()\n", + " 75 0.000 0.000 0.012 0.000 threading.py:880(start)\n", + " 100 0.000 0.000 0.010 0.000 _base.py:201(as_completed)\n", + " 
110 0.000 0.000 0.002 0.000 _base.py:418(result)\n", + " 30 0.000 0.000 0.002 0.000 _base.py:602(result_iterator)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:598()\n", + " 75 0.002 0.000 0.002 0.000 {built-in method _thread.start_new_thread}" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_small_500.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "6853c300", + "metadata": { + "papermill": { + "duration": 0.09707, + "end_time": "2021-12-02T04:34:43.669776", + "exception": false, + "start_time": "2021-12-02T04:34:43.572706", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=1000`" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "f77e8490", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.871037Z", + "iopub.status.busy": "2021-12-02T04:34:43.870573Z", + "iopub.status.idle": "2021-12-02T04:34:43.872099Z", + "shell.execute_reply": "2021-12-02T04:34:43.872442Z" + }, + "papermill": { + "duration": 0.103066, + "end_time": "2021-12-02T04:34:43.872558", + "exception": false, + "start_time": "2021-12-02T04:34:43.769492", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 1000" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "c99f544a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.102128Z", + "iopub.status.busy": "2021-12-02T04:34:44.101540Z", + "iopub.status.idle": "2021-12-02T04:34:44.103376Z", + "shell.execute_reply": "2021-12-02T04:34:44.103817Z" + }, + "papermill": { + "duration": 0.13265, + "end_time": "2021-12-02T04:34:44.103967", + "exception": false, + "start_time": "2021-12-02T04:34:43.971317", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 
24, + "id": "9721b048", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.515965Z", + "iopub.status.busy": "2021-12-02T04:34:44.515462Z", + "iopub.status.idle": "2021-12-02T04:34:50.907897Z", + "shell.execute_reply": "2021-12-02T04:34:50.908362Z" + }, + "papermill": { + "duration": 6.496377, + "end_time": "2021-12-02T04:34:50.908503", + "exception": false, + "start_time": "2021-12-02T04:34:44.412126", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "34.7 ms ± 164 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "fd0f4dd6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.122723Z", + "iopub.status.busy": "2021-12-02T04:34:51.122244Z", + "iopub.status.idle": "2021-12-02T04:34:51.194219Z", + "shell.execute_reply": "2021-12-02T04:34:51.193813Z" + }, + "papermill": { + "duration": 0.179898, + "end_time": "2021-12-02T04:34:51.194319", + "exception": false, + "start_time": "2021-12-02T04:34:51.014421", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_small_1000.txt'. 
\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 11853 function calls in 0.038 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 114 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.038 0.038 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.038 0.038 :1()\n", + " 1 0.000 0.000 0.038 0.038 158102722.py:1(func)\n", + " 10 0.001 0.000 0.038 0.004 impl.py:307(ccc)\n", + " 10 0.000 0.000 0.028 0.003 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 0.028 0.003 impl.py:485(cdist_func)\n", + " 10 0.001 0.000 0.028 0.003 impl.py:192(cdist_parts_parallel)\n", + " 1051 0.028 0.000 0.028 0.000 {method 'acquire' of '_thread.lock' objects}\n", + " 253 0.000 0.000 0.027 0.000 threading.py:280(wait)\n", + " 150 0.000 0.000 0.023 0.000 threading.py:563(wait)\n", + " 110 0.000 0.000 0.016 0.000 thread.py:161(submit)\n", + " 110 0.000 0.000 0.015 0.000 thread.py:180(_adjust_thread_count)\n", + " 10 0.001 0.000 0.014 0.001 impl.py:210()\n", + " 79 0.000 0.000 0.013 0.000 threading.py:880(start)\n", + " 100 0.000 0.000 0.013 0.000 _base.py:201(as_completed)\n", + " 110 0.000 0.000 0.005 0.000 _base.py:418(result)\n", + " 30 0.000 0.000 0.005 0.000 _base.py:602(result_iterator)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:598()\n", + " 79 0.002 0.000 0.002 0.000 {built-in method _thread.start_new_thread}" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_small_1000.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "f6b9adcf-1da4-4496-b05f-47952fd80d7f", + "metadata": { + "papermill": { + "duration": 0.099861, + "end_time": "2021-12-02T04:34:51.394504", + "exception": false, + "start_time": "2021-12-02T04:34:51.294643", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` large" + ] + }, + { + "cell_type": 
"markdown", + "id": "8f2e407c", + "metadata": { + "papermill": { + "duration": 0.098767, + "end_time": "2021-12-02T04:34:51.593072", + "exception": false, + "start_time": "2021-12-02T04:34:51.494305", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50000`" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "c522396e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.793258Z", + "iopub.status.busy": "2021-12-02T04:34:51.792385Z", + "iopub.status.idle": "2021-12-02T04:34:51.794710Z", + "shell.execute_reply": "2021-12-02T04:34:51.795054Z" + }, + "papermill": { + "duration": 0.103749, + "end_time": "2021-12-02T04:34:51.795172", + "exception": false, + "start_time": "2021-12-02T04:34:51.691423", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50000" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "a5e536cc", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.996666Z", + "iopub.status.busy": "2021-12-02T04:34:51.996120Z", + "iopub.status.idle": "2021-12-02T04:34:51.999329Z", + "shell.execute_reply": "2021-12-02T04:34:51.998896Z" + }, + "papermill": { + "duration": 0.104554, + "end_time": "2021-12-02T04:34:51.999423", + "exception": false, + "start_time": "2021-12-02T04:34:51.894869", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "91470f64", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:52.408864Z", + "iopub.status.busy": "2021-12-02T04:34:52.408310Z", + "iopub.status.idle": "2021-12-02T04:35:28.383597Z", + "shell.execute_reply": "2021-12-02T04:35:28.384010Z" + }, + "papermill": { + "duration": 36.078113, + "end_time": "2021-12-02T04:35:28.384125", + "exception": false, + "start_time": 
"2021-12-02T04:34:52.306012", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "967 ms ± 5.67 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "4de4e0b0", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:28.594907Z", + "iopub.status.busy": "2021-12-02T04:35:28.594395Z", + "iopub.status.idle": "2021-12-02T04:35:30.850079Z", + "shell.execute_reply": "2021-12-02T04:35:30.850466Z" + }, + "papermill": { + "duration": 2.36055, + "end_time": "2021-12-02T04:35:30.850581", + "exception": false, + "start_time": "2021-12-02T04:35:28.490031", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_large_50000.txt'. \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 12363 function calls in 0.957 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 114 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.957 0.957 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.957 0.957 :1()\n", + " 1 0.009 0.009 0.957 0.957 158102722.py:1(func)\n", + " 10 0.013 0.001 0.949 0.095 impl.py:307(ccc)\n", + " 1148 0.917 0.001 0.917 0.001 {method 'acquire' of '_thread.lock' objects}\n", + " 274 0.001 0.000 0.914 0.003 threading.py:280(wait)\n", + " 10 0.000 0.000 0.473 0.047 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 0.472 0.047 impl.py:485(cdist_func)\n", + " 10 0.001 0.000 0.472 0.047 impl.py:192(cdist_parts_parallel)\n", + " 170 0.000 0.000 0.464 0.003 threading.py:563(wait)\n", + " 100 0.001 0.000 0.455 0.005 _base.py:201(as_completed)\n", + " 110 0.000 0.000 0.450 0.004 
_base.py:418(result)\n", + " 30 0.000 0.000 0.450 0.015 _base.py:602(result_iterator)\n", + " 110 0.000 0.000 0.016 0.000 thread.py:161(submit)\n", + " 10 0.002 0.000 0.015 0.002 impl.py:210()\n", + " 110 0.000 0.000 0.015 0.000 thread.py:180(_adjust_thread_count)\n", + " 80 0.000 0.000 0.013 0.000 threading.py:880(start)\n", + " 10 0.000 0.000 0.006 0.001 _base.py:636(__exit__)\n", + " 10 0.000 0.000 0.006 0.001 thread.py:216(shutdown)\n", + " 80 0.000 0.000 0.005 0.000 threading.py:1028(join)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_large_50000.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "b0c07894", + "metadata": { + "papermill": { + "duration": 0.100749, + "end_time": "2021-12-02T04:35:31.053465", + "exception": false, + "start_time": "2021-12-02T04:35:30.952716", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100000`" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "7fb2ab2e-a6de-412b-9540-d00f8641290e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.262806Z", + "iopub.status.busy": "2021-12-02T04:35:31.262280Z", + "iopub.status.idle": "2021-12-02T04:35:31.264634Z", + "shell.execute_reply": "2021-12-02T04:35:31.264124Z" + }, + "papermill": { + "duration": 0.110468, + "end_time": "2021-12-02T04:35:31.264738", + "exception": false, + "start_time": "2021-12-02T04:35:31.154270", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100000" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "81765e91", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.472530Z", + "iopub.status.busy": "2021-12-02T04:35:31.472083Z", + "iopub.status.idle": "2021-12-02T04:35:31.475862Z", + "shell.execute_reply": "2021-12-02T04:35:31.475424Z" + }, + "papermill": { + "duration": 0.107444, + "end_time": "2021-12-02T04:35:31.476016", + "exception": false, + 
"start_time": "2021-12-02T04:35:31.368572", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "aca57100", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.892045Z", + "iopub.status.busy": "2021-12-02T04:35:31.891417Z", + "iopub.status.idle": "2021-12-02T04:36:46.681345Z", + "shell.execute_reply": "2021-12-02T04:36:46.681695Z" + }, + "papermill": { + "duration": 74.896315, + "end_time": "2021-12-02T04:36:46.681806", + "exception": false, + "start_time": "2021-12-02T04:35:31.785491", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.96 s ± 3.37 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "b9c25f30", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:36:46.889124Z", + "iopub.status.busy": "2021-12-02T04:36:46.888441Z", + "iopub.status.idle": "2021-12-02T04:36:51.544747Z", + "shell.execute_reply": "2021-12-02T04:36:51.544315Z" + }, + "papermill": { + "duration": 4.761869, + "end_time": "2021-12-02T04:36:51.544839", + "exception": false, + "start_time": "2021-12-02T04:36:46.782970", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_large_100000.txt'. 
\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 12320 function calls in 1.962 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 114 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 1.962 1.962 {built-in method builtins.exec}\n", + " 1 0.000 0.000 1.962 1.962 :1()\n", + " 1 0.014 0.014 1.962 1.962 158102722.py:1(func)\n", + " 10 0.021 0.002 1.948 0.195 impl.py:307(ccc)\n", + " 1142 1.898 0.002 1.898 0.002 {method 'acquire' of '_thread.lock' objects}\n", + " 271 0.001 0.000 1.896 0.007 threading.py:280(wait)\n", + " 10 0.000 0.000 0.962 0.096 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 0.962 0.096 impl.py:485(cdist_func)\n", + " 10 0.001 0.000 0.961 0.096 impl.py:192(cdist_parts_parallel)\n", + " 170 0.000 0.000 0.952 0.006 threading.py:563(wait)\n", + " 110 0.000 0.000 0.945 0.009 _base.py:418(result)\n", + " 30 0.000 0.000 0.945 0.031 _base.py:602(result_iterator)\n", + " 100 0.001 0.000 0.941 0.009 _base.py:201(as_completed)\n", + " 10 0.004 0.000 0.018 0.002 impl.py:210()\n", + " 110 0.000 0.000 0.018 0.000 thread.py:161(submit)\n", + " 110 0.000 0.000 0.017 0.000 thread.py:180(_adjust_thread_count)\n", + " 80 0.000 0.000 0.015 0.000 threading.py:880(start)\n", + " 50 0.012 0.000 0.012 0.000 {built-in method numpy.zeros}\n", + " 10 0.000 0.000 0.004 0.000 _base.py:636(__exit__)\n", + " 10 0.000 0.000 0.004 0.000 thread.py:216(shutdown)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_large_100000.txt\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aa76596-f126-4ba8-8bba-4d31225e0e5d", + "metadata": { + "papermill": { + "duration": 0.102208, + "end_time": "2021-12-02T04:36:51.764154", + "exception": false, + "start_time": "2021-12-02T04:36:51.661946", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + "metadata": { 
+ "jupytext": { + "cell_metadata_filter": "all,-execution,-papermill,-trusted" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + }, + "papermill": { + "default_parameters": {}, + "duration": 167.355469, + "end_time": "2021-12-02T04:36:52.275357", + "environment_variables": {}, + "exception": null, + "input_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.ipynb", + "output_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.run.ipynb", + "parameters": {}, + "start_time": "2021-12-02T04:34:04.919888", + "version": "2.3.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/others/10_gpu_ari_profiling/00_cpu_version_ref/01_cpu_8_threads_no_jit.ipynb b/nbs/others/10_gpu_ari_profiling/00_cpu_version_ref/01_cpu_8_threads_no_jit.ipynb new file mode 100644 index 00000000..ac0ebe48 --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/00_cpu_version_ref/01_cpu_8_threads_no_jit.ipynb @@ -0,0 +1,1470 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "36c65ddb-8246-4b0c-a231-68a373acc2cf", + "metadata": { + "papermill": { + "duration": 0.095646, + "end_time": "2021-12-02T04:34:06.384775", + "exception": false, + "start_time": "2021-12-02T04:34:06.289129", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Description" + ] + }, + { + "cell_type": "markdown", + "id": "337633a8-d03e-4509-b89d-f8daee598958", + "metadata": { + "papermill": { + "duration": 0.091407, + "end_time": "2021-12-02T04:34:06.566258", + "exception": false, + "start_time": "2021-12-02T04:34:06.474851", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Multi-threading version of code in 
`09`" + ] + }, + { + "cell_type": "markdown", + "id": "56d47188", + "metadata": {}, + "source": [ + "## Disable Numba JIT" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "af00ffad", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: NUMBA_DISABLE_JIT=1\n" + ] + } + ], + "source": [ + "%env NUMBA_DISABLE_JIT=1" + ] + }, + { + "cell_type": "markdown", + "id": "63c2b099-cc44-4fe2-93d1-40336e0a8466", + "metadata": { + "papermill": { + "duration": 0.09056, + "end_time": "2021-12-02T04:34:06.747753", + "exception": false, + "start_time": "2021-12-02T04:34:06.657193", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Remove pycache dir" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "960c4ff0-2a73-4eaa-97d6-3269102233eb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:06.942330Z", + "iopub.status.busy": "2021-12-02T04:34:06.941822Z", + "iopub.status.idle": "2021-12-02T04:34:07.534005Z", + "shell.execute_reply": "2021-12-02T04:34:07.531916Z" + }, + "papermill": { + "duration": 0.696141, + "end_time": "2021-12-02T04:34:07.534420", + "exception": false, + "start_time": "2021-12-02T04:34:06.838279", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "!echo ${CODE_DIR}" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e18db58a-316a-445b-a376-8b2ec18e08d8", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:07.733067Z", + "iopub.status.busy": "2021-12-02T04:34:07.732593Z", + "iopub.status.idle": "2021-12-02T04:34:08.445221Z", + "shell.execute_reply": "2021-12-02T04:34:08.443302Z" + }, + "papermill": { + "duration": 0.817887, + "end_time": "2021-12-02T04:34:08.445598", + "exception": false, + "start_time": "2021-12-02T04:34:07.627711", + "status": "completed" + }, + "tags": [] + }, + 
"outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b54099b0-a990-4bbd-bcbd-e206eb0f0f0e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:08.668700Z", + "iopub.status.busy": "2021-12-02T04:34:08.668226Z", + "iopub.status.idle": "2021-12-02T04:34:09.290588Z", + "shell.execute_reply": "2021-12-02T04:34:09.288752Z" + }, + "papermill": { + "duration": 0.719545, + "end_time": "2021-12-02T04:34:09.290958", + "exception": false, + "start_time": "2021-12-02T04:34:08.571413", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -prune -exec rm -rf {} \\;" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7a9a8098-8160-46bf-8d83-bc398cbe2382", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:09.513928Z", + "iopub.status.busy": "2021-12-02T04:34:09.513465Z", + "iopub.status.idle": "2021-12-02T04:34:10.120602Z", + "shell.execute_reply": "2021-12-02T04:34:10.118684Z" + }, + "papermill": { + "duration": 0.704384, + "end_time": "2021-12-02T04:34:10.120972", + "exception": false, + "start_time": "2021-12-02T04:34:09.416588", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "markdown", + "id": "c2251313-41ac-46fd-a845-0f209689ecf6", + "metadata": { + "papermill": { + "duration": 0.093051, + "end_time": "2021-12-02T04:34:10.338738", + "exception": false, + "start_time": "2021-12-02T04:34:10.245687", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Modules" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "987ef5f1-be49-4a6c-a4f4-b24a0a2094cb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:10.528319Z", + "iopub.status.busy": 
"2021-12-02T04:34:10.527833Z", + "iopub.status.idle": "2021-12-02T04:34:10.845421Z", + "shell.execute_reply": "2021-12-02T04:34:10.845020Z" + }, + "papermill": { + "duration": 0.414249, + "end_time": "2021-12-02T04:34:10.845518", + "exception": false, + "start_time": "2021-12-02T04:34:10.431269", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "from ccc.coef import ccc\n" + ] + }, + { + "cell_type": "markdown", + "id": "24399ccb-d33d-4bad-9baf-638c9c56feb2", + "metadata": { + "papermill": { + "duration": 0.095359, + "end_time": "2021-12-02T04:34:11.037941", + "exception": false, + "start_time": "2021-12-02T04:34:10.942582", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Settings" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c609cefa-f513-4cf8-9573-367744e31c5f", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.224902Z", + "iopub.status.busy": "2021-12-02T04:34:11.224429Z", + "iopub.status.idle": "2021-12-02T04:34:11.226352Z", + "shell.execute_reply": "2021-12-02T04:34:11.226694Z" + }, + "papermill": { + "duration": 0.097459, + "end_time": "2021-12-02T04:34:11.226809", + "exception": false, + "start_time": "2021-12-02T04:34:11.129350", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_REPS = 10" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "c0341a42-b8de-419f-ab37-1e4fee9dde75", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.416790Z", + "iopub.status.busy": "2021-12-02T04:34:11.416353Z", + "iopub.status.idle": "2021-12-02T04:34:11.418488Z", + "shell.execute_reply": "2021-12-02T04:34:11.418030Z" + }, + "papermill": { + "duration": 0.098372, + "end_time": "2021-12-02T04:34:11.418578", + "exception": false, + "start_time": "2021-12-02T04:34:11.320206", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + 
"np.random.seed(0)" + ] + }, + { + "cell_type": "markdown", + "id": "6fd3067b-a4f7-475e-9575-20246934537d", + "metadata": { + "papermill": { + "duration": 0.092004, + "end_time": "2021-12-02T04:34:11.602824", + "exception": false, + "start_time": "2021-12-02T04:34:11.510820", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "02e0507c-43ff-4693-8a3b-8ccd8f23168c", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.790398Z", + "iopub.status.busy": "2021-12-02T04:34:11.789933Z", + "iopub.status.idle": "2021-12-02T04:34:20.454299Z", + "shell.execute_reply": "2021-12-02T04:34:20.453925Z" + }, + "papermill": { + "duration": 8.760307, + "end_time": "2021-12-02T04:34:20.454396", + "exception": false, + "start_time": "2021-12-02T04:34:11.694089", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.15625" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let numba compile all the code before profiling\n", + "ccc(np.random.rand(10), np.random.rand(10))" + ] + }, + { + "cell_type": "markdown", + "id": "8549179d-1517-4a40-9a51-b95dc02d0fcc", + "metadata": { + "papermill": { + "duration": 0.092955, + "end_time": "2021-12-02T04:34:20.640996", + "exception": false, + "start_time": "2021-12-02T04:34:20.548041", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` small" + ] + }, + { + "cell_type": "markdown", + "id": "13ba811b", + "metadata": { + "papermill": { + "duration": 0.092926, + "end_time": "2021-12-02T04:34:20.826776", + "exception": false, + "start_time": "2021-12-02T04:34:20.733850", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50`" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "68064f0b", + "metadata": { + "execution": { + "iopub.execute_input": 
"2021-12-02T04:34:21.026855Z", + "iopub.status.busy": "2021-12-02T04:34:21.026243Z", + "iopub.status.idle": "2021-12-02T04:34:21.028562Z", + "shell.execute_reply": "2021-12-02T04:34:21.027971Z" + }, + "papermill": { + "duration": 0.107158, + "end_time": "2021-12-02T04:34:21.028689", + "exception": false, + "start_time": "2021-12-02T04:34:20.921531", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6f2ff213-d6b7-458f-acbf-8c73dd497a2a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.225799Z", + "iopub.status.busy": "2021-12-02T04:34:21.225350Z", + "iopub.status.idle": "2021-12-02T04:34:21.226800Z", + "shell.execute_reply": "2021-12-02T04:34:21.227141Z" + }, + "papermill": { + "duration": 0.09836, + "end_time": "2021-12-02T04:34:21.227254", + "exception": false, + "start_time": "2021-12-02T04:34:21.128894", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d63012be-4fc7-4fba-bccd-ad155905d1d6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.418916Z", + "iopub.status.busy": "2021-12-02T04:34:21.418477Z", + "iopub.status.idle": "2021-12-02T04:34:21.420269Z", + "shell.execute_reply": "2021-12-02T04:34:21.420631Z" + }, + "papermill": { + "duration": 0.098786, + "end_time": "2021-12-02T04:34:21.420745", + "exception": false, + "start_time": "2021-12-02T04:34:21.321959", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def func(n_reps=N_REPS):\n", + " for i in range(n_reps):\n", + " ccc(x, y, n_jobs=8)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c0abc65a-2f3c-4476-9c2a-f8d7753b75e6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.614745Z", 
+ "iopub.status.busy": "2021-12-02T04:34:21.614223Z", + "iopub.status.idle": "2021-12-02T04:34:33.925655Z", + "shell.execute_reply": "2021-12-02T04:34:33.925209Z" + }, + "papermill": { + "duration": 12.410211, + "end_time": "2021-12-02T04:34:33.925764", + "exception": false, + "start_time": "2021-12-02T04:34:21.515553", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "60.5 ms ± 529 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e80a7b02-310f-4bd9-90d5-2a41186db39e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.125850Z", + "iopub.status.busy": "2021-12-02T04:34:34.125263Z", + "iopub.status.idle": "2021-12-02T04:34:34.148529Z", + "shell.execute_reply": "2021-12-02T04:34:34.148886Z" + }, + "papermill": { + "duration": 0.124845, + "end_time": "2021-12-02T04:34:34.149006", + "exception": false, + "start_time": "2021-12-02T04:34:34.024161", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_small_50.txt'. 
\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 8093 function calls (8083 primitive calls) in 0.066 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 136 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.066 0.066 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.066 0.066 :1()\n", + " 1 0.000 0.000 0.066 0.066 158102722.py:1(func)\n", + " 10 0.000 0.000 0.066 0.007 impl.py:307(ccc)\n", + " 614 0.059 0.000 0.059 0.000 {method 'acquire' of '_thread.lock' objects}\n", + " 146 0.000 0.000 0.059 0.000 threading.py:280(wait)\n", + " 80 0.000 0.000 0.053 0.001 threading.py:563(wait)\n", + " 10 0.000 0.000 0.053 0.005 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 0.052 0.005 impl.py:485(cdist_func)\n", + " 10 0.000 0.000 0.052 0.005 impl.py:192(cdist_parts_parallel)\n", + " 80 0.000 0.000 0.034 0.000 thread.py:161(submit)\n", + " 80 0.000 0.000 0.033 0.000 thread.py:180(_adjust_thread_count)\n", + " 54 0.000 0.000 0.032 0.001 threading.py:880(start)\n", + " 10 0.000 0.000 0.029 0.003 impl.py:210()\n", + " 70 0.000 0.000 0.023 0.000 _base.py:201(as_completed)\n", + " 80 0.000 0.000 0.006 0.000 _base.py:418(result)\n", + " 30 0.000 0.000 0.006 0.000 _base.py:602(result_iterator)\n", + " 10 0.000 0.000 0.005 0.001 _base.py:573(map)\n", + " 10 0.000 0.000 0.005 0.001 _base.py:598()\n", + " 10 0.000 0.000 0.001 0.000 _base.py:636(__exit__)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_small_50.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "2548440c", + "metadata": { + "papermill": { + "duration": 0.094866, + "end_time": "2021-12-02T04:34:34.338010", + "exception": false, + "start_time": "2021-12-02T04:34:34.243144", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100`" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": 
"ddb768d7-9b74-424c-bdb9-9e52b9e5b181", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.530550Z", + "iopub.status.busy": "2021-12-02T04:34:34.530085Z", + "iopub.status.idle": "2021-12-02T04:34:34.531559Z", + "shell.execute_reply": "2021-12-02T04:34:34.531910Z" + }, + "papermill": { + "duration": 0.099465, + "end_time": "2021-12-02T04:34:34.532025", + "exception": false, + "start_time": "2021-12-02T04:34:34.432560", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "161cf922-41f7-4ced-af1f-e3d25b36b200", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.723894Z", + "iopub.status.busy": "2021-12-02T04:34:34.723405Z", + "iopub.status.idle": "2021-12-02T04:34:34.725017Z", + "shell.execute_reply": "2021-12-02T04:34:34.725362Z" + }, + "papermill": { + "duration": 0.099541, + "end_time": "2021-12-02T04:34:34.725479", + "exception": false, + "start_time": "2021-12-02T04:34:34.625938", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "0a7f2b1f-4b87-4dde-a46b-2acec5ac93ba", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:35.120950Z", + "iopub.status.busy": "2021-12-02T04:34:35.120480Z", + "iopub.status.idle": "2021-12-02T04:34:37.984893Z", + "shell.execute_reply": "2021-12-02T04:34:37.985354Z" + }, + "papermill": { + "duration": 2.971389, + "end_time": "2021-12-02T04:34:37.985494", + "exception": false, + "start_time": "2021-12-02T04:34:35.014105", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "167 ms ± 773 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "74acbe27-9807-4f26-8b7e-b74d0f745b63", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.190471Z", + "iopub.status.busy": "2021-12-02T04:34:38.189361Z", + "iopub.status.idle": "2021-12-02T04:34:38.227298Z", + "shell.execute_reply": "2021-12-02T04:34:38.227692Z" + }, + "papermill": { + "duration": 0.13744, + "end_time": "2021-12-02T04:34:38.227812", + "exception": false, + "start_time": "2021-12-02T04:34:38.090372", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_small_100.txt'. \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 11003 function calls (10993 primitive calls) in 0.172 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 125 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.172 0.172 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.172 0.172 :1()\n", + " 1 0.000 0.000 0.172 0.172 158102722.py:1(func)\n", + " 10 0.000 0.000 0.172 0.017 impl.py:307(ccc)\n", + " 879 0.162 0.000 0.162 0.000 {method 'acquire' of '_thread.lock' objects}\n", + " 207 0.000 0.000 0.161 0.001 threading.py:280(wait)\n", + " 10 0.000 0.000 0.154 0.015 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 0.153 0.015 impl.py:485(cdist_func)\n", + " 10 0.001 0.000 0.153 0.015 impl.py:192(cdist_parts_parallel)\n", + " 120 0.000 0.000 0.151 0.001 threading.py:563(wait)\n", + " 100 0.000 0.000 0.084 0.001 _base.py:201(as_completed)\n", + " 110 0.000 0.000 0.073 0.001 thread.py:161(submit)\n", + " 110 0.000 0.000 0.072 0.001 thread.py:180(_adjust_thread_count)\n", + " 75 0.000 0.000 0.070 0.001 threading.py:880(start)\n", + 
" 10 0.000 0.000 0.069 0.007 impl.py:210()\n", + " 110 0.000 0.000 0.010 0.000 _base.py:418(result)\n", + " 30 0.000 0.000 0.010 0.000 _base.py:602(result_iterator)\n", + " 10 0.000 0.000 0.004 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.004 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.003 0.000 _base.py:636(__exit__)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_small_100.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "611ff8e1", + "metadata": { + "papermill": { + "duration": 0.09562, + "end_time": "2021-12-02T04:34:38.420643", + "exception": false, + "start_time": "2021-12-02T04:34:38.325023", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=500`" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "4bcf4b42", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.617318Z", + "iopub.status.busy": "2021-12-02T04:34:38.616836Z", + "iopub.status.idle": "2021-12-02T04:34:38.618469Z", + "shell.execute_reply": "2021-12-02T04:34:38.618817Z" + }, + "papermill": { + "duration": 0.101998, + "end_time": "2021-12-02T04:34:38.618933", + "exception": false, + "start_time": "2021-12-02T04:34:38.516935", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 500" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "0bf2f21e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.813589Z", + "iopub.status.busy": "2021-12-02T04:34:38.813117Z", + "iopub.status.idle": "2021-12-02T04:34:38.815386Z", + "shell.execute_reply": "2021-12-02T04:34:38.815020Z" + }, + "papermill": { + "duration": 0.100616, + "end_time": "2021-12-02T04:34:38.815484", + "exception": false, + "start_time": "2021-12-02T04:34:38.714868", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 21, + "id": "cbde4ce6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:39.216775Z", + "iopub.status.busy": "2021-12-02T04:34:39.216014Z", + "iopub.status.idle": "2021-12-02T04:34:43.223179Z", + "shell.execute_reply": "2021-12-02T04:34:43.223640Z" + }, + "papermill": { + "duration": 4.108094, + "end_time": "2021-12-02T04:34:43.223780", + "exception": false, + "start_time": "2021-12-02T04:34:39.115686", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "184 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "1250547e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.427169Z", + "iopub.status.busy": "2021-12-02T04:34:43.426487Z", + "iopub.status.idle": "2021-12-02T04:34:43.474288Z", + "shell.execute_reply": "2021-12-02T04:34:43.473832Z" + }, + "papermill": { + "duration": 0.148908, + "end_time": "2021-12-02T04:34:43.474385", + "exception": false, + "start_time": "2021-12-02T04:34:43.325477", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_small_500.txt'. 
\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 12028 function calls (12018 primitive calls) in 0.187 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 125 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.187 0.187 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.187 0.187 :1()\n", + " 1 0.000 0.000 0.187 0.187 158102722.py:1(func)\n", + " 10 0.001 0.000 0.187 0.019 impl.py:307(ccc)\n", + " 1054 0.174 0.000 0.174 0.000 {method 'acquire' of '_thread.lock' objects}\n", + " 252 0.000 0.000 0.173 0.001 threading.py:280(wait)\n", + " 10 0.000 0.000 0.166 0.017 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 0.166 0.017 impl.py:485(cdist_func)\n", + " 10 0.003 0.000 0.165 0.017 impl.py:192(cdist_parts_parallel)\n", + " 146 0.000 0.000 0.161 0.001 threading.py:563(wait)\n", + " 100 0.000 0.000 0.122 0.001 _base.py:201(as_completed)\n", + " 110 0.000 0.000 0.045 0.000 thread.py:161(submit)\n", + " 110 0.000 0.000 0.044 0.000 thread.py:180(_adjust_thread_count)\n", + " 80 0.000 0.000 0.042 0.001 threading.py:880(start)\n", + " 10 0.000 0.000 0.041 0.004 impl.py:210()\n", + " 110 0.000 0.000 0.013 0.000 _base.py:418(result)\n", + " 30 0.000 0.000 0.013 0.000 _base.py:602(result_iterator)\n", + " 10 0.000 0.000 0.005 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.005 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.002 0.000 _base.py:636(__exit__)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_small_500.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "6853c300", + "metadata": { + "papermill": { + "duration": 0.09707, + "end_time": "2021-12-02T04:34:43.669776", + "exception": false, + "start_time": "2021-12-02T04:34:43.572706", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=1000`" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": 
"f77e8490", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.871037Z", + "iopub.status.busy": "2021-12-02T04:34:43.870573Z", + "iopub.status.idle": "2021-12-02T04:34:43.872099Z", + "shell.execute_reply": "2021-12-02T04:34:43.872442Z" + }, + "papermill": { + "duration": 0.103066, + "end_time": "2021-12-02T04:34:43.872558", + "exception": false, + "start_time": "2021-12-02T04:34:43.769492", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 1000" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "c99f544a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.102128Z", + "iopub.status.busy": "2021-12-02T04:34:44.101540Z", + "iopub.status.idle": "2021-12-02T04:34:44.103376Z", + "shell.execute_reply": "2021-12-02T04:34:44.103817Z" + }, + "papermill": { + "duration": 0.13265, + "end_time": "2021-12-02T04:34:44.103967", + "exception": false, + "start_time": "2021-12-02T04:34:43.971317", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "9721b048", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.515965Z", + "iopub.status.busy": "2021-12-02T04:34:44.515462Z", + "iopub.status.idle": "2021-12-02T04:34:50.907897Z", + "shell.execute_reply": "2021-12-02T04:34:50.908362Z" + }, + "papermill": { + "duration": 6.496377, + "end_time": "2021-12-02T04:34:50.908503", + "exception": false, + "start_time": "2021-12-02T04:34:44.412126", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "581 ms ± 12.2 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "fd0f4dd6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.122723Z", + "iopub.status.busy": "2021-12-02T04:34:51.122244Z", + "iopub.status.idle": "2021-12-02T04:34:51.194219Z", + "shell.execute_reply": "2021-12-02T04:34:51.193813Z" + }, + "papermill": { + "duration": 0.179898, + "end_time": "2021-12-02T04:34:51.194319", + "exception": false, + "start_time": "2021-12-02T04:34:51.014421", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_small_1000.txt'. \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 12578 function calls (12568 primitive calls) in 0.596 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 125 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.596 0.596 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.596 0.596 :1()\n", + " 1 0.000 0.000 0.596 0.596 158102722.py:1(func)\n", + " 10 0.001 0.000 0.596 0.060 impl.py:307(ccc)\n", + " 1150 0.581 0.001 0.581 0.001 {method 'acquire' of '_thread.lock' objects}\n", + " 275 0.001 0.000 0.579 0.002 threading.py:280(wait)\n", + " 10 0.000 0.000 0.570 0.057 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 0.570 0.057 impl.py:485(cdist_func)\n", + " 10 0.004 0.000 0.570 0.057 impl.py:192(cdist_parts_parallel)\n", + " 170 0.000 0.000 0.562 0.003 threading.py:563(wait)\n", + " 100 0.001 0.000 0.540 0.005 _base.py:201(as_completed)\n", + " 110 0.000 0.000 0.028 0.000 thread.py:161(submit)\n", + " 110 0.000 0.000 0.027 0.000 thread.py:180(_adjust_thread_count)\n", + " 10 0.000 0.000 0.026 0.003 impl.py:210()\n", + " 80 0.000 0.000 0.025 0.000 
threading.py:880(start)\n", + " 110 0.000 0.000 0.018 0.000 _base.py:418(result)\n", + " 30 0.000 0.000 0.018 0.001 _base.py:602(result_iterator)\n", + " 10 0.000 0.000 0.003 0.000 _base.py:636(__exit__)\n", + " 10 0.000 0.000 0.003 0.000 thread.py:216(shutdown)\n", + " 80 0.000 0.000 0.003 0.000 threading.py:1028(join)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_small_1000.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "f6b9adcf-1da4-4496-b05f-47952fd80d7f", + "metadata": { + "papermill": { + "duration": 0.099861, + "end_time": "2021-12-02T04:34:51.394504", + "exception": false, + "start_time": "2021-12-02T04:34:51.294643", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` large" + ] + }, + { + "cell_type": "markdown", + "id": "8f2e407c", + "metadata": { + "papermill": { + "duration": 0.098767, + "end_time": "2021-12-02T04:34:51.593072", + "exception": false, + "start_time": "2021-12-02T04:34:51.494305", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50000`" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "c522396e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.793258Z", + "iopub.status.busy": "2021-12-02T04:34:51.792385Z", + "iopub.status.idle": "2021-12-02T04:34:51.794710Z", + "shell.execute_reply": "2021-12-02T04:34:51.795054Z" + }, + "papermill": { + "duration": 0.103749, + "end_time": "2021-12-02T04:34:51.795172", + "exception": false, + "start_time": "2021-12-02T04:34:51.691423", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50000" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "a5e536cc", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.996666Z", + "iopub.status.busy": "2021-12-02T04:34:51.996120Z", + "iopub.status.idle": "2021-12-02T04:34:51.999329Z", + "shell.execute_reply": 
"2021-12-02T04:34:51.998896Z" + }, + "papermill": { + "duration": 0.104554, + "end_time": "2021-12-02T04:34:51.999423", + "exception": false, + "start_time": "2021-12-02T04:34:51.894869", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "91470f64", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:52.408864Z", + "iopub.status.busy": "2021-12-02T04:34:52.408310Z", + "iopub.status.idle": "2021-12-02T04:35:28.383597Z", + "shell.execute_reply": "2021-12-02T04:35:28.384010Z" + }, + "papermill": { + "duration": 36.078113, + "end_time": "2021-12-02T04:35:28.384125", + "exception": false, + "start_time": "2021-12-02T04:34:52.306012", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.33 s ± 6.31 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "4de4e0b0", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:28.594907Z", + "iopub.status.busy": "2021-12-02T04:35:28.594395Z", + "iopub.status.idle": "2021-12-02T04:35:30.850079Z", + "shell.execute_reply": "2021-12-02T04:35:30.850466Z" + }, + "papermill": { + "duration": 2.36055, + "end_time": "2021-12-02T04:35:30.850581", + "exception": false, + "start_time": "2021-12-02T04:35:28.490031", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_large_50000.txt'. 
\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 12538 function calls (12528 primitive calls) in 1.339 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 125 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 1.339 1.339 {built-in method builtins.exec}\n", + " 1 0.000 0.000 1.339 1.339 :1()\n", + " 1 0.000 0.000 1.339 1.339 158102722.py:1(func)\n", + " 10 0.002 0.000 1.338 0.134 impl.py:307(ccc)\n", + " 1144 1.307 0.001 1.307 0.001 {method 'acquire' of '_thread.lock' objects}\n", + " 273 0.001 0.000 1.305 0.005 threading.py:280(wait)\n", + " 10 0.000 0.000 0.869 0.087 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 0.868 0.087 impl.py:485(cdist_func)\n", + " 10 0.005 0.000 0.868 0.087 impl.py:192(cdist_parts_parallel)\n", + " 170 0.000 0.000 0.854 0.005 threading.py:563(wait)\n", + " 100 0.001 0.000 0.843 0.008 _base.py:201(as_completed)\n", + " 110 0.000 0.000 0.452 0.004 _base.py:418(result)\n", + " 30 0.000 0.000 0.452 0.015 _base.py:602(result_iterator)\n", + " 10 0.005 0.000 0.020 0.002 impl.py:210()\n", + " 110 0.000 0.000 0.017 0.000 thread.py:161(submit)\n", + " 110 0.000 0.000 0.016 0.000 thread.py:180(_adjust_thread_count)\n", + " 80 0.000 0.000 0.014 0.000 threading.py:880(start)\n", + " 50 0.007 0.000 0.007 0.000 {built-in method numpy.zeros}\n", + " 10 0.000 0.000 0.005 0.000 _base.py:636(__exit__)\n", + " 10 0.000 0.000 0.005 0.000 thread.py:216(shutdown)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_large_50000.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "b0c07894", + "metadata": { + "papermill": { + "duration": 0.100749, + "end_time": "2021-12-02T04:35:31.053465", + "exception": false, + "start_time": "2021-12-02T04:35:30.952716", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100000`" + ] + }, + { + "cell_type": "code", + 
"execution_count": 31, + "id": "7fb2ab2e-a6de-412b-9540-d00f8641290e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.262806Z", + "iopub.status.busy": "2021-12-02T04:35:31.262280Z", + "iopub.status.idle": "2021-12-02T04:35:31.264634Z", + "shell.execute_reply": "2021-12-02T04:35:31.264124Z" + }, + "papermill": { + "duration": 0.110468, + "end_time": "2021-12-02T04:35:31.264738", + "exception": false, + "start_time": "2021-12-02T04:35:31.154270", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100000" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "81765e91", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.472530Z", + "iopub.status.busy": "2021-12-02T04:35:31.472083Z", + "iopub.status.idle": "2021-12-02T04:35:31.475862Z", + "shell.execute_reply": "2021-12-02T04:35:31.475424Z" + }, + "papermill": { + "duration": 0.107444, + "end_time": "2021-12-02T04:35:31.476016", + "exception": false, + "start_time": "2021-12-02T04:35:31.368572", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "aca57100", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.892045Z", + "iopub.status.busy": "2021-12-02T04:35:31.891417Z", + "iopub.status.idle": "2021-12-02T04:36:46.681345Z", + "shell.execute_reply": "2021-12-02T04:36:46.681695Z" + }, + "papermill": { + "duration": 74.896315, + "end_time": "2021-12-02T04:36:46.681806", + "exception": false, + "start_time": "2021-12-02T04:35:31.785491", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.06 s ± 6.59 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "b9c25f30", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:36:46.889124Z", + "iopub.status.busy": "2021-12-02T04:36:46.888441Z", + "iopub.status.idle": "2021-12-02T04:36:51.544747Z", + "shell.execute_reply": "2021-12-02T04:36:51.544315Z" + }, + "papermill": { + "duration": 4.761869, + "end_time": "2021-12-02T04:36:51.544839", + "exception": false, + "start_time": "2021-12-02T04:36:46.782970", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_large_100000.txt'. \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 12526 function calls (12516 primitive calls) in 2.065 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 125 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 2.065 2.065 {built-in method builtins.exec}\n", + " 1 0.000 0.000 2.064 2.064 :1()\n", + " 1 0.000 0.000 2.064 2.064 158102722.py:1(func)\n", + " 10 0.004 0.000 2.064 0.206 impl.py:307(ccc)\n", + " 1142 2.024 0.002 2.024 0.002 {method 'acquire' of '_thread.lock' objects}\n", + " 271 0.001 0.000 2.022 0.007 threading.py:280(wait)\n", + " 10 0.000 0.000 1.111 0.111 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 1.110 0.111 impl.py:485(cdist_func)\n", + " 10 0.003 0.000 1.110 0.111 impl.py:192(cdist_parts_parallel)\n", + " 170 0.000 0.000 1.095 0.006 threading.py:563(wait)\n", + " 100 0.001 0.000 1.085 0.011 _base.py:201(as_completed)\n", + " 110 0.000 0.000 0.928 0.008 _base.py:418(result)\n", + " 30 0.000 0.000 0.927 0.031 _base.py:602(result_iterator)\n", + " 10 0.006 0.001 0.021 0.002 impl.py:210()\n", + " 110 0.000 0.000 0.017 0.000 
thread.py:161(submit)\n", + " 110 0.000 0.000 0.016 0.000 thread.py:180(_adjust_thread_count)\n", + " 80 0.000 0.000 0.014 0.000 threading.py:880(start)\n", + " 50 0.013 0.000 0.013 0.000 {built-in method numpy.zeros}\n", + " 10 0.000 0.000 0.005 0.000 _base.py:636(__exit__)\n", + " 10 0.000 0.000 0.005 0.000 thread.py:216(shutdown)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_large_100000.txt\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aa76596-f126-4ba8-8bba-4d31225e0e5d", + "metadata": { + "papermill": { + "duration": 0.102208, + "end_time": "2021-12-02T04:36:51.764154", + "exception": false, + "start_time": "2021-12-02T04:36:51.661946", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all,-execution,-papermill,-trusted" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "papermill": { + "default_parameters": {}, + "duration": 167.355469, + "end_time": "2021-12-02T04:36:52.275357", + "environment_variables": {}, + "exception": null, + "input_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.ipynb", + "output_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.run.ipynb", + "parameters": {}, + "start_time": "2021-12-02T04:34:04.919888", + "version": "2.3.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/others/10_gpu_ari_profiling/01_ari_cuda_v0/00-n_samples_large_100000.txt b/nbs/others/10_gpu_ari_profiling/01_ari_cuda_v0/00-n_samples_large_100000.txt new file mode 100644 index 00000000..a522db21 --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/01_ari_cuda_v0/00-n_samples_large_100000.txt @@ -0,0 +1,26 @@ + 8334 function calls in 7.515 seconds + + Ordered by: cumulative time + List reduced from 114 to 20 due to restriction <20> + + ncalls tottime 
percall cumtime percall filename:lineno(function) + 1 0.000 0.000 7.515 7.515 {built-in method builtins.exec} + 1 0.000 0.000 7.515 7.515 :1() + 1 0.000 0.000 7.515 7.515 2661685993.py:1(func) + 10 0.008 0.001 7.515 0.751 impl.py:307(ccc) + 200 0.001 0.000 7.490 0.037 threading.py:280(wait) + 790 7.489 0.009 7.489 0.009 {method 'acquire' of '_thread.lock' objects} + 10 0.001 0.000 5.731 0.573 impl.py:492(compute_coef) + 10 0.000 0.000 5.730 0.573 impl.py:485(cdist_func) + 10 0.001 0.000 5.729 0.573 impl.py:192(cdist_parts_parallel) + 100 0.001 0.000 5.724 0.057 _base.py:201(as_completed) + 100 0.000 0.000 5.723 0.057 threading.py:563(wait) + 100 0.000 0.000 1.767 0.018 _base.py:418(result) + 20 0.000 0.000 1.767 0.088 _base.py:602(result_iterator) + 50 0.005 0.000 0.005 0.000 {built-in method numpy.zeros} + 100 0.000 0.000 0.004 0.000 thread.py:161(submit) + 10 0.002 0.000 0.003 0.000 impl.py:210() + 100 0.000 0.000 0.003 0.000 thread.py:180(_adjust_thread_count) + 10 0.000 0.000 0.002 0.000 _base.py:573(map) + 10 0.000 0.000 0.002 0.000 _base.py:598() + 10 0.000 0.000 0.002 0.000 threading.py:880(start) \ No newline at end of file diff --git a/nbs/others/10_gpu_ari_profiling/01_ari_cuda_v0/00-n_samples_large_50000.txt b/nbs/others/10_gpu_ari_profiling/01_ari_cuda_v0/00-n_samples_large_50000.txt new file mode 100644 index 00000000..e5ba8949 --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/01_ari_cuda_v0/00-n_samples_large_50000.txt @@ -0,0 +1,26 @@ + 8334 function calls in 4.119 seconds + + Ordered by: cumulative time + List reduced from 114 to 20 due to restriction <20> + + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.000 0.000 4.119 4.119 {built-in method builtins.exec} + 1 0.000 0.000 4.119 4.119 :1() + 1 0.012 0.012 4.119 4.119 2661685993.py:1(func) + 10 0.005 0.000 4.107 0.411 impl.py:307(ccc) + 200 0.001 0.000 4.088 0.020 threading.py:280(wait) + 790 4.087 0.005 4.087 0.005 {method 'acquire' of '_thread.lock' objects} + 10 0.001 
0.000 3.239 0.324 impl.py:492(compute_coef) + 10 0.000 0.000 3.238 0.324 impl.py:485(cdist_func) + 10 0.002 0.000 3.237 0.324 impl.py:192(cdist_parts_parallel) + 100 0.001 0.000 3.232 0.032 _base.py:201(as_completed) + 100 0.000 0.000 3.231 0.032 threading.py:563(wait) + 100 0.000 0.000 0.857 0.009 _base.py:418(result) + 20 0.000 0.000 0.857 0.043 _base.py:602(result_iterator) + 100 0.000 0.000 0.004 0.000 thread.py:161(submit) + 10 0.002 0.000 0.003 0.000 impl.py:210() + 100 0.000 0.000 0.003 0.000 thread.py:180(_adjust_thread_count) + 50 0.003 0.000 0.003 0.000 {built-in method numpy.zeros} + 10 0.000 0.000 0.002 0.000 _base.py:573(map) + 10 0.000 0.000 0.002 0.000 _base.py:598() + 10 0.000 0.000 0.002 0.000 threading.py:880(start) \ No newline at end of file diff --git a/nbs/others/10_gpu_ari_profiling/01_ari_cuda_v0/00-n_samples_small_100.txt b/nbs/others/10_gpu_ari_profiling/01_ari_cuda_v0/00-n_samples_small_100.txt new file mode 100644 index 00000000..a1d578c2 --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/01_ari_cuda_v0/00-n_samples_small_100.txt @@ -0,0 +1,26 @@ + 8334 function calls in 0.759 seconds + + Ordered by: cumulative time + List reduced from 114 to 20 due to restriction <20> + + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.000 0.000 0.759 0.759 {built-in method builtins.exec} + 1 0.000 0.000 0.759 0.759 :1() + 1 0.000 0.000 0.759 0.759 2661685993.py:1(func) + 10 0.001 0.000 0.759 0.076 impl.py:307(ccc) + 10 0.000 0.000 0.751 0.075 impl.py:492(compute_coef) + 10 0.000 0.000 0.750 0.075 impl.py:485(cdist_func) + 10 0.002 0.000 0.750 0.075 impl.py:192(cdist_parts_parallel) + 200 0.001 0.000 0.747 0.004 threading.py:280(wait) + 100 0.001 0.000 0.746 0.007 _base.py:201(as_completed) + 790 0.746 0.001 0.746 0.001 {method 'acquire' of '_thread.lock' objects} + 100 0.000 0.000 0.745 0.007 threading.py:563(wait) + 100 0.000 0.000 0.004 0.000 thread.py:161(submit) + 100 0.000 0.000 0.003 0.000 _base.py:418(result) + 100 0.000 
0.000 0.003 0.000 thread.py:180(_adjust_thread_count) + 20 0.000 0.000 0.003 0.000 _base.py:602(result_iterator) + 10 0.000 0.000 0.003 0.000 _base.py:573(map) + 10 0.000 0.000 0.003 0.000 _base.py:598() + 10 0.000 0.000 0.002 0.000 threading.py:880(start) + 10 0.001 0.000 0.002 0.000 impl.py:210() + 190 0.001 0.000 0.001 0.000 _base.py:179(_yield_finished_futures) \ No newline at end of file diff --git a/nbs/others/10_gpu_ari_profiling/01_ari_cuda_v0/00-n_samples_small_1000.txt b/nbs/others/10_gpu_ari_profiling/01_ari_cuda_v0/00-n_samples_small_1000.txt new file mode 100644 index 00000000..7c3d8e70 --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/01_ari_cuda_v0/00-n_samples_small_1000.txt @@ -0,0 +1,26 @@ + 8334 function calls in 0.812 seconds + + Ordered by: cumulative time + List reduced from 114 to 20 due to restriction <20> + + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.000 0.000 0.812 0.812 {built-in method builtins.exec} + 1 0.000 0.000 0.812 0.812 :1() + 1 0.000 0.000 0.812 0.812 2661685993.py:1(func) + 10 0.001 0.000 0.812 0.081 impl.py:307(ccc) + 200 0.001 0.000 0.801 0.004 threading.py:280(wait) + 790 0.801 0.001 0.801 0.001 {method 'acquire' of '_thread.lock' objects} + 10 0.000 0.000 0.796 0.080 impl.py:492(compute_coef) + 10 0.000 0.000 0.796 0.080 impl.py:485(cdist_func) + 10 0.002 0.000 0.795 0.080 impl.py:192(cdist_parts_parallel) + 100 0.001 0.000 0.792 0.008 _base.py:201(as_completed) + 100 0.000 0.000 0.791 0.008 threading.py:563(wait) + 100 0.000 0.000 0.011 0.000 _base.py:418(result) + 20 0.000 0.000 0.011 0.001 _base.py:602(result_iterator) + 100 0.000 0.000 0.003 0.000 thread.py:161(submit) + 100 0.000 0.000 0.003 0.000 thread.py:180(_adjust_thread_count) + 10 0.000 0.000 0.002 0.000 _base.py:573(map) + 10 0.000 0.000 0.002 0.000 _base.py:598() + 10 0.000 0.000 0.002 0.000 threading.py:880(start) + 10 0.000 0.000 0.001 0.000 impl.py:210() + 190 0.001 0.000 0.001 0.000 _base.py:179(_yield_finished_futures) \ No 
newline at end of file diff --git a/nbs/others/10_gpu_ari_profiling/01_ari_cuda_v0/00-n_samples_small_50.txt b/nbs/others/10_gpu_ari_profiling/01_ari_cuda_v0/00-n_samples_small_50.txt new file mode 100644 index 00000000..7f895ad5 --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/01_ari_cuda_v0/00-n_samples_small_50.txt @@ -0,0 +1,26 @@ + 6144 function calls in 0.346 seconds + + Ordered by: cumulative time + List reduced from 114 to 20 due to restriction <20> + + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.000 0.000 0.346 0.346 {built-in method builtins.exec} + 1 0.000 0.000 0.346 0.346 :1() + 1 0.000 0.000 0.346 0.346 2661685993.py:1(func) + 10 0.001 0.000 0.345 0.035 impl.py:307(ccc) + 10 0.000 0.000 0.340 0.034 impl.py:492(compute_coef) + 10 0.000 0.000 0.339 0.034 impl.py:485(cdist_func) + 10 0.001 0.000 0.339 0.034 impl.py:192(cdist_parts_parallel) + 140 0.001 0.000 0.337 0.002 threading.py:280(wait) + 550 0.336 0.001 0.336 0.001 {method 'acquire' of '_thread.lock' objects} + 70 0.001 0.000 0.336 0.005 _base.py:201(as_completed) + 70 0.000 0.000 0.336 0.005 threading.py:563(wait) + 70 0.000 0.000 0.003 0.000 thread.py:161(submit) + 70 0.000 0.000 0.003 0.000 thread.py:180(_adjust_thread_count) + 10 0.000 0.000 0.003 0.000 _base.py:573(map) + 10 0.000 0.000 0.003 0.000 _base.py:598() + 10 0.000 0.000 0.002 0.000 threading.py:880(start) + 70 0.000 0.000 0.001 0.000 _base.py:418(result) + 20 0.000 0.000 0.001 0.000 _base.py:602(result_iterator) + 10 0.000 0.000 0.001 0.000 impl.py:210() + 10 0.000 0.000 0.001 0.000 _base.py:636(__exit__) \ No newline at end of file diff --git a/nbs/others/10_gpu_ari_profiling/01_ari_cuda_v0/00-n_samples_small_500.txt b/nbs/others/10_gpu_ari_profiling/01_ari_cuda_v0/00-n_samples_small_500.txt new file mode 100644 index 00000000..b7ab5d10 --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/01_ari_cuda_v0/00-n_samples_small_500.txt @@ -0,0 +1,26 @@ + 8334 function calls in 0.786 seconds + + Ordered by: 
cumulative time + List reduced from 114 to 20 due to restriction <20> + + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.000 0.000 0.786 0.786 {built-in method builtins.exec} + 1 0.000 0.000 0.786 0.786 :1() + 1 0.000 0.000 0.786 0.786 2661685993.py:1(func) + 10 0.001 0.000 0.786 0.079 impl.py:307(ccc) + 200 0.001 0.000 0.775 0.004 threading.py:280(wait) + 790 0.774 0.001 0.774 0.001 {method 'acquire' of '_thread.lock' objects} + 10 0.000 0.000 0.774 0.077 impl.py:492(compute_coef) + 10 0.000 0.000 0.773 0.077 impl.py:485(cdist_func) + 10 0.002 0.000 0.773 0.077 impl.py:192(cdist_parts_parallel) + 100 0.001 0.000 0.770 0.008 _base.py:201(as_completed) + 100 0.000 0.000 0.769 0.008 threading.py:563(wait) + 100 0.000 0.000 0.008 0.000 _base.py:418(result) + 20 0.000 0.000 0.007 0.000 _base.py:602(result_iterator) + 100 0.000 0.000 0.003 0.000 thread.py:161(submit) + 100 0.000 0.000 0.003 0.000 thread.py:180(_adjust_thread_count) + 10 0.000 0.000 0.002 0.000 _base.py:573(map) + 10 0.000 0.000 0.002 0.000 _base.py:598() + 10 0.000 0.000 0.002 0.000 threading.py:880(start) + 10 0.000 0.000 0.001 0.000 impl.py:210() + 190 0.001 0.000 0.001 0.000 _base.py:179(_yield_finished_futures) \ No newline at end of file diff --git a/nbs/others/10_gpu_ari_profiling/01_ari_cuda_v0/00_cuda_ari.ipynb b/nbs/others/10_gpu_ari_profiling/01_ari_cuda_v0/00_cuda_ari.ipynb new file mode 100644 index 00000000..2c1769ed --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/01_ari_cuda_v0/00_cuda_ari.ipynb @@ -0,0 +1,1595 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "36c65ddb-8246-4b0c-a231-68a373acc2cf", + "metadata": { + "papermill": { + "duration": 0.095646, + "end_time": "2021-12-02T04:34:06.384775", + "exception": false, + "start_time": "2021-12-02T04:34:06.289129", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Description" + ] + }, + { + "cell_type": "markdown", + "id": "337633a8-d03e-4509-b89d-f8daee598958", + "metadata": { + 
"papermill": { + "duration": 0.091407, + "end_time": "2021-12-02T04:34:06.566258", + "exception": false, + "start_time": "2021-12-02T04:34:06.474851", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Multi-threading version of code in `09`" + ] + }, + { + "cell_type": "markdown", + "id": "63c2b099-cc44-4fe2-93d1-40336e0a8466", + "metadata": { + "papermill": { + "duration": 0.09056, + "end_time": "2021-12-02T04:34:06.747753", + "exception": false, + "start_time": "2021-12-02T04:34:06.657193", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Remove pycache dir" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "960c4ff0-2a73-4eaa-97d6-3269102233eb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:06.942330Z", + "iopub.status.busy": "2021-12-02T04:34:06.941822Z", + "iopub.status.idle": "2021-12-02T04:34:07.534005Z", + "shell.execute_reply": "2021-12-02T04:34:07.531916Z" + }, + "papermill": { + "duration": 0.696141, + "end_time": "2021-12-02T04:34:07.534420", + "exception": false, + "start_time": "2021-12-02T04:34:06.838279", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "!echo ${CODE_DIR}" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e18db58a-316a-445b-a376-8b2ec18e08d8", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:07.733067Z", + "iopub.status.busy": "2021-12-02T04:34:07.732593Z", + "iopub.status.idle": "2021-12-02T04:34:08.445221Z", + "shell.execute_reply": "2021-12-02T04:34:08.443302Z" + }, + "papermill": { + "duration": 0.817887, + "end_time": "2021-12-02T04:34:08.445598", + "exception": false, + "start_time": "2021-12-02T04:34:07.627711", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "code", + 
"execution_count": 3, + "id": "b54099b0-a990-4bbd-bcbd-e206eb0f0f0e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:08.668700Z", + "iopub.status.busy": "2021-12-02T04:34:08.668226Z", + "iopub.status.idle": "2021-12-02T04:34:09.290588Z", + "shell.execute_reply": "2021-12-02T04:34:09.288752Z" + }, + "papermill": { + "duration": 0.719545, + "end_time": "2021-12-02T04:34:09.290958", + "exception": false, + "start_time": "2021-12-02T04:34:08.571413", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -prune -exec rm -rf {} \\;" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7a9a8098-8160-46bf-8d83-bc398cbe2382", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:09.513928Z", + "iopub.status.busy": "2021-12-02T04:34:09.513465Z", + "iopub.status.idle": "2021-12-02T04:34:10.120602Z", + "shell.execute_reply": "2021-12-02T04:34:10.118684Z" + }, + "papermill": { + "duration": 0.704384, + "end_time": "2021-12-02T04:34:10.120972", + "exception": false, + "start_time": "2021-12-02T04:34:09.416588", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "markdown", + "id": "c2251313-41ac-46fd-a845-0f209689ecf6", + "metadata": { + "papermill": { + "duration": 0.093051, + "end_time": "2021-12-02T04:34:10.338738", + "exception": false, + "start_time": "2021-12-02T04:34:10.245687", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Modules" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "987ef5f1-be49-4a6c-a4f4-b24a0a2094cb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:10.528319Z", + "iopub.status.busy": "2021-12-02T04:34:10.527833Z", + "iopub.status.idle": "2021-12-02T04:34:10.845421Z", + "shell.execute_reply": "2021-12-02T04:34:10.845020Z" + }, + 
"papermill": { + "duration": 0.414249, + "end_time": "2021-12-02T04:34:10.845518", + "exception": false, + "start_time": "2021-12-02T04:34:10.431269", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2024-06-04 09:14:42,188 - numba.cuda.cudadrv.driver] INFO: init\n", + "[2024-06-04 09:14:42,300 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-04 09:14:42,301 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-04 09:14:42,301 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,301 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,302 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 16 bytes\n", + "[2024-06-04 09:14:42,374 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 16 bytes\n", + "[2024-06-04 09:14:42,374 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-04 09:14:42,375 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-04 09:14:42,376 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-04 09:14:42,376 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,376 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,376 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-04 09:14:42,376 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-04 09:14:42,376 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,377 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,377 - numba.cuda.cudadrv.driver] 
INFO: dealloc: cuMemFree_v2 16 bytes\n", + "[2024-06-04 09:14:42,377 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 16 bytes\n", + "[2024-06-04 09:14:42,377 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-04 09:14:42,377 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-04 09:14:42,377 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-04 09:14:42,378 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,378 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,378 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-04 09:14:42,379 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-04 09:14:42,379 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-04 09:14:42,380 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-04 09:14:42,381 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-04 09:14:42,381 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,381 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,381 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-04 09:14:42,382 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-04 09:14:42,382 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-04 09:14:42,383 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-04 09:14:42,383 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-04 09:14:42,383 - numba.cuda.cudadrv.driver] INFO: 
dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-04 09:14:42,384 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-04 09:14:42,384 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-04 09:14:42,384 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-04 09:14:42,384 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,384 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,384 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-04 09:14:42,385 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-04 09:14:42,385 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-04 09:14:42,385 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-04 09:14:42,385 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-04 09:14:42,385 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,385 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,386 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 36 bytes\n", + "[2024-06-04 09:14:42,387 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 36 bytes\n", + "[2024-06-04 09:14:42,387 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 8 bytes\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "from ccc.coef import ccc" + ] + }, + { + "cell_type": "markdown", + "id": "24399ccb-d33d-4bad-9baf-638c9c56feb2", + "metadata": { + "papermill": { + "duration": 0.095359, + "end_time": "2021-12-02T04:34:11.037941", + "exception": false, + "start_time": "2021-12-02T04:34:10.942582", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Settings" + ] + 
}, + { + "cell_type": "code", + "execution_count": 6, + "id": "c609cefa-f513-4cf8-9573-367744e31c5f", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.224902Z", + "iopub.status.busy": "2021-12-02T04:34:11.224429Z", + "iopub.status.idle": "2021-12-02T04:34:11.226352Z", + "shell.execute_reply": "2021-12-02T04:34:11.226694Z" + }, + "papermill": { + "duration": 0.097459, + "end_time": "2021-12-02T04:34:11.226809", + "exception": false, + "start_time": "2021-12-02T04:34:11.129350", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_REPS = 10" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c0341a42-b8de-419f-ab37-1e4fee9dde75", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.416790Z", + "iopub.status.busy": "2021-12-02T04:34:11.416353Z", + "iopub.status.idle": "2021-12-02T04:34:11.418488Z", + "shell.execute_reply": "2021-12-02T04:34:11.418030Z" + }, + "papermill": { + "duration": 0.098372, + "end_time": "2021-12-02T04:34:11.418578", + "exception": false, + "start_time": "2021-12-02T04:34:11.320206", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "np.random.seed(0)" + ] + }, + { + "cell_type": "markdown", + "id": "6fd3067b-a4f7-475e-9575-20246934537d", + "metadata": { + "papermill": { + "duration": 0.092004, + "end_time": "2021-12-02T04:34:11.602824", + "exception": false, + "start_time": "2021-12-02T04:34:11.510820", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "02e0507c-43ff-4693-8a3b-8ccd8f23168c", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.790398Z", + "iopub.status.busy": "2021-12-02T04:34:11.789933Z", + "iopub.status.idle": "2021-12-02T04:34:20.454299Z", + "shell.execute_reply": "2021-12-02T04:34:20.453925Z" + }, + "papermill": { + "duration": 8.760307, + "end_time": 
"2021-12-02T04:34:20.454396", + "exception": false, + "start_time": "2021-12-02T04:34:11.694089", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2024-06-04 09:14:42,636 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-04 09:14:42,637 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-04 09:14:42,637 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,637 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,638 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 16 bytes\n", + "[2024-06-04 09:14:42,638 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-04 09:14:42,638 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,638 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,639 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 36 bytes\n", + "[2024-06-04 09:14:42,639 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 36 bytes\n", + "[2024-06-04 09:14:42,639 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-04 09:14:42,639 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-04 09:14:42,639 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-04 09:14:42,639 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,640 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,640 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 16 bytes\n", + "[2024-06-04 09:14:42,641 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 16 bytes\n", + "[2024-06-04 09:14:42,641 - 
numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-04 09:14:42,643 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-04 09:14:42,643 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-04 09:14:42,643 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,643 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,643 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-04 09:14:42,644 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-04 09:14:42,644 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-04 09:14:42,645 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-04 09:14:42,646 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-04 09:14:42,646 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 16 bytes\n", + "[2024-06-04 09:14:42,646 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-04 09:14:42,646 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-04 09:14:42,647 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-04 09:14:42,647 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,647 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,647 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-04 09:14:42,647 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-04 09:14:42,648 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-04 09:14:42,648 - numba.cuda.cudadrv.driver] 
INFO: dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-04 09:14:42,648 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-04 09:14:42,648 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,648 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,649 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-04 09:14:42,649 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-04 09:14:42,650 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-04 09:14:42,651 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-04 09:14:42,651 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-04 09:14:42,651 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,651 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,651 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 36 bytes\n", + "[2024-06-04 09:14:42,652 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 36 bytes\n", + "[2024-06-04 09:14:42,652 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,652 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,653 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-04 09:14:42,653 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-04 09:14:42,653 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-04 09:14:42,653 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-04 09:14:42,653 - numba.cuda.cudadrv.driver] INFO: 
dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-04 09:14:42,654 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,654 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-04 09:14:42,654 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 36 bytes\n", + "[2024-06-04 09:14:42,654 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 36 bytes\n", + "[2024-06-04 09:14:42,654 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 8 bytes\n" + ] + }, + { + "data": { + "text/plain": [ + "0.15625" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let numba compile all the code before profiling\n", + "ccc(np.random.rand(10), np.random.rand(10))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "dfbd22fe", + "metadata": {}, + "outputs": [], + "source": [ + "# Disable numba logging\n", + "import logging\n", + "\n", + "numba_logger = logging.getLogger('numba')\n", + "numba_logger.setLevel(logging.WARNING)" + ] + }, + { + "cell_type": "markdown", + "id": "8549179d-1517-4a40-9a51-b95dc02d0fcc", + "metadata": { + "papermill": { + "duration": 0.092955, + "end_time": "2021-12-02T04:34:20.640996", + "exception": false, + "start_time": "2021-12-02T04:34:20.548041", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` small" + ] + }, + { + "cell_type": "markdown", + "id": "13ba811b", + "metadata": { + "papermill": { + "duration": 0.092926, + "end_time": "2021-12-02T04:34:20.826776", + "exception": false, + "start_time": "2021-12-02T04:34:20.733850", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50`" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "68064f0b", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.026855Z", + "iopub.status.busy": "2021-12-02T04:34:21.026243Z", + "iopub.status.idle": 
"2021-12-02T04:34:21.028562Z", + "shell.execute_reply": "2021-12-02T04:34:21.027971Z" + }, + "papermill": { + "duration": 0.107158, + "end_time": "2021-12-02T04:34:21.028689", + "exception": false, + "start_time": "2021-12-02T04:34:20.921531", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6f2ff213-d6b7-458f-acbf-8c73dd497a2a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.225799Z", + "iopub.status.busy": "2021-12-02T04:34:21.225350Z", + "iopub.status.idle": "2021-12-02T04:34:21.226800Z", + "shell.execute_reply": "2021-12-02T04:34:21.227141Z" + }, + "papermill": { + "duration": 0.09836, + "end_time": "2021-12-02T04:34:21.227254", + "exception": false, + "start_time": "2021-12-02T04:34:21.128894", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d63012be-4fc7-4fba-bccd-ad155905d1d6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.418916Z", + "iopub.status.busy": "2021-12-02T04:34:21.418477Z", + "iopub.status.idle": "2021-12-02T04:34:21.420269Z", + "shell.execute_reply": "2021-12-02T04:34:21.420631Z" + }, + "papermill": { + "duration": 0.098786, + "end_time": "2021-12-02T04:34:21.420745", + "exception": false, + "start_time": "2021-12-02T04:34:21.321959", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def func(n_reps=N_REPS):\n", + " for i in range(n_reps):\n", + " ccc(x, y, n_jobs=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c0abc65a-2f3c-4476-9c2a-f8d7753b75e6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.614745Z", + "iopub.status.busy": "2021-12-02T04:34:21.614223Z", + "iopub.status.idle": "2021-12-02T04:34:33.925655Z", 
+ "shell.execute_reply": "2021-12-02T04:34:33.925209Z" + }, + "papermill": { + "duration": 12.410211, + "end_time": "2021-12-02T04:34:33.925764", + "exception": false, + "start_time": "2021-12-02T04:34:21.515553", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "341 ms ± 1.26 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e80a7b02-310f-4bd9-90d5-2a41186db39e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.125850Z", + "iopub.status.busy": "2021-12-02T04:34:34.125263Z", + "iopub.status.idle": "2021-12-02T04:34:34.148529Z", + "shell.execute_reply": "2021-12-02T04:34:34.148886Z" + }, + "papermill": { + "duration": 0.124845, + "end_time": "2021-12-02T04:34:34.149006", + "exception": false, + "start_time": "2021-12-02T04:34:34.024161", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_small_50.txt'. 
\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 6144 function calls in 0.346 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 114 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.346 0.346 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.346 0.346 :1()\n", + " 1 0.000 0.000 0.346 0.346 2661685993.py:1(func)\n", + " 10 0.001 0.000 0.345 0.035 impl.py:307(ccc)\n", + " 10 0.000 0.000 0.340 0.034 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 0.339 0.034 impl.py:485(cdist_func)\n", + " 10 0.001 0.000 0.339 0.034 impl.py:192(cdist_parts_parallel)\n", + " 140 0.001 0.000 0.337 0.002 threading.py:280(wait)\n", + " 550 0.336 0.001 0.336 0.001 {method 'acquire' of '_thread.lock' objects}\n", + " 70 0.001 0.000 0.336 0.005 _base.py:201(as_completed)\n", + " 70 0.000 0.000 0.336 0.005 threading.py:563(wait)\n", + " 70 0.000 0.000 0.003 0.000 thread.py:161(submit)\n", + " 70 0.000 0.000 0.003 0.000 thread.py:180(_adjust_thread_count)\n", + " 10 0.000 0.000 0.003 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.003 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.002 0.000 threading.py:880(start)\n", + " 70 0.000 0.000 0.001 0.000 _base.py:418(result)\n", + " 20 0.000 0.000 0.001 0.000 _base.py:602(result_iterator)\n", + " 10 0.000 0.000 0.001 0.000 impl.py:210()\n", + " 10 0.000 0.000 0.001 0.000 _base.py:636(__exit__)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_small_50.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "2548440c", + "metadata": { + "papermill": { + "duration": 0.094866, + "end_time": "2021-12-02T04:34:34.338010", + "exception": false, + "start_time": "2021-12-02T04:34:34.243144", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100`" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": 
"ddb768d7-9b74-424c-bdb9-9e52b9e5b181", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.530550Z", + "iopub.status.busy": "2021-12-02T04:34:34.530085Z", + "iopub.status.idle": "2021-12-02T04:34:34.531559Z", + "shell.execute_reply": "2021-12-02T04:34:34.531910Z" + }, + "papermill": { + "duration": 0.099465, + "end_time": "2021-12-02T04:34:34.532025", + "exception": false, + "start_time": "2021-12-02T04:34:34.432560", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "161cf922-41f7-4ced-af1f-e3d25b36b200", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.723894Z", + "iopub.status.busy": "2021-12-02T04:34:34.723405Z", + "iopub.status.idle": "2021-12-02T04:34:34.725017Z", + "shell.execute_reply": "2021-12-02T04:34:34.725362Z" + }, + "papermill": { + "duration": 0.099541, + "end_time": "2021-12-02T04:34:34.725479", + "exception": false, + "start_time": "2021-12-02T04:34:34.625938", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "0a7f2b1f-4b87-4dde-a46b-2acec5ac93ba", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:35.120950Z", + "iopub.status.busy": "2021-12-02T04:34:35.120480Z", + "iopub.status.idle": "2021-12-02T04:34:37.984893Z", + "shell.execute_reply": "2021-12-02T04:34:37.985354Z" + }, + "papermill": { + "duration": 2.971389, + "end_time": "2021-12-02T04:34:37.985494", + "exception": false, + "start_time": "2021-12-02T04:34:35.014105", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "754 ms ± 4.44 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "74acbe27-9807-4f26-8b7e-b74d0f745b63", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.190471Z", + "iopub.status.busy": "2021-12-02T04:34:38.189361Z", + "iopub.status.idle": "2021-12-02T04:34:38.227298Z", + "shell.execute_reply": "2021-12-02T04:34:38.227692Z" + }, + "papermill": { + "duration": 0.13744, + "end_time": "2021-12-02T04:34:38.227812", + "exception": false, + "start_time": "2021-12-02T04:34:38.090372", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_small_100.txt'. \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 8334 function calls in 0.759 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 114 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.759 0.759 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.759 0.759 :1()\n", + " 1 0.000 0.000 0.759 0.759 2661685993.py:1(func)\n", + " 10 0.001 0.000 0.759 0.076 impl.py:307(ccc)\n", + " 10 0.000 0.000 0.751 0.075 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 0.750 0.075 impl.py:485(cdist_func)\n", + " 10 0.002 0.000 0.750 0.075 impl.py:192(cdist_parts_parallel)\n", + " 200 0.001 0.000 0.747 0.004 threading.py:280(wait)\n", + " 100 0.001 0.000 0.746 0.007 _base.py:201(as_completed)\n", + " 790 0.746 0.001 0.746 0.001 {method 'acquire' of '_thread.lock' objects}\n", + " 100 0.000 0.000 0.745 0.007 threading.py:563(wait)\n", + " 100 0.000 0.000 0.004 0.000 thread.py:161(submit)\n", + " 100 0.000 0.000 0.003 0.000 _base.py:418(result)\n", + " 100 0.000 0.000 0.003 0.000 thread.py:180(_adjust_thread_count)\n", + " 20 0.000 0.000 0.003 
0.000 _base.py:602(result_iterator)\n", + " 10 0.000 0.000 0.003 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.003 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.002 0.000 threading.py:880(start)\n", + " 10 0.001 0.000 0.002 0.000 impl.py:210()\n", + " 190 0.001 0.000 0.001 0.000 _base.py:179(_yield_finished_futures)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_small_100.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "611ff8e1", + "metadata": { + "papermill": { + "duration": 0.09562, + "end_time": "2021-12-02T04:34:38.420643", + "exception": false, + "start_time": "2021-12-02T04:34:38.325023", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=500`" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "4bcf4b42", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.617318Z", + "iopub.status.busy": "2021-12-02T04:34:38.616836Z", + "iopub.status.idle": "2021-12-02T04:34:38.618469Z", + "shell.execute_reply": "2021-12-02T04:34:38.618817Z" + }, + "papermill": { + "duration": 0.101998, + "end_time": "2021-12-02T04:34:38.618933", + "exception": false, + "start_time": "2021-12-02T04:34:38.516935", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 500" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "0bf2f21e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.813589Z", + "iopub.status.busy": "2021-12-02T04:34:38.813117Z", + "iopub.status.idle": "2021-12-02T04:34:38.815386Z", + "shell.execute_reply": "2021-12-02T04:34:38.815020Z" + }, + "papermill": { + "duration": 0.100616, + "end_time": "2021-12-02T04:34:38.815484", + "exception": false, + "start_time": "2021-12-02T04:34:38.714868", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 21, + "id": "cbde4ce6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:39.216775Z", + "iopub.status.busy": "2021-12-02T04:34:39.216014Z", + "iopub.status.idle": "2021-12-02T04:34:43.223179Z", + "shell.execute_reply": "2021-12-02T04:34:43.223640Z" + }, + "papermill": { + "duration": 4.108094, + "end_time": "2021-12-02T04:34:43.223780", + "exception": false, + "start_time": "2021-12-02T04:34:39.115686", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "770 ms ± 3.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "1250547e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.427169Z", + "iopub.status.busy": "2021-12-02T04:34:43.426487Z", + "iopub.status.idle": "2021-12-02T04:34:43.474288Z", + "shell.execute_reply": "2021-12-02T04:34:43.473832Z" + }, + "papermill": { + "duration": 0.148908, + "end_time": "2021-12-02T04:34:43.474385", + "exception": false, + "start_time": "2021-12-02T04:34:43.325477", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_small_500.txt'. 
\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 8334 function calls in 0.786 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 114 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.786 0.786 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.786 0.786 :1()\n", + " 1 0.000 0.000 0.786 0.786 2661685993.py:1(func)\n", + " 10 0.001 0.000 0.786 0.079 impl.py:307(ccc)\n", + " 200 0.001 0.000 0.775 0.004 threading.py:280(wait)\n", + " 790 0.774 0.001 0.774 0.001 {method 'acquire' of '_thread.lock' objects}\n", + " 10 0.000 0.000 0.774 0.077 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 0.773 0.077 impl.py:485(cdist_func)\n", + " 10 0.002 0.000 0.773 0.077 impl.py:192(cdist_parts_parallel)\n", + " 100 0.001 0.000 0.770 0.008 _base.py:201(as_completed)\n", + " 100 0.000 0.000 0.769 0.008 threading.py:563(wait)\n", + " 100 0.000 0.000 0.008 0.000 _base.py:418(result)\n", + " 20 0.000 0.000 0.007 0.000 _base.py:602(result_iterator)\n", + " 100 0.000 0.000 0.003 0.000 thread.py:161(submit)\n", + " 100 0.000 0.000 0.003 0.000 thread.py:180(_adjust_thread_count)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.002 0.000 threading.py:880(start)\n", + " 10 0.000 0.000 0.001 0.000 impl.py:210()\n", + " 190 0.001 0.000 0.001 0.000 _base.py:179(_yield_finished_futures)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_small_500.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "6853c300", + "metadata": { + "papermill": { + "duration": 0.09707, + "end_time": "2021-12-02T04:34:43.669776", + "exception": false, + "start_time": "2021-12-02T04:34:43.572706", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=1000`" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "f77e8490", 
+ "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.871037Z", + "iopub.status.busy": "2021-12-02T04:34:43.870573Z", + "iopub.status.idle": "2021-12-02T04:34:43.872099Z", + "shell.execute_reply": "2021-12-02T04:34:43.872442Z" + }, + "papermill": { + "duration": 0.103066, + "end_time": "2021-12-02T04:34:43.872558", + "exception": false, + "start_time": "2021-12-02T04:34:43.769492", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 1000" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "c99f544a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.102128Z", + "iopub.status.busy": "2021-12-02T04:34:44.101540Z", + "iopub.status.idle": "2021-12-02T04:34:44.103376Z", + "shell.execute_reply": "2021-12-02T04:34:44.103817Z" + }, + "papermill": { + "duration": 0.13265, + "end_time": "2021-12-02T04:34:44.103967", + "exception": false, + "start_time": "2021-12-02T04:34:43.971317", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "9721b048", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.515965Z", + "iopub.status.busy": "2021-12-02T04:34:44.515462Z", + "iopub.status.idle": "2021-12-02T04:34:50.907897Z", + "shell.execute_reply": "2021-12-02T04:34:50.908362Z" + }, + "papermill": { + "duration": 6.496377, + "end_time": "2021-12-02T04:34:50.908503", + "exception": false, + "start_time": "2021-12-02T04:34:44.412126", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "802 ms ± 2.67 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "fd0f4dd6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.122723Z", + "iopub.status.busy": "2021-12-02T04:34:51.122244Z", + "iopub.status.idle": "2021-12-02T04:34:51.194219Z", + "shell.execute_reply": "2021-12-02T04:34:51.193813Z" + }, + "papermill": { + "duration": 0.179898, + "end_time": "2021-12-02T04:34:51.194319", + "exception": false, + "start_time": "2021-12-02T04:34:51.014421", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_small_1000.txt'. \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 8334 function calls in 0.812 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 114 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.812 0.812 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.812 0.812 :1()\n", + " 1 0.000 0.000 0.812 0.812 2661685993.py:1(func)\n", + " 10 0.001 0.000 0.812 0.081 impl.py:307(ccc)\n", + " 200 0.001 0.000 0.801 0.004 threading.py:280(wait)\n", + " 790 0.801 0.001 0.801 0.001 {method 'acquire' of '_thread.lock' objects}\n", + " 10 0.000 0.000 0.796 0.080 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 0.796 0.080 impl.py:485(cdist_func)\n", + " 10 0.002 0.000 0.795 0.080 impl.py:192(cdist_parts_parallel)\n", + " 100 0.001 0.000 0.792 0.008 _base.py:201(as_completed)\n", + " 100 0.000 0.000 0.791 0.008 threading.py:563(wait)\n", + " 100 0.000 0.000 0.011 0.000 _base.py:418(result)\n", + " 20 0.000 0.000 0.011 0.001 _base.py:602(result_iterator)\n", + " 100 0.000 0.000 0.003 0.000 thread.py:161(submit)\n", + " 100 0.000 0.000 0.003 0.000 
thread.py:180(_adjust_thread_count)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.002 0.000 threading.py:880(start)\n", + " 10 0.000 0.000 0.001 0.000 impl.py:210()\n", + " 190 0.001 0.000 0.001 0.000 _base.py:179(_yield_finished_futures)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_small_1000.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "f6b9adcf-1da4-4496-b05f-47952fd80d7f", + "metadata": { + "papermill": { + "duration": 0.099861, + "end_time": "2021-12-02T04:34:51.394504", + "exception": false, + "start_time": "2021-12-02T04:34:51.294643", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` large" + ] + }, + { + "cell_type": "markdown", + "id": "8f2e407c", + "metadata": { + "papermill": { + "duration": 0.098767, + "end_time": "2021-12-02T04:34:51.593072", + "exception": false, + "start_time": "2021-12-02T04:34:51.494305", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50000`" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "c522396e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.793258Z", + "iopub.status.busy": "2021-12-02T04:34:51.792385Z", + "iopub.status.idle": "2021-12-02T04:34:51.794710Z", + "shell.execute_reply": "2021-12-02T04:34:51.795054Z" + }, + "papermill": { + "duration": 0.103749, + "end_time": "2021-12-02T04:34:51.795172", + "exception": false, + "start_time": "2021-12-02T04:34:51.691423", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50000" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "a5e536cc", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.996666Z", + "iopub.status.busy": "2021-12-02T04:34:51.996120Z", + "iopub.status.idle": "2021-12-02T04:34:51.999329Z", + "shell.execute_reply": 
"2021-12-02T04:34:51.998896Z" + }, + "papermill": { + "duration": 0.104554, + "end_time": "2021-12-02T04:34:51.999423", + "exception": false, + "start_time": "2021-12-02T04:34:51.894869", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "91470f64", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:52.408864Z", + "iopub.status.busy": "2021-12-02T04:34:52.408310Z", + "iopub.status.idle": "2021-12-02T04:35:28.383597Z", + "shell.execute_reply": "2021-12-02T04:35:28.384010Z" + }, + "papermill": { + "duration": 36.078113, + "end_time": "2021-12-02T04:35:28.384125", + "exception": false, + "start_time": "2021-12-02T04:34:52.306012", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4.13 s ± 13.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "4de4e0b0", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:28.594907Z", + "iopub.status.busy": "2021-12-02T04:35:28.594395Z", + "iopub.status.idle": "2021-12-02T04:35:30.850079Z", + "shell.execute_reply": "2021-12-02T04:35:30.850466Z" + }, + "papermill": { + "duration": 2.36055, + "end_time": "2021-12-02T04:35:30.850581", + "exception": false, + "start_time": "2021-12-02T04:35:28.490031", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_large_50000.txt'. 
\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 8334 function calls in 4.119 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 114 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 4.119 4.119 {built-in method builtins.exec}\n", + " 1 0.000 0.000 4.119 4.119 :1()\n", + " 1 0.012 0.012 4.119 4.119 2661685993.py:1(func)\n", + " 10 0.005 0.000 4.107 0.411 impl.py:307(ccc)\n", + " 200 0.001 0.000 4.088 0.020 threading.py:280(wait)\n", + " 790 4.087 0.005 4.087 0.005 {method 'acquire' of '_thread.lock' objects}\n", + " 10 0.001 0.000 3.239 0.324 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 3.238 0.324 impl.py:485(cdist_func)\n", + " 10 0.002 0.000 3.237 0.324 impl.py:192(cdist_parts_parallel)\n", + " 100 0.001 0.000 3.232 0.032 _base.py:201(as_completed)\n", + " 100 0.000 0.000 3.231 0.032 threading.py:563(wait)\n", + " 100 0.000 0.000 0.857 0.009 _base.py:418(result)\n", + " 20 0.000 0.000 0.857 0.043 _base.py:602(result_iterator)\n", + " 100 0.000 0.000 0.004 0.000 thread.py:161(submit)\n", + " 10 0.002 0.000 0.003 0.000 impl.py:210()\n", + " 100 0.000 0.000 0.003 0.000 thread.py:180(_adjust_thread_count)\n", + " 50 0.003 0.000 0.003 0.000 {built-in method numpy.zeros}\n", + " 10 0.000 0.000 0.002 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.002 0.000 threading.py:880(start)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_large_50000.txt\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "b0c07894", + "metadata": { + "papermill": { + "duration": 0.100749, + "end_time": "2021-12-02T04:35:31.053465", + "exception": false, + "start_time": "2021-12-02T04:35:30.952716", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100000`" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": 
"7fb2ab2e-a6de-412b-9540-d00f8641290e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.262806Z", + "iopub.status.busy": "2021-12-02T04:35:31.262280Z", + "iopub.status.idle": "2021-12-02T04:35:31.264634Z", + "shell.execute_reply": "2021-12-02T04:35:31.264124Z" + }, + "papermill": { + "duration": 0.110468, + "end_time": "2021-12-02T04:35:31.264738", + "exception": false, + "start_time": "2021-12-02T04:35:31.154270", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100000" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "81765e91", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.472530Z", + "iopub.status.busy": "2021-12-02T04:35:31.472083Z", + "iopub.status.idle": "2021-12-02T04:35:31.475862Z", + "shell.execute_reply": "2021-12-02T04:35:31.475424Z" + }, + "papermill": { + "duration": 0.107444, + "end_time": "2021-12-02T04:35:31.476016", + "exception": false, + "start_time": "2021-12-02T04:35:31.368572", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "aca57100", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.892045Z", + "iopub.status.busy": "2021-12-02T04:35:31.891417Z", + "iopub.status.idle": "2021-12-02T04:36:46.681345Z", + "shell.execute_reply": "2021-12-02T04:36:46.681695Z" + }, + "papermill": { + "duration": 74.896315, + "end_time": "2021-12-02T04:36:46.681806", + "exception": false, + "start_time": "2021-12-02T04:35:31.785491", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7.51 s ± 11.4 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "b9c25f30", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:36:46.889124Z", + "iopub.status.busy": "2021-12-02T04:36:46.888441Z", + "iopub.status.idle": "2021-12-02T04:36:51.544747Z", + "shell.execute_reply": "2021-12-02T04:36:51.544315Z" + }, + "papermill": { + "duration": 4.761869, + "end_time": "2021-12-02T04:36:51.544839", + "exception": false, + "start_time": "2021-12-02T04:36:46.782970", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '09-n_samples_large_100000.txt'. \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 8334 function calls in 7.515 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 114 to 20 due to restriction <20>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 7.515 7.515 {built-in method builtins.exec}\n", + " 1 0.000 0.000 7.515 7.515 :1()\n", + " 1 0.000 0.000 7.515 7.515 2661685993.py:1(func)\n", + " 10 0.008 0.001 7.515 0.751 impl.py:307(ccc)\n", + " 200 0.001 0.000 7.490 0.037 threading.py:280(wait)\n", + " 790 7.489 0.009 7.489 0.009 {method 'acquire' of '_thread.lock' objects}\n", + " 10 0.001 0.000 5.731 0.573 impl.py:492(compute_coef)\n", + " 10 0.000 0.000 5.730 0.573 impl.py:485(cdist_func)\n", + " 10 0.001 0.000 5.729 0.573 impl.py:192(cdist_parts_parallel)\n", + " 100 0.001 0.000 5.724 0.057 _base.py:201(as_completed)\n", + " 100 0.000 0.000 5.723 0.057 threading.py:563(wait)\n", + " 100 0.000 0.000 1.767 0.018 _base.py:418(result)\n", + " 20 0.000 0.000 1.767 0.088 _base.py:602(result_iterator)\n", + " 50 0.005 0.000 0.005 0.000 {built-in method numpy.zeros}\n", + " 100 0.000 0.000 0.004 0.000 
thread.py:161(submit)\n", + " 10 0.002 0.000 0.003 0.000 impl.py:210()\n", + " 100 0.000 0.000 0.003 0.000 thread.py:180(_adjust_thread_count)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:573(map)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:598()\n", + " 10 0.000 0.000 0.002 0.000 threading.py:880(start)" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20 -T 09-n_samples_large_100000.txt\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aa76596-f126-4ba8-8bba-4d31225e0e5d", + "metadata": { + "papermill": { + "duration": 0.102208, + "end_time": "2021-12-02T04:36:51.764154", + "exception": false, + "start_time": "2021-12-02T04:36:51.661946", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all,-execution,-papermill,-trusted" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + }, + "papermill": { + "default_parameters": {}, + "duration": 167.355469, + "end_time": "2021-12-02T04:36:52.275357", + "environment_variables": {}, + "exception": null, + "input_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.ipynb", + "output_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.run.ipynb", + "parameters": {}, + "start_time": "2021-12-02T04:34:04.919888", + "version": "2.3.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/others/10_gpu_ari_profiling/02_cpu_version_more_profilers/00_12_core_pyinstrument.ipynb b/nbs/others/10_gpu_ari_profiling/02_cpu_version_more_profilers/00_12_core_pyinstrument.ipynb new file mode 100644 index 00000000..105762a9 --- 
/dev/null +++ b/nbs/others/10_gpu_ari_profiling/02_cpu_version_more_profilers/00_12_core_pyinstrument.ipynb @@ -0,0 +1,1472 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "36c65ddb-8246-4b0c-a231-68a373acc2cf", + "metadata": { + "papermill": { + "duration": 0.095646, + "end_time": "2021-12-02T04:34:06.384775", + "exception": false, + "start_time": "2021-12-02T04:34:06.289129", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Description" + ] + }, + { + "cell_type": "markdown", + "id": "337633a8-d03e-4509-b89d-f8daee598958", + "metadata": { + "papermill": { + "duration": 0.091407, + "end_time": "2021-12-02T04:34:06.566258", + "exception": false, + "start_time": "2021-12-02T04:34:06.474851", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Multi-threaded version (`n_jobs=12`, so `cdist_parts_parallel` is used instead of `cdist_parts_basic`), profiled with pyinstrument to see the overhead of `ari` " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5c010e62", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: CM_N_JOBS=1\n" + ] + } + ], + "source": [ + "# Set core count to 1\n", + "# %env CM_N_JOBS=1" + ] + }, + { + "cell_type": "markdown", + "id": "63c2b099-cc44-4fe2-93d1-40336e0a8466", + "metadata": { + "papermill": { + "duration": 0.09056, + "end_time": "2021-12-02T04:34:06.747753", + "exception": false, + "start_time": "2021-12-02T04:34:06.657193", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Remove pycache dir" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "960c4ff0-2a73-4eaa-97d6-3269102233eb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:06.942330Z", + "iopub.status.busy": "2021-12-02T04:34:06.941822Z", + "iopub.status.idle": "2021-12-02T04:34:07.534005Z", + "shell.execute_reply": "2021-12-02T04:34:07.531916Z" + }, + "papermill": { + "duration": 0.696141, + "end_time": "2021-12-02T04:34:07.534420", + "exception": false, + "start_time": 
"2021-12-02T04:34:06.838279", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/haoyu/_database/projs/ccc-gpu\n" + ] + } + ], + "source": [ + "!echo ${CODE_DIR}" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e18db58a-316a-445b-a376-8b2ec18e08d8", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:07.733067Z", + "iopub.status.busy": "2021-12-02T04:34:07.732593Z", + "iopub.status.idle": "2021-12-02T04:34:08.445221Z", + "shell.execute_reply": "2021-12-02T04:34:08.443302Z" + }, + "papermill": { + "duration": 0.817887, + "end_time": "2021-12-02T04:34:08.445598", + "exception": false, + "start_time": "2021-12-02T04:34:07.627711", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/sklearn/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/scipy/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/coef/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/utils/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/pytorch/__pycache__\n" + ] + } + ], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b54099b0-a990-4bbd-bcbd-e206eb0f0f0e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:08.668700Z", + "iopub.status.busy": "2021-12-02T04:34:08.668226Z", + "iopub.status.idle": "2021-12-02T04:34:09.290588Z", + "shell.execute_reply": "2021-12-02T04:34:09.288752Z" + }, + "papermill": { + "duration": 0.719545, + "end_time": "2021-12-02T04:34:09.290958", + "exception": false, + "start_time": "2021-12-02T04:34:08.571413", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find 
${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -prune -exec rm -rf {} \\;" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7a9a8098-8160-46bf-8d83-bc398cbe2382", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:09.513928Z", + "iopub.status.busy": "2021-12-02T04:34:09.513465Z", + "iopub.status.idle": "2021-12-02T04:34:10.120602Z", + "shell.execute_reply": "2021-12-02T04:34:10.118684Z" + }, + "papermill": { + "duration": 0.704384, + "end_time": "2021-12-02T04:34:10.120972", + "exception": false, + "start_time": "2021-12-02T04:34:09.416588", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "markdown", + "id": "c2251313-41ac-46fd-a845-0f209689ecf6", + "metadata": { + "papermill": { + "duration": 0.093051, + "end_time": "2021-12-02T04:34:10.338738", + "exception": false, + "start_time": "2021-12-02T04:34:10.245687", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Modules" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "987ef5f1-be49-4a6c-a4f4-b24a0a2094cb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:10.528319Z", + "iopub.status.busy": "2021-12-02T04:34:10.527833Z", + "iopub.status.idle": "2021-12-02T04:34:10.845421Z", + "shell.execute_reply": "2021-12-02T04:34:10.845020Z" + }, + "papermill": { + "duration": 0.414249, + "end_time": "2021-12-02T04:34:10.845518", + "exception": false, + "start_time": "2021-12-02T04:34:10.431269", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "from ccc.coef import ccc\n", + "%load_ext pyinstrument" + ] + }, + { + "cell_type": "markdown", + "id": "24399ccb-d33d-4bad-9baf-638c9c56feb2", + "metadata": { + "papermill": { + "duration": 0.095359, + "end_time": "2021-12-02T04:34:11.037941", + "exception": false, + "start_time": "2021-12-02T04:34:10.942582", + 
"status": "completed" + }, + "tags": [] + }, + "source": [ + "# Settings" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c609cefa-f513-4cf8-9573-367744e31c5f", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.224902Z", + "iopub.status.busy": "2021-12-02T04:34:11.224429Z", + "iopub.status.idle": "2021-12-02T04:34:11.226352Z", + "shell.execute_reply": "2021-12-02T04:34:11.226694Z" + }, + "papermill": { + "duration": 0.097459, + "end_time": "2021-12-02T04:34:11.226809", + "exception": false, + "start_time": "2021-12-02T04:34:11.129350", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_REPS = 10\n", + "np.random.seed(0)" + ] + }, + { + "cell_type": "markdown", + "id": "6fd3067b-a4f7-475e-9575-20246934537d", + "metadata": { + "papermill": { + "duration": 0.092004, + "end_time": "2021-12-02T04:34:11.602824", + "exception": false, + "start_time": "2021-12-02T04:34:11.510820", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "02e0507c-43ff-4693-8a3b-8ccd8f23168c", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.790398Z", + "iopub.status.busy": "2021-12-02T04:34:11.789933Z", + "iopub.status.idle": "2021-12-02T04:34:20.454299Z", + "shell.execute_reply": "2021-12-02T04:34:20.453925Z" + }, + "papermill": { + "duration": 8.760307, + "end_time": "2021-12-02T04:34:20.454396", + "exception": false, + "start_time": "2021-12-02T04:34:11.694089", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.15625" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let numba compile all the code before profiling\n", + "ccc(np.random.rand(10), np.random.rand(10))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "bc5236a7", + "metadata": {}, + "outputs": [], + 
"source": [ + "def func(n_reps=N_REPS):\n", + " for i in range(n_reps):\n", + " ccc(x, y, n_jobs=12)" + ] + }, + { + "cell_type": "markdown", + "id": "8549179d-1517-4a40-9a51-b95dc02d0fcc", + "metadata": { + "papermill": { + "duration": 0.092955, + "end_time": "2021-12-02T04:34:20.640996", + "exception": false, + "start_time": "2021-12-02T04:34:20.548041", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` small" + ] + }, + { + "cell_type": "markdown", + "id": "13ba811b", + "metadata": { + "papermill": { + "duration": 0.092926, + "end_time": "2021-12-02T04:34:20.826776", + "exception": false, + "start_time": "2021-12-02T04:34:20.733850", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50`" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "68064f0b", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.026855Z", + "iopub.status.busy": "2021-12-02T04:34:21.026243Z", + "iopub.status.idle": "2021-12-02T04:34:21.028562Z", + "shell.execute_reply": "2021-12-02T04:34:21.027971Z" + }, + "papermill": { + "duration": 0.107158, + "end_time": "2021-12-02T04:34:21.028689", + "exception": false, + "start_time": "2021-12-02T04:34:20.921531", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6f2ff213-d6b7-458f-acbf-8c73dd497a2a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.225799Z", + "iopub.status.busy": "2021-12-02T04:34:21.225350Z", + "iopub.status.idle": "2021-12-02T04:34:21.226800Z", + "shell.execute_reply": "2021-12-02T04:34:21.227141Z" + }, + "papermill": { + "duration": 0.09836, + "end_time": "2021-12-02T04:34:21.227254", + "exception": false, + "start_time": "2021-12-02T04:34:21.128894", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = 
np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c0abc65a-2f3c-4476-9c2a-f8d7753b75e6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.614745Z", + "iopub.status.busy": "2021-12-02T04:34:21.614223Z", + "iopub.status.idle": "2021-12-02T04:34:33.925655Z", + "shell.execute_reply": "2021-12-02T04:34:33.925209Z" + }, + "papermill": { + "duration": 12.410211, + "end_time": "2021-12-02T04:34:33.925764", + "exception": false, + "start_time": "2021-12-02T04:34:21.515553", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "14.7 ms ± 35.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e80a7b02-310f-4bd9-90d5-2a41186db39e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.125850Z", + "iopub.status.busy": "2021-12-02T04:34:34.125263Z", + "iopub.status.idle": "2021-12-02T04:34:34.148529Z", + "shell.execute_reply": "2021-12-02T04:34:34.148886Z" + }, + "papermill": { + "duration": 0.124845, + "end_time": "2021-12-02T04:34:34.149006", + "exception": false, + "start_time": "2021-12-02T04:34:34.024161", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 20:38:47 Samples: 16\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.017 CPU time: 0.023\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-a72911ea-456f-4e5b-8129-d7dd4e04f39f.json\n", + "\n", + "0.017 ../../../tmp/ipykernel_1959742/2991494264.py:2\n", + "`- 0.017 func ../../../tmp/ipykernel_1959742/2380024278.py:1\n", + " `- 0.017 ccc 
ccc/coef/impl.py:308\n", + " |- 0.013 compute_coef ccc/coef/impl.py:494\n", + " | `- 0.013 cdist_func ccc/coef/impl.py:487\n", + " | `- 0.013 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | |- 0.009 ccc/coef/impl.py:211\n", + " | | `- 0.009 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " | | [10 frames hidden] concurrent, threading, , ip...\n", + " | | 0.008 lock.acquire \n", + " | |- 0.003 as_completed concurrent/futures/_base.py:201\n", + " | | [4 frames hidden] concurrent, threading, \n", + " | `- 0.001 [self] ccc/coef/impl.py\n", + " |- 0.002 result_iterator concurrent/futures/_base.py:602\n", + " | [5 frames hidden] concurrent, threading, \n", + " |- 0.001 [self] ccc/coef/impl.py\n", + " `- 0.001 ThreadPoolExecutor.__exit__ concurrent/futures/_base.py:636\n", + " [5 frames hidden] concurrent, threading\n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "2548440c", + "metadata": { + "papermill": { + "duration": 0.094866, + "end_time": "2021-12-02T04:34:34.338010", + "exception": false, + "start_time": "2021-12-02T04:34:34.243144", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100`" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "ddb768d7-9b74-424c-bdb9-9e52b9e5b181", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.530550Z", + "iopub.status.busy": "2021-12-02T04:34:34.530085Z", + "iopub.status.idle": "2021-12-02T04:34:34.531559Z", + "shell.execute_reply": "2021-12-02T04:34:34.531910Z" + }, + "papermill": { + "duration": 0.099465, + "end_time": "2021-12-02T04:34:34.532025", + "exception": false, + "start_time": "2021-12-02T04:34:34.432560", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": 
"161cf922-41f7-4ced-af1f-e3d25b36b200", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.723894Z", + "iopub.status.busy": "2021-12-02T04:34:34.723405Z", + "iopub.status.idle": "2021-12-02T04:34:34.725017Z", + "shell.execute_reply": "2021-12-02T04:34:34.725362Z" + }, + "papermill": { + "duration": 0.099541, + "end_time": "2021-12-02T04:34:34.725479", + "exception": false, + "start_time": "2021-12-02T04:34:34.625938", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0a7f2b1f-4b87-4dde-a46b-2acec5ac93ba", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:35.120950Z", + "iopub.status.busy": "2021-12-02T04:34:35.120480Z", + "iopub.status.idle": "2021-12-02T04:34:37.984893Z", + "shell.execute_reply": "2021-12-02T04:34:37.985354Z" + }, + "papermill": { + "duration": 2.971389, + "end_time": "2021-12-02T04:34:37.985494", + "exception": false, + "start_time": "2021-12-02T04:34:35.014105", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "25 ms ± 193 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "74acbe27-9807-4f26-8b7e-b74d0f745b63", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.190471Z", + "iopub.status.busy": "2021-12-02T04:34:38.189361Z", + "iopub.status.idle": "2021-12-02T04:34:38.227298Z", + "shell.execute_reply": "2021-12-02T04:34:38.227692Z" + }, + "papermill": { + "duration": 0.13744, + "end_time": "2021-12-02T04:34:38.227812", + "exception": false, + "start_time": "2021-12-02T04:34:38.090372", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 20:38:49 Samples: 26\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.029 CPU time: 0.051\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-a72911ea-456f-4e5b-8129-d7dd4e04f39f.json\n", + "\n", + "0.028 ../../../tmp/ipykernel_1959742/2991494264.py:2\n", + "`- 0.028 func ../../../tmp/ipykernel_1959742/2380024278.py:1\n", + " `- 0.028 ccc ccc/coef/impl.py:308\n", + " |- 0.024 compute_coef ccc/coef/impl.py:494\n", + " | `- 0.024 cdist_func ccc/coef/impl.py:487\n", + " | `- 0.024 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | |- 0.013 ccc/coef/impl.py:211\n", + " | | |- 0.012 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " | | | [9 frames hidden] concurrent, threading, , ip...\n", + " | | | 0.010 lock.acquire \n", + " | | `- 0.001 [self] ccc/coef/impl.py\n", + " | `- 0.011 as_completed concurrent/futures/_base.py:201\n", + " | [5 frames hidden] concurrent, threading, \n", + " | 0.010 lock.acquire \n", + " |- 0.003 function.map concurrent/futures/_base.py:573\n", + " | [9 frames hidden] concurrent, threading, \n", + " `- 0.001 [self] 
ccc/coef/impl.py\n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "611ff8e1", + "metadata": { + "papermill": { + "duration": 0.09562, + "end_time": "2021-12-02T04:34:38.420643", + "exception": false, + "start_time": "2021-12-02T04:34:38.325023", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=500`" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "4bcf4b42", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.617318Z", + "iopub.status.busy": "2021-12-02T04:34:38.616836Z", + "iopub.status.idle": "2021-12-02T04:34:38.618469Z", + "shell.execute_reply": "2021-12-02T04:34:38.618817Z" + }, + "papermill": { + "duration": 0.101998, + "end_time": "2021-12-02T04:34:38.618933", + "exception": false, + "start_time": "2021-12-02T04:34:38.516935", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 500" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "0bf2f21e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.813589Z", + "iopub.status.busy": "2021-12-02T04:34:38.813117Z", + "iopub.status.idle": "2021-12-02T04:34:38.815386Z", + "shell.execute_reply": "2021-12-02T04:34:38.815020Z" + }, + "papermill": { + "duration": 0.100616, + "end_time": "2021-12-02T04:34:38.815484", + "exception": false, + "start_time": "2021-12-02T04:34:38.714868", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "cbde4ce6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:39.216775Z", + "iopub.status.busy": "2021-12-02T04:34:39.216014Z", + "iopub.status.idle": "2021-12-02T04:34:43.223179Z", + "shell.execute_reply": 
"2021-12-02T04:34:43.223640Z" + }, + "papermill": { + "duration": 4.108094, + "end_time": "2021-12-02T04:34:43.223780", + "exception": false, + "start_time": "2021-12-02T04:34:39.115686", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "29.4 ms ± 123 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "1250547e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.427169Z", + "iopub.status.busy": "2021-12-02T04:34:43.426487Z", + "iopub.status.idle": "2021-12-02T04:34:43.474288Z", + "shell.execute_reply": "2021-12-02T04:34:43.473832Z" + }, + "papermill": { + "duration": 0.148908, + "end_time": "2021-12-02T04:34:43.474385", + "exception": false, + "start_time": "2021-12-02T04:34:43.325477", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 20:38:52 Samples: 30\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.032 CPU time: 0.075\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-a72911ea-456f-4e5b-8129-d7dd4e04f39f.json\n", + "\n", + "0.032 ../../../tmp/ipykernel_1959742/2991494264.py:2\n", + "|- 0.031 func ../../../tmp/ipykernel_1959742/2380024278.py:1\n", + "| `- 0.031 ccc ccc/coef/impl.py:308\n", + "| |- 0.023 compute_coef ccc/coef/impl.py:494\n", + "| | `- 0.023 cdist_func ccc/coef/impl.py:487\n", + "| | `- 0.023 cdist_parts_parallel ccc/coef/impl.py:193\n", + "| | |- 0.013 ccc/coef/impl.py:211\n", + "| | | `- 0.013 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + "| | | [8 frames hidden] concurrent, threading, \n", + "| | | 0.010 lock.acquire \n", + "| 
| |- 0.010 as_completed concurrent/futures/_base.py:201\n", + "| | | [4 frames hidden] concurrent, threading, \n", + "| | | 0.010 lock.acquire \n", + "| | `- 0.001 [self] ccc/coef/impl.py\n", + "| |- 0.003 result_iterator concurrent/futures/_base.py:602\n", + "| | [4 frames hidden] concurrent, threading, \n", + "| |- 0.003 function.map concurrent/futures/_base.py:573\n", + "| | [9 frames hidden] concurrent, threading, , ip...\n", + "| `- 0.001 ThreadPoolExecutor.__exit__ concurrent/futures/_base.py:636\n", + "| [5 frames hidden] concurrent, threading, \n", + "`- 0.001 Profiler.stop pyinstrument/profiler.py:138\n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "6853c300", + "metadata": { + "papermill": { + "duration": 0.09707, + "end_time": "2021-12-02T04:34:43.669776", + "exception": false, + "start_time": "2021-12-02T04:34:43.572706", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=1000`" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "f77e8490", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.871037Z", + "iopub.status.busy": "2021-12-02T04:34:43.870573Z", + "iopub.status.idle": "2021-12-02T04:34:43.872099Z", + "shell.execute_reply": "2021-12-02T04:34:43.872442Z" + }, + "papermill": { + "duration": 0.103066, + "end_time": "2021-12-02T04:34:43.872558", + "exception": false, + "start_time": "2021-12-02T04:34:43.769492", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 1000" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "c99f544a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.102128Z", + "iopub.status.busy": "2021-12-02T04:34:44.101540Z", + "iopub.status.idle": "2021-12-02T04:34:44.103376Z", + "shell.execute_reply": "2021-12-02T04:34:44.103817Z" + }, + "papermill": 
{ + "duration": 0.13265, + "end_time": "2021-12-02T04:34:44.103967", + "exception": false, + "start_time": "2021-12-02T04:34:43.971317", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "9721b048", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.515965Z", + "iopub.status.busy": "2021-12-02T04:34:44.515462Z", + "iopub.status.idle": "2021-12-02T04:34:50.907897Z", + "shell.execute_reply": "2021-12-02T04:34:50.908362Z" + }, + "papermill": { + "duration": 6.496377, + "end_time": "2021-12-02T04:34:50.908503", + "exception": false, + "start_time": "2021-12-02T04:34:44.412126", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "34.9 ms ± 197 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "fd0f4dd6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.122723Z", + "iopub.status.busy": "2021-12-02T04:34:51.122244Z", + "iopub.status.idle": "2021-12-02T04:34:51.194219Z", + "shell.execute_reply": "2021-12-02T04:34:51.193813Z" + }, + "papermill": { + "duration": 0.179898, + "end_time": "2021-12-02T04:34:51.194319", + "exception": false, + "start_time": "2021-12-02T04:34:51.014421", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 20:38:55 Samples: 34\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.038 CPU time: 0.101\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f 
/home/haoyu/.local/share/jupyter/runtime/kernel-a72911ea-456f-4e5b-8129-d7dd4e04f39f.json\n", + "\n", + "0.037 ../../../tmp/ipykernel_1959742/2991494264.py:2\n", + "`- 0.037 func ../../../tmp/ipykernel_1959742/2380024278.py:1\n", + " |- 0.035 ccc ccc/coef/impl.py:308\n", + " | |- 0.029 compute_coef ccc/coef/impl.py:494\n", + " | | `- 0.029 cdist_func ccc/coef/impl.py:487\n", + " | | `- 0.029 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | | |- 0.016 ccc/coef/impl.py:211\n", + " | | | |- 0.014 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " | | | | [10 frames hidden] concurrent, threading, , ip...\n", + " | | | | 0.011 lock.acquire \n", + " | | | `- 0.002 [self] ccc/coef/impl.py\n", + " | | `- 0.012 as_completed concurrent/futures/_base.py:201\n", + " | | [6 frames hidden] concurrent, threading, \n", + " | | 0.010 lock.acquire \n", + " | |- 0.005 result_iterator concurrent/futures/_base.py:602\n", + " | | [4 frames hidden] concurrent, threading, \n", + " | |- 0.001 full numpy/core/numeric.py:289\n", + " | | [3 frames hidden] numpy, <__array_function__ internals>...\n", + " | `- 0.001 ThreadPoolExecutor.__exit__ concurrent/futures/_base.py:636\n", + " | [4 frames hidden] concurrent, threading\n", + " `- 0.002 [self] ../../../tmp/ipykernel_1959742/2380024278.py\n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "f6b9adcf-1da4-4496-b05f-47952fd80d7f", + "metadata": { + "papermill": { + "duration": 0.099861, + "end_time": "2021-12-02T04:34:51.394504", + "exception": false, + "start_time": "2021-12-02T04:34:51.294643", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` large" + ] + }, + { + "cell_type": "markdown", + "id": "8f2e407c", + "metadata": { + "papermill": { + "duration": 0.098767, + "end_time": "2021-12-02T04:34:51.593072", + "exception": false, + "start_time": 
"2021-12-02T04:34:51.494305", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50000`" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "c522396e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.793258Z", + "iopub.status.busy": "2021-12-02T04:34:51.792385Z", + "iopub.status.idle": "2021-12-02T04:34:51.794710Z", + "shell.execute_reply": "2021-12-02T04:34:51.795054Z" + }, + "papermill": { + "duration": 0.103749, + "end_time": "2021-12-02T04:34:51.795172", + "exception": false, + "start_time": "2021-12-02T04:34:51.691423", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50000" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "a5e536cc", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.996666Z", + "iopub.status.busy": "2021-12-02T04:34:51.996120Z", + "iopub.status.idle": "2021-12-02T04:34:51.999329Z", + "shell.execute_reply": "2021-12-02T04:34:51.998896Z" + }, + "papermill": { + "duration": 0.104554, + "end_time": "2021-12-02T04:34:51.999423", + "exception": false, + "start_time": "2021-12-02T04:34:51.894869", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "91470f64", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:52.408864Z", + "iopub.status.busy": "2021-12-02T04:34:52.408310Z", + "iopub.status.idle": "2021-12-02T04:35:28.383597Z", + "shell.execute_reply": "2021-12-02T04:35:28.384010Z" + }, + "papermill": { + "duration": 36.078113, + "end_time": "2021-12-02T04:35:28.384125", + "exception": false, + "start_time": "2021-12-02T04:34:52.306012", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "827 ms ± 1.54 ms per loop (mean 
± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "4de4e0b0", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:28.594907Z", + "iopub.status.busy": "2021-12-02T04:35:28.594395Z", + "iopub.status.idle": "2021-12-02T04:35:30.850079Z", + "shell.execute_reply": "2021-12-02T04:35:30.850466Z" + }, + "papermill": { + "duration": 2.36055, + "end_time": "2021-12-02T04:35:30.850581", + "exception": false, + "start_time": "2021-12-02T04:35:28.490031", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 20:39:09 Samples: 130\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.825 CPU time: 3.196\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-a72911ea-456f-4e5b-8129-d7dd4e04f39f.json\n", + "\n", + "0.824 ../../../tmp/ipykernel_1959742/2991494264.py:2\n", + "`- 0.824 func ../../../tmp/ipykernel_1959742/2380024278.py:1\n", + " `- 0.817 ccc ccc/coef/impl.py:308\n", + " |- 0.461 result_iterator concurrent/futures/_base.py:602\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 0.461 lock.acquire \n", + " `- 0.344 compute_coef ccc/coef/impl.py:494\n", + " `- 0.343 cdist_func ccc/coef/impl.py:487\n", + " `- 0.343 cdist_parts_parallel ccc/coef/impl.py:193\n", + " |- 0.328 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 0.328 lock.acquire \n", + " `- 0.014 ccc/coef/impl.py:211\n", + " `- 0.009 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " [2 frames hidden] concurrent\n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, 
+ { + "cell_type": "markdown", + "id": "b0c07894", + "metadata": { + "papermill": { + "duration": 0.100749, + "end_time": "2021-12-02T04:35:31.053465", + "exception": false, + "start_time": "2021-12-02T04:35:30.952716", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100000`" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "7fb2ab2e-a6de-412b-9540-d00f8641290e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.262806Z", + "iopub.status.busy": "2021-12-02T04:35:31.262280Z", + "iopub.status.idle": "2021-12-02T04:35:31.264634Z", + "shell.execute_reply": "2021-12-02T04:35:31.264124Z" + }, + "papermill": { + "duration": 0.110468, + "end_time": "2021-12-02T04:35:31.264738", + "exception": false, + "start_time": "2021-12-02T04:35:31.154270", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100000" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "81765e91", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.472530Z", + "iopub.status.busy": "2021-12-02T04:35:31.472083Z", + "iopub.status.idle": "2021-12-02T04:35:31.475862Z", + "shell.execute_reply": "2021-12-02T04:35:31.475424Z" + }, + "papermill": { + "duration": 0.107444, + "end_time": "2021-12-02T04:35:31.476016", + "exception": false, + "start_time": "2021-12-02T04:35:31.368572", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "aca57100", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.892045Z", + "iopub.status.busy": "2021-12-02T04:35:31.891417Z", + "iopub.status.idle": "2021-12-02T04:36:46.681345Z", + "shell.execute_reply": "2021-12-02T04:36:46.681695Z" + }, + "papermill": { + "duration": 74.896315, + "end_time": "2021-12-02T04:36:46.681806", + 
"exception": false, + "start_time": "2021-12-02T04:35:31.785491", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.67 s ± 6.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "b9c25f30", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:36:46.889124Z", + "iopub.status.busy": "2021-12-02T04:36:46.888441Z", + "iopub.status.idle": "2021-12-02T04:36:51.544747Z", + "shell.execute_reply": "2021-12-02T04:36:51.544315Z" + }, + "papermill": { + "duration": 4.761869, + "end_time": "2021-12-02T04:36:51.544839", + "exception": false, + "start_time": "2021-12-02T04:36:46.782970", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 20:39:37 Samples: 147\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 1.672 CPU time: 6.485\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-a72911ea-456f-4e5b-8129-d7dd4e04f39f.json\n", + "\n", + "1.671 ../../../tmp/ipykernel_1959742/2991494264.py:2\n", + "`- 1.671 func ../../../tmp/ipykernel_1959742/2380024278.py:1\n", + " |- 1.654 ccc ccc/coef/impl.py:308\n", + " | |- 0.948 result_iterator concurrent/futures/_base.py:602\n", + " | | [4 frames hidden] concurrent, threading, \n", + " | | 0.948 lock.acquire \n", + " | |- 0.671 compute_coef ccc/coef/impl.py:494\n", + " | | `- 0.671 cdist_func ccc/coef/impl.py:487\n", + " | | `- 0.671 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | | |- 0.654 as_completed concurrent/futures/_base.py:201\n", + " | | | [4 frames hidden] concurrent, threading, \n", + " | | | 0.654 lock.acquire \n", + " | | `- 0.017 
ccc/coef/impl.py:211\n", + " | `- 0.020 [self] ccc/coef/impl.py\n", + " `- 0.018 [self] ../../../tmp/ipykernel_1959742/2380024278.py\n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aa76596-f126-4ba8-8bba-4d31225e0e5d", + "metadata": { + "papermill": { + "duration": 0.102208, + "end_time": "2021-12-02T04:36:51.764154", + "exception": false, + "start_time": "2021-12-02T04:36:51.661946", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all,-execution,-papermill,-trusted" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "papermill": { + "default_parameters": {}, + "duration": 167.355469, + "end_time": "2021-12-02T04:36:52.275357", + "environment_variables": {}, + "exception": null, + "input_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.ipynb", + "output_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.run.ipynb", + "parameters": {}, + "start_time": "2021-12-02T04:34:04.919888", + "version": "2.3.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/others/10_gpu_ari_profiling/02_cpu_version_more_profilers/00_1_core_cprofile.ipynb b/nbs/others/10_gpu_ari_profiling/02_cpu_version_more_profilers/00_1_core_cprofile.ipynb new file mode 100644 index 00000000..1681c539 --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/02_cpu_version_more_profilers/00_1_core_cprofile.ipynb @@ -0,0 +1,1556 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "36c65ddb-8246-4b0c-a231-68a373acc2cf", + "metadata": { + "papermill": { + "duration": 0.095646, + "end_time": "2021-12-02T04:34:06.384775", + "exception": false, + "start_time": "2021-12-02T04:34:06.289129", + "status": "completed" + }, + 
"tags": [] + }, + "source": [ + "# Description" + ] + }, + { + "cell_type": "markdown", + "id": "337633a8-d03e-4509-b89d-f8daee598958", + "metadata": { + "papermill": { + "duration": 0.091407, + "end_time": "2021-12-02T04:34:06.566258", + "exception": false, + "start_time": "2021-12-02T04:34:06.474851", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Single-threaded version" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5c010e62", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: CM_N_JOBS=1\n" + ] + } + ], + "source": [ + "# Set core count to 1\n", + "%env CM_N_JOBS=1" + ] + }, + { + "cell_type": "markdown", + "id": "63c2b099-cc44-4fe2-93d1-40336e0a8466", + "metadata": { + "papermill": { + "duration": 0.09056, + "end_time": "2021-12-02T04:34:06.747753", + "exception": false, + "start_time": "2021-12-02T04:34:06.657193", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Remove pycache dir" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "960c4ff0-2a73-4eaa-97d6-3269102233eb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:06.942330Z", + "iopub.status.busy": "2021-12-02T04:34:06.941822Z", + "iopub.status.idle": "2021-12-02T04:34:07.534005Z", + "shell.execute_reply": "2021-12-02T04:34:07.531916Z" + }, + "papermill": { + "duration": 0.696141, + "end_time": "2021-12-02T04:34:07.534420", + "exception": false, + "start_time": "2021-12-02T04:34:06.838279", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/haoyu/_database/projs/ccc-gpu\n" + ] + } + ], + "source": [ + "!echo ${CODE_DIR}" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e18db58a-316a-445b-a376-8b2ec18e08d8", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:07.733067Z", + "iopub.status.busy": "2021-12-02T04:34:07.732593Z", + 
"iopub.status.idle": "2021-12-02T04:34:08.445221Z", + "shell.execute_reply": "2021-12-02T04:34:08.443302Z" + }, + "papermill": { + "duration": 0.817887, + "end_time": "2021-12-02T04:34:08.445598", + "exception": false, + "start_time": "2021-12-02T04:34:07.627711", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/sklearn/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/scipy/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/coef/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/utils/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/pytorch/__pycache__\n" + ] + } + ], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b54099b0-a990-4bbd-bcbd-e206eb0f0f0e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:08.668700Z", + "iopub.status.busy": "2021-12-02T04:34:08.668226Z", + "iopub.status.idle": "2021-12-02T04:34:09.290588Z", + "shell.execute_reply": "2021-12-02T04:34:09.288752Z" + }, + "papermill": { + "duration": 0.719545, + "end_time": "2021-12-02T04:34:09.290958", + "exception": false, + "start_time": "2021-12-02T04:34:08.571413", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -prune -exec rm -rf {} \\;" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7a9a8098-8160-46bf-8d83-bc398cbe2382", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:09.513928Z", + "iopub.status.busy": "2021-12-02T04:34:09.513465Z", + "iopub.status.idle": "2021-12-02T04:34:10.120602Z", + "shell.execute_reply": "2021-12-02T04:34:10.118684Z" + }, + "papermill": { + "duration": 0.704384, + "end_time": 
"2021-12-02T04:34:10.120972", + "exception": false, + "start_time": "2021-12-02T04:34:09.416588", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "markdown", + "id": "c2251313-41ac-46fd-a845-0f209689ecf6", + "metadata": { + "papermill": { + "duration": 0.093051, + "end_time": "2021-12-02T04:34:10.338738", + "exception": false, + "start_time": "2021-12-02T04:34:10.245687", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Modules" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "987ef5f1-be49-4a6c-a4f4-b24a0a2094cb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:10.528319Z", + "iopub.status.busy": "2021-12-02T04:34:10.527833Z", + "iopub.status.idle": "2021-12-02T04:34:10.845421Z", + "shell.execute_reply": "2021-12-02T04:34:10.845020Z" + }, + "papermill": { + "duration": 0.414249, + "end_time": "2021-12-02T04:34:10.845518", + "exception": false, + "start_time": "2021-12-02T04:34:10.431269", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "from ccc.coef import ccc\n", + "import cProfile" + ] + }, + { + "cell_type": "markdown", + "id": "24399ccb-d33d-4bad-9baf-638c9c56feb2", + "metadata": { + "papermill": { + "duration": 0.095359, + "end_time": "2021-12-02T04:34:11.037941", + "exception": false, + "start_time": "2021-12-02T04:34:10.942582", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Settings" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c609cefa-f513-4cf8-9573-367744e31c5f", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.224902Z", + "iopub.status.busy": "2021-12-02T04:34:11.224429Z", + "iopub.status.idle": "2021-12-02T04:34:11.226352Z", + "shell.execute_reply": "2021-12-02T04:34:11.226694Z" + }, + "papermill": { + "duration": 0.097459, + "end_time": 
"2021-12-02T04:34:11.226809", + "exception": false, + "start_time": "2021-12-02T04:34:11.129350", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_REPS = 10\n", + "np.random.seed(0)" + ] + }, + { + "cell_type": "markdown", + "id": "6fd3067b-a4f7-475e-9575-20246934537d", + "metadata": { + "papermill": { + "duration": 0.092004, + "end_time": "2021-12-02T04:34:11.602824", + "exception": false, + "start_time": "2021-12-02T04:34:11.510820", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "02e0507c-43ff-4693-8a3b-8ccd8f23168c", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.790398Z", + "iopub.status.busy": "2021-12-02T04:34:11.789933Z", + "iopub.status.idle": "2021-12-02T04:34:20.454299Z", + "shell.execute_reply": "2021-12-02T04:34:20.453925Z" + }, + "papermill": { + "duration": 8.760307, + "end_time": "2021-12-02T04:34:20.454396", + "exception": false, + "start_time": "2021-12-02T04:34:11.694089", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.15625" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let numba compile all the code before profiling\n", + "ccc(np.random.rand(10), np.random.rand(10))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "bc5236a7", + "metadata": {}, + "outputs": [], + "source": [ + "def func(n_reps=N_REPS):\n", + " for i in range(n_reps):\n", + " ccc(x, y, n_jobs=1)" + ] + }, + { + "cell_type": "markdown", + "id": "8549179d-1517-4a40-9a51-b95dc02d0fcc", + "metadata": { + "papermill": { + "duration": 0.092955, + "end_time": "2021-12-02T04:34:20.640996", + "exception": false, + "start_time": "2021-12-02T04:34:20.548041", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` small" + ] + }, + { + "cell_type": "markdown", + 
"id": "13ba811b", + "metadata": { + "papermill": { + "duration": 0.092926, + "end_time": "2021-12-02T04:34:20.826776", + "exception": false, + "start_time": "2021-12-02T04:34:20.733850", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50`" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "68064f0b", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.026855Z", + "iopub.status.busy": "2021-12-02T04:34:21.026243Z", + "iopub.status.idle": "2021-12-02T04:34:21.028562Z", + "shell.execute_reply": "2021-12-02T04:34:21.027971Z" + }, + "papermill": { + "duration": 0.107158, + "end_time": "2021-12-02T04:34:21.028689", + "exception": false, + "start_time": "2021-12-02T04:34:20.921531", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6f2ff213-d6b7-458f-acbf-8c73dd497a2a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.225799Z", + "iopub.status.busy": "2021-12-02T04:34:21.225350Z", + "iopub.status.idle": "2021-12-02T04:34:21.226800Z", + "shell.execute_reply": "2021-12-02T04:34:21.227141Z" + }, + "papermill": { + "duration": 0.09836, + "end_time": "2021-12-02T04:34:21.227254", + "exception": false, + "start_time": "2021-12-02T04:34:21.128894", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c0abc65a-2f3c-4476-9c2a-f8d7753b75e6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.614745Z", + "iopub.status.busy": "2021-12-02T04:34:21.614223Z", + "iopub.status.idle": "2021-12-02T04:34:33.925655Z", + "shell.execute_reply": "2021-12-02T04:34:33.925209Z" + }, + "papermill": { + "duration": 12.410211, + "end_time": "2021-12-02T04:34:33.925764", + "exception": false, 
+ "start_time": "2021-12-02T04:34:21.515553", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8.51 ms ± 83.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e80a7b02-310f-4bd9-90d5-2a41186db39e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.125850Z", + "iopub.status.busy": "2021-12-02T04:34:34.125263Z", + "iopub.status.idle": "2021-12-02T04:34:34.148529Z", + "shell.execute_reply": "2021-12-02T04:34:34.148886Z" + }, + "papermill": { + "duration": 0.124845, + "end_time": "2021-12-02T04:34:34.149006", + "exception": false, + "start_time": "2021-12-02T04:34:34.024161", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 12:50:58 Samples: 10\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.011 CPU time: 0.013\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-76f1e206-b6a7-40f2-8d7f-4e421d5c241e.json\n", + "\n", + "0.010 ../../../tmp/ipykernel_359614/2991494264.py:2\n", + "`- 0.010 func ../../../tmp/ipykernel_359614/2661685993.py:1\n", + " `- 0.010 ccc ccc/coef/impl.py:308\n", + " `- 0.010 compute_coef ccc/coef/impl.py:493\n", + " `- 0.010 cdist_func ccc/coef/impl.py:486\n", + " `- 0.010 cdist_parts_parallel ccc/coef/impl.py:193\n", + " |- 0.009 as_completed concurrent/futures/_base.py:201\n", + " | [5 frames hidden] concurrent, threading, \n", + " | 0.008 lock.acquire \n", + " `- 0.001 [self] ccc/coef/impl.py\n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": 
"markdown", + "id": "2548440c", + "metadata": { + "papermill": { + "duration": 0.094866, + "end_time": "2021-12-02T04:34:34.338010", + "exception": false, + "start_time": "2021-12-02T04:34:34.243144", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100`" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "ddb768d7-9b74-424c-bdb9-9e52b9e5b181", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.530550Z", + "iopub.status.busy": "2021-12-02T04:34:34.530085Z", + "iopub.status.idle": "2021-12-02T04:34:34.531559Z", + "shell.execute_reply": "2021-12-02T04:34:34.531910Z" + }, + "papermill": { + "duration": 0.099465, + "end_time": "2021-12-02T04:34:34.532025", + "exception": false, + "start_time": "2021-12-02T04:34:34.432560", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "161cf922-41f7-4ced-af1f-e3d25b36b200", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.723894Z", + "iopub.status.busy": "2021-12-02T04:34:34.723405Z", + "iopub.status.idle": "2021-12-02T04:34:34.725017Z", + "shell.execute_reply": "2021-12-02T04:34:34.725362Z" + }, + "papermill": { + "duration": 0.099541, + "end_time": "2021-12-02T04:34:34.725479", + "exception": false, + "start_time": "2021-12-02T04:34:34.625938", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0a7f2b1f-4b87-4dde-a46b-2acec5ac93ba", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:35.120950Z", + "iopub.status.busy": "2021-12-02T04:34:35.120480Z", + "iopub.status.idle": "2021-12-02T04:34:37.984893Z", + "shell.execute_reply": "2021-12-02T04:34:37.985354Z" + }, + "papermill": { + "duration": 2.971389, + "end_time": 
"2021-12-02T04:34:37.985494", + "exception": false, + "start_time": "2021-12-02T04:34:35.014105", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "18.4 ms ± 219 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "74acbe27-9807-4f26-8b7e-b74d0f745b63", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.190471Z", + "iopub.status.busy": "2021-12-02T04:34:38.189361Z", + "iopub.status.idle": "2021-12-02T04:34:38.227298Z", + "shell.execute_reply": "2021-12-02T04:34:38.227692Z" + }, + "papermill": { + "duration": 0.13744, + "end_time": "2021-12-02T04:34:38.227812", + "exception": false, + "start_time": "2021-12-02T04:34:38.090372", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 12:51:13 Samples: 18\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.020 CPU time: 0.022\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-76f1e206-b6a7-40f2-8d7f-4e421d5c241e.json\n", + "\n", + "0.019 ../../../tmp/ipykernel_359614/2991494264.py:2\n", + "`- 0.019 func ../../../tmp/ipykernel_359614/2661685993.py:1\n", + " |- 0.018 ccc ccc/coef/impl.py:308\n", + " | |- 0.015 compute_coef ccc/coef/impl.py:493\n", + " | | `- 0.015 cdist_func ccc/coef/impl.py:486\n", + " | | `- 0.015 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | | |- 0.013 as_completed concurrent/futures/_base.py:201\n", + " | | | [4 frames hidden] concurrent, threading, \n", + " | | | 0.013 lock.acquire \n", + " | | |- 0.001 ccc/utils/utility_functions.py:117\n", + " | | `- 0.001 ccc/coef/impl.py:211\n", + " | | `- 0.001 
ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " | | [5 frames hidden] concurrent, threading\n", + " | |- 0.002 result_iterator concurrent/futures/_base.py:602\n", + " | | [4 frames hidden] concurrent, threading, \n", + " | `- 0.001 ThreadPoolExecutor.__exit__ concurrent/futures/_base.py:636\n", + " | [5 frames hidden] concurrent, threading, \n", + " `- 0.001 [self] ../../../tmp/ipykernel_359614/2661685993.py\n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "611ff8e1", + "metadata": { + "papermill": { + "duration": 0.09562, + "end_time": "2021-12-02T04:34:38.420643", + "exception": false, + "start_time": "2021-12-02T04:34:38.325023", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=500`" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "4bcf4b42", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.617318Z", + "iopub.status.busy": "2021-12-02T04:34:38.616836Z", + "iopub.status.idle": "2021-12-02T04:34:38.618469Z", + "shell.execute_reply": "2021-12-02T04:34:38.618817Z" + }, + "papermill": { + "duration": 0.101998, + "end_time": "2021-12-02T04:34:38.618933", + "exception": false, + "start_time": "2021-12-02T04:34:38.516935", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 500" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "0bf2f21e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.813589Z", + "iopub.status.busy": "2021-12-02T04:34:38.813117Z", + "iopub.status.idle": "2021-12-02T04:34:38.815386Z", + "shell.execute_reply": "2021-12-02T04:34:38.815020Z" + }, + "papermill": { + "duration": 0.100616, + "end_time": "2021-12-02T04:34:38.815484", + "exception": false, + "start_time": "2021-12-02T04:34:38.714868", + "status": "completed" + }, + "tags": [] + }, + 
"outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "cbde4ce6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:39.216775Z", + "iopub.status.busy": "2021-12-02T04:34:39.216014Z", + "iopub.status.idle": "2021-12-02T04:34:43.223179Z", + "shell.execute_reply": "2021-12-02T04:34:43.223640Z" + }, + "papermill": { + "duration": 4.108094, + "end_time": "2021-12-02T04:34:43.223780", + "exception": false, + "start_time": "2021-12-02T04:34:39.115686", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "33.2 ms ± 178 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "1250547e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.427169Z", + "iopub.status.busy": "2021-12-02T04:34:43.426487Z", + "iopub.status.idle": "2021-12-02T04:34:43.474288Z", + "shell.execute_reply": "2021-12-02T04:34:43.473832Z" + }, + "papermill": { + "duration": 0.148908, + "end_time": "2021-12-02T04:34:43.474385", + "exception": false, + "start_time": "2021-12-02T04:34:43.325477", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 12:51:16 Samples: 30\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.035 CPU time: 0.037\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-76f1e206-b6a7-40f2-8d7f-4e421d5c241e.json\n", + "\n", + "0.034 ../../../tmp/ipykernel_359614/2991494264.py:2\n", + "`- 0.034 func ../../../tmp/ipykernel_359614/2661685993.py:1\n", + " `- 0.034 ccc 
ccc/coef/impl.py:308\n", + " |- 0.025 compute_coef ccc/coef/impl.py:493\n", + " | `- 0.025 cdist_func ccc/coef/impl.py:486\n", + " | `- 0.025 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | |- 0.022 as_completed concurrent/futures/_base.py:201\n", + " | | [4 frames hidden] concurrent, threading, \n", + " | | 0.022 lock.acquire \n", + " | |- 0.002 ccc/coef/impl.py:211\n", + " | | `- 0.002 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " | | [4 frames hidden] concurrent, threading\n", + " | `- 0.001 Future.result concurrent/futures/_base.py:418\n", + " |- 0.009 result_iterator concurrent/futures/_base.py:602\n", + " | [5 frames hidden] concurrent, threading, \n", + " | 0.008 lock.acquire \n", + " `- 0.001 ThreadPoolExecutor.__exit__ concurrent/futures/_base.py:636\n", + " [3 frames hidden] concurrent, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "6853c300", + "metadata": { + "papermill": { + "duration": 0.09707, + "end_time": "2021-12-02T04:34:43.669776", + "exception": false, + "start_time": "2021-12-02T04:34:43.572706", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=1000`" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "f77e8490", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.871037Z", + "iopub.status.busy": "2021-12-02T04:34:43.870573Z", + "iopub.status.idle": "2021-12-02T04:34:43.872099Z", + "shell.execute_reply": "2021-12-02T04:34:43.872442Z" + }, + "papermill": { + "duration": 0.103066, + "end_time": "2021-12-02T04:34:43.872558", + "exception": false, + "start_time": "2021-12-02T04:34:43.769492", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 1000" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "c99f544a", + "metadata": { + "execution": { + 
"iopub.execute_input": "2021-12-02T04:34:44.102128Z", + "iopub.status.busy": "2021-12-02T04:34:44.101540Z", + "iopub.status.idle": "2021-12-02T04:34:44.103376Z", + "shell.execute_reply": "2021-12-02T04:34:44.103817Z" + }, + "papermill": { + "duration": 0.13265, + "end_time": "2021-12-02T04:34:44.103967", + "exception": false, + "start_time": "2021-12-02T04:34:43.971317", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "9721b048", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.515965Z", + "iopub.status.busy": "2021-12-02T04:34:44.515462Z", + "iopub.status.idle": "2021-12-02T04:34:50.907897Z", + "shell.execute_reply": "2021-12-02T04:34:50.908362Z" + }, + "papermill": { + "duration": 6.496377, + "end_time": "2021-12-02T04:34:50.908503", + "exception": false, + "start_time": "2021-12-02T04:34:44.412126", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "55.5 ms ± 477 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "fd0f4dd6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.122723Z", + "iopub.status.busy": "2021-12-02T04:34:51.122244Z", + "iopub.status.idle": "2021-12-02T04:34:51.194219Z", + "shell.execute_reply": "2021-12-02T04:34:51.193813Z" + }, + "papermill": { + "duration": 0.179898, + "end_time": "2021-12-02T04:34:51.194319", + "exception": false, + "start_time": "2021-12-02T04:34:51.014421", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 12:51:21 Samples: 41\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.057 CPU time: 0.061\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-76f1e206-b6a7-40f2-8d7f-4e421d5c241e.json\n", + "\n", + "0.057 ../../../tmp/ipykernel_359614/2991494264.py:2\n", + "`- 0.057 func ../../../tmp/ipykernel_359614/2661685993.py:1\n", + " `- 0.057 ccc ccc/coef/impl.py:308\n", + " |- 0.040 compute_coef ccc/coef/impl.py:493\n", + " | `- 0.040 cdist_func ccc/coef/impl.py:486\n", + " | `- 0.040 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | `- 0.040 as_completed concurrent/futures/_base.py:201\n", + " | [5 frames hidden] concurrent, threading, \n", + " | 0.039 lock.acquire \n", + " `- 0.017 result_iterator concurrent/futures/_base.py:602\n", + " [4 frames hidden] concurrent, threading, \n", + " 0.017 lock.acquire \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "f6b9adcf-1da4-4496-b05f-47952fd80d7f", + "metadata": { + "papermill": { + "duration": 0.099861, + "end_time": 
"2021-12-02T04:34:51.394504", + "exception": false, + "start_time": "2021-12-02T04:34:51.294643", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` large" + ] + }, + { + "cell_type": "markdown", + "id": "8f2e407c", + "metadata": { + "papermill": { + "duration": 0.098767, + "end_time": "2021-12-02T04:34:51.593072", + "exception": false, + "start_time": "2021-12-02T04:34:51.494305", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50000`" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "c522396e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.793258Z", + "iopub.status.busy": "2021-12-02T04:34:51.792385Z", + "iopub.status.idle": "2021-12-02T04:34:51.794710Z", + "shell.execute_reply": "2021-12-02T04:34:51.795054Z" + }, + "papermill": { + "duration": 0.103749, + "end_time": "2021-12-02T04:34:51.795172", + "exception": false, + "start_time": "2021-12-02T04:34:51.691423", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50000" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "a5e536cc", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.996666Z", + "iopub.status.busy": "2021-12-02T04:34:51.996120Z", + "iopub.status.idle": "2021-12-02T04:34:51.999329Z", + "shell.execute_reply": "2021-12-02T04:34:51.998896Z" + }, + "papermill": { + "duration": 0.104554, + "end_time": "2021-12-02T04:34:51.999423", + "exception": false, + "start_time": "2021-12-02T04:34:51.894869", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "91470f64", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:52.408864Z", + "iopub.status.busy": "2021-12-02T04:34:52.408310Z", + "iopub.status.idle": 
"2021-12-02T04:35:28.383597Z", + "shell.execute_reply": "2021-12-02T04:35:28.384010Z" + }, + "papermill": { + "duration": 36.078113, + "end_time": "2021-12-02T04:35:28.384125", + "exception": false, + "start_time": "2021-12-02T04:34:52.306012", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.99 s ± 12.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "4de4e0b0", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:28.594907Z", + "iopub.status.busy": "2021-12-02T04:35:28.594395Z", + "iopub.status.idle": "2021-12-02T04:35:30.850079Z", + "shell.execute_reply": "2021-12-02T04:35:30.850466Z" + }, + "papermill": { + "duration": 2.36055, + "end_time": "2021-12-02T04:35:30.850581", + "exception": false, + "start_time": "2021-12-02T04:35:28.490031", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 12:52:09 Samples: 118\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 2.978 CPU time: 2.996\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-76f1e206-b6a7-40f2-8d7f-4e421d5c241e.json\n", + "\n", + "2.977 ../../../tmp/ipykernel_359614/2991494264.py:2\n", + "`- 2.977 func ../../../tmp/ipykernel_359614/2661685993.py:1\n", + " `- 2.966 ccc ccc/coef/impl.py:308\n", + " |- 2.100 compute_coef ccc/coef/impl.py:493\n", + " | `- 2.100 cdist_func ccc/coef/impl.py:486\n", + " | `- 2.100 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | `- 2.093 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 2.093 lock.acquire \n", + " `- 0.864 
result_iterator concurrent/futures/_base.py:602\n", + " [4 frames hidden] concurrent, threading, \n", + " 0.864 lock.acquire \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "b0c07894", + "metadata": { + "papermill": { + "duration": 0.100749, + "end_time": "2021-12-02T04:35:31.053465", + "exception": false, + "start_time": "2021-12-02T04:35:30.952716", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100000`" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7fb2ab2e-a6de-412b-9540-d00f8641290e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.262806Z", + "iopub.status.busy": "2021-12-02T04:35:31.262280Z", + "iopub.status.idle": "2021-12-02T04:35:31.264634Z", + "shell.execute_reply": "2021-12-02T04:35:31.264124Z" + }, + "papermill": { + "duration": 0.110468, + "end_time": "2021-12-02T04:35:31.264738", + "exception": false, + "start_time": "2021-12-02T04:35:31.154270", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100000" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "81765e91", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.472530Z", + "iopub.status.busy": "2021-12-02T04:35:31.472083Z", + "iopub.status.idle": "2021-12-02T04:35:31.475862Z", + "shell.execute_reply": "2021-12-02T04:35:31.475424Z" + }, + "papermill": { + "duration": 0.107444, + "end_time": "2021-12-02T04:35:31.476016", + "exception": false, + "start_time": "2021-12-02T04:35:31.368572", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "aca57100", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.892045Z", + 
"iopub.status.busy": "2021-12-02T04:35:31.891417Z", + "iopub.status.idle": "2021-12-02T04:36:46.681345Z", + "shell.execute_reply": "2021-12-02T04:36:46.681695Z" + }, + "papermill": { + "duration": 74.896315, + "end_time": "2021-12-02T04:36:46.681806", + "exception": false, + "start_time": "2021-12-02T04:35:31.785491", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "6.17 s ± 40 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9c25f30", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:36:46.889124Z", + "iopub.status.busy": "2021-12-02T04:36:46.888441Z", + "iopub.status.idle": "2021-12-02T04:36:51.544747Z", + "shell.execute_reply": "2021-12-02T04:36:51.544315Z" + }, + "papermill": { + "duration": 4.761869, + "end_time": "2021-12-02T04:36:51.544839", + "exception": false, + "start_time": "2021-12-02T04:36:46.782970", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 8394 function calls in 6.163 seconds\n", + "\n", + " Ordered by: internal time\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 790 6.112 0.008 6.112 0.008 {method 'acquire' of '_thread.lock' objects}\n", + " 1 0.017 0.017 6.163 6.163 2661685993.py:1(func)\n", + " 50 0.010 0.000 0.010 0.000 {built-in method numpy.zeros}\n", + " 10 0.008 0.001 6.145 0.615 impl.py:308(ccc)\n", + " 10 0.005 0.000 0.006 0.001 impl.py:211()\n", + " 10 0.001 0.000 4.302 0.430 impl.py:193(cdist_parts_parallel)\n", + " 100 0.001 0.000 4.294 0.043 _base.py:201(as_completed)\n", + " 200 0.001 0.000 6.113 0.031 threading.py:280(wait)\n", + " 10 0.001 0.000 4.303 0.430 impl.py:493(compute_coef)\n", + " 190 0.000 0.000 0.001 0.000 _base.py:179(_yield_finished_futures)\n", + " 10 
0.000 0.000 0.000 0.000 {built-in method _thread.start_new_thread}\n", + " 100 0.000 0.000 0.004 0.000 thread.py:161(submit)\n", + " 100 0.000 0.000 4.293 0.043 threading.py:563(wait)\n", + " 100 0.000 0.000 1.821 0.018 _base.py:418(result)\n", + " 480 0.000 0.000 0.000 0.000 threading.py:259(__exit__)\n", + " 130 0.000 0.000 0.000 0.000 threading.py:228(__init__)\n", + " 20 0.000 0.000 0.000 0.000 impl.py:243(get_chunks)\n", + " 480 0.000 0.000 0.000 0.000 threading.py:256(__enter__)\n", + " 100 0.000 0.000 0.001 0.000 threading.py:411(acquire)\n", + " 10 0.000 0.000 4.302 0.430 impl.py:486(cdist_func)\n", + " 100 0.000 0.000 0.003 0.000 thread.py:180(_adjust_thread_count)\n", + " 10 0.000 0.000 0.000 0.000 {method 'reduce' of 'numpy.ufunc' objects}\n", + " 190 0.000 0.000 0.000 0.000 threading.py:268(_acquire_restore)\n", + " 90 0.000 0.000 0.000 0.000 threading.py:553(clear)\n", + " 100 0.000 0.000 0.000 0.000 _base.py:318(__init__)\n", + " 40 0.000 0.000 0.000 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function}\n", + " 110 0.000 0.000 0.000 0.000 {method 'put' of '_queue.SimpleQueue' objects}\n", + " 30 0.000 0.000 0.000 0.000 {built-in method numpy.arange}\n", + " 10 0.000 0.000 0.000 0.000 fromnumeric.py:69(_wrapreduction)\n", + " 600 0.000 0.000 0.000 0.000 {method '__exit__' of '_thread.lock' objects}\n", + " 140 0.000 0.000 0.000 0.000 utility_functions.py:117()\n", + " 10 0.000 0.000 0.000 0.000 thread.py:123(__init__)\n", + " 190 0.000 0.000 0.000 0.000 threading.py:271(_is_owned)\n", + " 190 0.000 0.000 0.000 0.000 threading.py:265(_release_save)\n", + " 30 0.000 0.000 0.000 0.000 numeric.py:289(full)\n", + " 250 0.000 0.000 0.000 0.000 {built-in method _thread.allocate_lock}\n", + " 20 0.000 0.000 1.821 0.091 _base.py:602(result_iterator)\n", + " 190 0.000 0.000 0.000 0.000 {method '__enter__' of '_thread.RLock' objects}\n", + " 180 0.000 0.000 0.000 0.000 {method 'remove' of 'set' objects}\n", + " 10 0.000 0.000 0.000 0.000 
threading.py:802(__init__)\n", + " 10 0.000 0.000 0.000 0.000 threading.py:992(_stop)\n", + " 30 0.000 0.000 0.000 0.000 utility_functions.py:109(chunker)\n", + " 10 0.000 0.000 0.001 0.000 thread.py:216(shutdown)\n", + " 30 0.000 0.000 0.000 0.000 {built-in method numpy.empty}\n", + " 90 0.000 0.000 0.000 0.000 {method 'remove' of 'list' objects}\n", + " 100 0.000 0.000 0.000 0.000 thread.py:47(__init__)\n", + " 100 0.000 0.000 0.000 0.000 _base.py:388(__get_result)\n", + " 100 0.000 0.000 0.000 0.000 threading.py:82(RLock)\n", + " 10 0.000 0.000 0.000 0.000 {method 'argmax' of 'numpy.ndarray' objects}\n", + " 100 0.000 0.000 0.000 0.000 {method 'pop' of 'list' objects}\n", + " 290 0.000 0.000 0.000 0.000 {method '__enter__' of '_thread.lock' objects}\n", + " 10 0.000 0.000 0.000 0.000 core.py:85(unravel_index_2d)\n", + " 30 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(copyto)\n", + " 10 0.000 0.000 0.001 0.000 threading.py:1028(join)\n", + " 10 0.000 0.000 0.000 0.000 {built-in method builtins.sorted}\n", + " 10 0.000 0.000 0.000 0.000 _base.py:157(_create_and_install_waiters)\n", + " 10 0.000 0.000 0.001 0.000 threading.py:1066(_wait_for_tstate_lock)\n", + " 10 0.000 0.000 0.000 0.000 fromnumeric.py:2638(amax)\n", + " 10 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(amax)\n", + " 10 0.000 0.000 0.000 0.000 impl.py:75(get_range_n_clusters)\n", + " 200 0.000 0.000 0.000 0.000 {method '__exit__' of '_thread.RLock' objects}\n", + " 20 0.000 0.000 0.000 0.000 threading.py:1358(current_thread)\n", + " 10 0.000 0.000 0.001 0.000 _base.py:636(__exit__)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.compile}\n", + " 10 0.000 0.000 0.002 0.000 threading.py:880(start)\n", + " 10 0.000 0.000 0.000 0.000 _base.py:79(__init__)\n", + " 200 0.000 0.000 0.000 0.000 {method 'append' of 'collections.deque' objects}\n", + " 10 0.000 0.000 0.000 0.000 threading.py:775(_maintain_shutdown_locks)\n", + " 10 0.000 0.000 0.000 0.000 
weakref.py:370(remove)\n", + " 10 0.000 0.000 0.000 0.000 {method '_acquire_restore' of '_thread.RLock' objects}\n", + " 100 0.000 0.000 0.000 0.000 {method 'reverse' of 'list' objects}\n", + " 10 0.000 0.000 0.000 0.000 _base.py:149(__enter__)\n", + " 180 0.000 0.000 0.000 0.000 {built-in method time.monotonic}\n", + " 200 0.000 0.000 0.000 0.000 {method 'release' of '_thread.lock' objects}\n", + " 10 0.000 0.000 0.000 0.000 ipkernel.py:763(init_closure)\n", + " 20 0.000 0.000 0.000 0.000 threading.py:528(__init__)\n", + " 10 0.000 0.000 0.002 0.000 _base.py:573(map)\n", + " 182 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", + " 10 0.000 0.000 0.002 0.000 _base.py:598()\n", + " 20 0.000 0.000 0.000 0.000 impl.py:285(get_feature_type_and_encode)\n", + " 10 0.000 0.000 0.000 0.000 _base.py:63(__init__)\n", + " 10 0.000 0.000 0.000 0.000 threading.py:405(__init__)\n", + " 10 0.000 0.000 0.000 0.000 threading.py:785()\n", + " 10 0.000 0.000 0.000 0.000 _weakrefset.py:39(_remove)\n", + " 14 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 10 0.000 0.000 0.000 0.000 impl.py:219(get_coords_from_index)\n", + " 10 0.000 0.000 0.000 0.000 _base.py:146(__init__)\n", + " 10 0.000 0.000 0.000 0.000 _base.py:153(__exit__)\n", + " 10 0.000 0.000 0.000 0.000 fromnumeric.py:70()\n", + " 2 0.000 0.000 6.163 3.081 interactiveshell.py:3511(run_code)\n", + " 10 0.000 0.000 0.000 0.000 _base.py:225()\n", + " 10 0.000 0.000 0.000 0.000 _weakrefset.py:86(add)\n", + " 20 0.000 0.000 0.000 0.000 threading.py:1147(daemon)\n", + " 10 0.000 0.000 0.000 0.000 {method 'difference_update' of 'set' objects}\n", + " 10 0.000 0.000 0.000 0.000 weakref.py:428(__setitem__)\n", + " 10 0.000 0.000 0.000 0.000 threading.py:1229(_make_invoke_excepthook)\n", + " 90 0.000 0.000 0.000 0.000 {method 'remove' of 'collections.deque' objects}\n", + " 90 0.000 0.000 0.000 0.000 {method 'acquire' of '_thread.RLock' objects}\n", + " 2 0.000 0.000 0.000 0.000 
interactiveshell.py:3336(_update_code_co_name)\n", + " 10 0.000 0.000 0.000 0.000 {built-in method numpy.asarray}\n", + " 30 0.000 0.000 0.000 0.000 multiarray.py:1071(copyto)\n", + " 90 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", + " 30 0.000 0.000 0.000 0.000 {method 'locked' of '_thread.lock' objects}\n", + " 20 0.000 0.000 0.000 0.000 {built-in method builtins.isinstance}\n", + " 20 0.000 0.000 0.000 0.000 threading.py:536(is_set)\n", + " 20 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 2 0.000 0.000 0.000 0.000 codeop.py:142(__call__)\n", + " 4 0.000 0.000 0.000 0.000 dis.py:449(findlinestarts)\n", + " 30 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", + " 10 0.000 0.000 0.000 0.000 {method 'items' of 'dict' objects}\n", + " 90 0.000 0.000 0.000 0.000 {method 'release' of '_thread.RLock' objects}\n", + " 10 0.000 0.000 0.000 0.000 {method '_release_save' of '_thread.RLock' objects}\n", + " 2 0.000 0.000 0.000 0.000 traitlets.py:676(__get__)\n", + " 2 0.000 0.000 0.000 0.000 contextlib.py:86(__init__)\n", + " 10 0.000 0.000 0.000 0.000 fromnumeric.py:2633(_amax_dispatcher)\n", + " 2 0.000 0.000 0.000 0.000 hooks.py:103(__call__)\n", + " 10 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 2 0.000 0.000 0.000 0.000 traitlets.py:629(get)\n", + " 4 0.000 0.000 0.000 0.000 compilerop.py:166(extra_flags)\n", + " 1 0.000 0.000 0.000 0.000 689822237.py:4()\n", + " 2 0.000 0.000 0.000 0.000 contextlib.py:261(helper)\n", + " 6 0.000 0.000 0.000 0.000 {built-in method builtins.next}\n", + " 2 0.000 0.000 6.163 3.081 {built-in method builtins.exec}\n", + " 2 0.000 0.000 0.000 0.000 contextlib.py:114(__enter__)\n", + " 10 0.000 0.000 0.000 0.000 {method '_is_owned' of '_thread.RLock' objects}\n", + " 10 0.000 0.000 0.000 0.000 _base.py:633(__enter__)\n", + " 2 0.000 0.000 0.000 0.000 contextlib.py:123(__exit__)\n", + " 2 0.000 0.000 0.000 0.000 ipstruct.py:125(__getattr__)\n", + " 2 0.000 0.000 0.000 
0.000 interactiveshell.py:3448(compare)\n", + " 2 0.000 0.000 0.000 0.000 {method 'replace' of 'code' objects}\n", + " 2 0.000 0.000 0.000 0.000 interactiveshell.py:1301(user_global_ns)\n", + " 1 0.000 0.000 6.163 6.163 689822237.py:3()\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 2 0.000 0.000 0.000 0.000 hooks.py:168(pre_run_code_hook)\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 4 0.000 0.000 0.000 0.000 typing.py:1375(cast)\n", + "\n", + "\n" + ] + } + ], + "source": [ + "%%prun -s cumulative -l 20\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aa76596-f126-4ba8-8bba-4d31225e0e5d", + "metadata": { + "papermill": { + "duration": 0.102208, + "end_time": "2021-12-02T04:36:51.764154", + "exception": false, + "start_time": "2021-12-02T04:36:51.661946", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all,-execution,-papermill,-trusted" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "papermill": { + "default_parameters": {}, + "duration": 167.355469, + "end_time": "2021-12-02T04:36:52.275357", + "environment_variables": {}, + "exception": null, + "input_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.ipynb", + "output_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.run.ipynb", + "parameters": {}, + "start_time": "2021-12-02T04:34:04.919888", + "version": "2.3.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/others/10_gpu_ari_profiling/02_cpu_version_more_profilers/00_1_core_pyinstrument.ipynb b/nbs/others/10_gpu_ari_profiling/02_cpu_version_more_profilers/00_1_core_pyinstrument.ipynb new file mode 100644 index 00000000..347d1eba --- /dev/null +++ 
b/nbs/others/10_gpu_ari_profiling/02_cpu_version_more_profilers/00_1_core_pyinstrument.ipynb @@ -0,0 +1,1437 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "36c65ddb-8246-4b0c-a231-68a373acc2cf", + "metadata": { + "papermill": { + "duration": 0.095646, + "end_time": "2021-12-02T04:34:06.384775", + "exception": false, + "start_time": "2021-12-02T04:34:06.289129", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Description" + ] + }, + { + "cell_type": "markdown", + "id": "337633a8-d03e-4509-b89d-f8daee598958", + "metadata": { + "papermill": { + "duration": 0.091407, + "end_time": "2021-12-02T04:34:06.566258", + "exception": false, + "start_time": "2021-12-02T04:34:06.474851", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Single-threaded version, force to use `cdist_parts_basic` to see the overhead of `air` " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5c010e62", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: CM_N_JOBS=1\n" + ] + } + ], + "source": [ + "# Set core count to 1\n", + "%env CM_N_JOBS=1" + ] + }, + { + "cell_type": "markdown", + "id": "63c2b099-cc44-4fe2-93d1-40336e0a8466", + "metadata": { + "papermill": { + "duration": 0.09056, + "end_time": "2021-12-02T04:34:06.747753", + "exception": false, + "start_time": "2021-12-02T04:34:06.657193", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Remove pycache dir" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "960c4ff0-2a73-4eaa-97d6-3269102233eb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:06.942330Z", + "iopub.status.busy": "2021-12-02T04:34:06.941822Z", + "iopub.status.idle": "2021-12-02T04:34:07.534005Z", + "shell.execute_reply": "2021-12-02T04:34:07.531916Z" + }, + "papermill": { + "duration": 0.696141, + "end_time": "2021-12-02T04:34:07.534420", + "exception": false, + "start_time": 
"2021-12-02T04:34:06.838279", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/haoyu/_database/projs/ccc-gpu\n" + ] + } + ], + "source": [ + "!echo ${CODE_DIR}" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e18db58a-316a-445b-a376-8b2ec18e08d8", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:07.733067Z", + "iopub.status.busy": "2021-12-02T04:34:07.732593Z", + "iopub.status.idle": "2021-12-02T04:34:08.445221Z", + "shell.execute_reply": "2021-12-02T04:34:08.443302Z" + }, + "papermill": { + "duration": 0.817887, + "end_time": "2021-12-02T04:34:08.445598", + "exception": false, + "start_time": "2021-12-02T04:34:07.627711", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/__pycache__\n" + ] + } + ], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b54099b0-a990-4bbd-bcbd-e206eb0f0f0e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:08.668700Z", + "iopub.status.busy": "2021-12-02T04:34:08.668226Z", + "iopub.status.idle": "2021-12-02T04:34:09.290588Z", + "shell.execute_reply": "2021-12-02T04:34:09.288752Z" + }, + "papermill": { + "duration": 0.719545, + "end_time": "2021-12-02T04:34:09.290958", + "exception": false, + "start_time": "2021-12-02T04:34:08.571413", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -prune -exec rm -rf {} \\;" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7a9a8098-8160-46bf-8d83-bc398cbe2382", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:09.513928Z", + "iopub.status.busy": "2021-12-02T04:34:09.513465Z", + "iopub.status.idle": 
"2021-12-02T04:34:10.120602Z", + "shell.execute_reply": "2021-12-02T04:34:10.118684Z" + }, + "papermill": { + "duration": 0.704384, + "end_time": "2021-12-02T04:34:10.120972", + "exception": false, + "start_time": "2021-12-02T04:34:09.416588", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "markdown", + "id": "c2251313-41ac-46fd-a845-0f209689ecf6", + "metadata": { + "papermill": { + "duration": 0.093051, + "end_time": "2021-12-02T04:34:10.338738", + "exception": false, + "start_time": "2021-12-02T04:34:10.245687", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Modules" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "987ef5f1-be49-4a6c-a4f4-b24a0a2094cb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:10.528319Z", + "iopub.status.busy": "2021-12-02T04:34:10.527833Z", + "iopub.status.idle": "2021-12-02T04:34:10.845421Z", + "shell.execute_reply": "2021-12-02T04:34:10.845020Z" + }, + "papermill": { + "duration": 0.414249, + "end_time": "2021-12-02T04:34:10.845518", + "exception": false, + "start_time": "2021-12-02T04:34:10.431269", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "from ccc.coef import ccc\n", + "%load_ext pyinstrument" + ] + }, + { + "cell_type": "markdown", + "id": "24399ccb-d33d-4bad-9baf-638c9c56feb2", + "metadata": { + "papermill": { + "duration": 0.095359, + "end_time": "2021-12-02T04:34:11.037941", + "exception": false, + "start_time": "2021-12-02T04:34:10.942582", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Settings" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c609cefa-f513-4cf8-9573-367744e31c5f", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.224902Z", + "iopub.status.busy": "2021-12-02T04:34:11.224429Z", + 
"iopub.status.idle": "2021-12-02T04:34:11.226352Z", + "shell.execute_reply": "2021-12-02T04:34:11.226694Z" + }, + "papermill": { + "duration": 0.097459, + "end_time": "2021-12-02T04:34:11.226809", + "exception": false, + "start_time": "2021-12-02T04:34:11.129350", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_REPS = 10\n", + "np.random.seed(0)" + ] + }, + { + "cell_type": "markdown", + "id": "6fd3067b-a4f7-475e-9575-20246934537d", + "metadata": { + "papermill": { + "duration": 0.092004, + "end_time": "2021-12-02T04:34:11.602824", + "exception": false, + "start_time": "2021-12-02T04:34:11.510820", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "02e0507c-43ff-4693-8a3b-8ccd8f23168c", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.790398Z", + "iopub.status.busy": "2021-12-02T04:34:11.789933Z", + "iopub.status.idle": "2021-12-02T04:34:20.454299Z", + "shell.execute_reply": "2021-12-02T04:34:20.453925Z" + }, + "papermill": { + "duration": 8.760307, + "end_time": "2021-12-02T04:34:20.454396", + "exception": false, + "start_time": "2021-12-02T04:34:11.694089", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.15625" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let numba compile all the code before profiling\n", + "ccc(np.random.rand(10), np.random.rand(10))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "bc5236a7", + "metadata": {}, + "outputs": [], + "source": [ + "def func(n_reps=N_REPS):\n", + " for i in range(n_reps):\n", + " ccc(x, y, n_jobs=1)" + ] + }, + { + "cell_type": "markdown", + "id": "8549179d-1517-4a40-9a51-b95dc02d0fcc", + "metadata": { + "papermill": { + "duration": 0.092955, + "end_time": "2021-12-02T04:34:20.640996", + "exception": false, + "start_time": 
"2021-12-02T04:34:20.548041", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` small" + ] + }, + { + "cell_type": "markdown", + "id": "13ba811b", + "metadata": { + "papermill": { + "duration": 0.092926, + "end_time": "2021-12-02T04:34:20.826776", + "exception": false, + "start_time": "2021-12-02T04:34:20.733850", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50`" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "68064f0b", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.026855Z", + "iopub.status.busy": "2021-12-02T04:34:21.026243Z", + "iopub.status.idle": "2021-12-02T04:34:21.028562Z", + "shell.execute_reply": "2021-12-02T04:34:21.027971Z" + }, + "papermill": { + "duration": 0.107158, + "end_time": "2021-12-02T04:34:21.028689", + "exception": false, + "start_time": "2021-12-02T04:34:20.921531", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6f2ff213-d6b7-458f-acbf-8c73dd497a2a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.225799Z", + "iopub.status.busy": "2021-12-02T04:34:21.225350Z", + "iopub.status.idle": "2021-12-02T04:34:21.226800Z", + "shell.execute_reply": "2021-12-02T04:34:21.227141Z" + }, + "papermill": { + "duration": 0.09836, + "end_time": "2021-12-02T04:34:21.227254", + "exception": false, + "start_time": "2021-12-02T04:34:21.128894", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c0abc65a-2f3c-4476-9c2a-f8d7753b75e6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.614745Z", + "iopub.status.busy": "2021-12-02T04:34:21.614223Z", + "iopub.status.idle": "2021-12-02T04:34:33.925655Z", + 
"shell.execute_reply": "2021-12-02T04:34:33.925209Z" + }, + "papermill": { + "duration": 12.410211, + "end_time": "2021-12-02T04:34:33.925764", + "exception": false, + "start_time": "2021-12-02T04:34:21.515553", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8.63 ms ± 147 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e80a7b02-310f-4bd9-90d5-2a41186db39e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.125850Z", + "iopub.status.busy": "2021-12-02T04:34:34.125263Z", + "iopub.status.idle": "2021-12-02T04:34:34.148529Z", + "shell.execute_reply": "2021-12-02T04:34:34.148886Z" + }, + "papermill": { + "duration": 0.124845, + "end_time": "2021-12-02T04:34:34.149006", + "exception": false, + "start_time": "2021-12-02T04:34:34.024161", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 20:30:54 Samples: 10\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.011 CPU time: 0.012\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-8902ea9b-48cc-4415-91ca-e3b7a7a3f867.json\n", + "\n", + "0.010 ../../../tmp/ipykernel_1934644/2991494264.py:2\n", + "`- 0.010 func ../../../tmp/ipykernel_1934644/2661685993.py:1\n", + " `- 0.010 ccc ccc/coef/impl.py:308\n", + " |- 0.008 compute_coef ccc/coef/impl.py:494\n", + " | |- 0.006 cdist_func ccc/coef/impl.py:487\n", + " | | `- 0.006 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | | |- 0.005 as_completed concurrent/futures/_base.py:201\n", + " | | | [4 frames hidden] concurrent, threading, \n", + " | | | 0.005 lock.acquire \n", + " | | 
`- 0.001 [self] ccc/coef/impl.py\n", + " | |- 0.001 [self] ccc/coef/impl.py\n", + " | `- 0.001 amax <__array_function__ internals>:2\n", + " | [4 frames hidden] <__array_function__ internals>, numpy...\n", + " `- 0.002 result_iterator concurrent/futures/_base.py:602\n", + " [4 frames hidden] concurrent, threading, \n", + " 0.002 lock.acquire \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "2548440c", + "metadata": { + "papermill": { + "duration": 0.094866, + "end_time": "2021-12-02T04:34:34.338010", + "exception": false, + "start_time": "2021-12-02T04:34:34.243144", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100`" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "ddb768d7-9b74-424c-bdb9-9e52b9e5b181", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.530550Z", + "iopub.status.busy": "2021-12-02T04:34:34.530085Z", + "iopub.status.idle": "2021-12-02T04:34:34.531559Z", + "shell.execute_reply": "2021-12-02T04:34:34.531910Z" + }, + "papermill": { + "duration": 0.099465, + "end_time": "2021-12-02T04:34:34.532025", + "exception": false, + "start_time": "2021-12-02T04:34:34.432560", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "161cf922-41f7-4ced-af1f-e3d25b36b200", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.723894Z", + "iopub.status.busy": "2021-12-02T04:34:34.723405Z", + "iopub.status.idle": "2021-12-02T04:34:34.725017Z", + "shell.execute_reply": "2021-12-02T04:34:34.725362Z" + }, + "papermill": { + "duration": 0.099541, + "end_time": "2021-12-02T04:34:34.725479", + "exception": false, + "start_time": "2021-12-02T04:34:34.625938", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + 
"x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0a7f2b1f-4b87-4dde-a46b-2acec5ac93ba", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:35.120950Z", + "iopub.status.busy": "2021-12-02T04:34:35.120480Z", + "iopub.status.idle": "2021-12-02T04:34:37.984893Z", + "shell.execute_reply": "2021-12-02T04:34:37.985354Z" + }, + "papermill": { + "duration": 2.971389, + "end_time": "2021-12-02T04:34:37.985494", + "exception": false, + "start_time": "2021-12-02T04:34:35.014105", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "18.9 ms ± 289 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "74acbe27-9807-4f26-8b7e-b74d0f745b63", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.190471Z", + "iopub.status.busy": "2021-12-02T04:34:38.189361Z", + "iopub.status.idle": "2021-12-02T04:34:38.227298Z", + "shell.execute_reply": "2021-12-02T04:34:38.227692Z" + }, + "papermill": { + "duration": 0.13744, + "end_time": "2021-12-02T04:34:38.227812", + "exception": false, + "start_time": "2021-12-02T04:34:38.090372", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 20:31:10 Samples: 20\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.021 CPU time: 0.023\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-8902ea9b-48cc-4415-91ca-e3b7a7a3f867.json\n", + "\n", + "0.021 ../../../tmp/ipykernel_1934644/2991494264.py:2\n", + "`- 0.021 func 
../../../tmp/ipykernel_1934644/2661685993.py:1\n", + " `- 0.021 ccc ccc/coef/impl.py:308\n", + " |- 0.016 compute_coef ccc/coef/impl.py:494\n", + " | |- 0.014 cdist_func ccc/coef/impl.py:487\n", + " | | `- 0.014 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | | `- 0.014 as_completed concurrent/futures/_base.py:201\n", + " | | [4 frames hidden] concurrent, threading, \n", + " | | 0.014 lock.acquire \n", + " | |- 0.001 amax <__array_function__ internals>:2\n", + " | | [4 frames hidden] <__array_function__ internals>, numpy\n", + " | `- 0.001 [self] ccc/coef/impl.py\n", + " |- 0.003 ThreadPoolExecutor.__exit__ concurrent/futures/_base.py:636\n", + " | [5 frames hidden] concurrent, threading, \n", + " |- 0.001 [self] ccc/coef/impl.py\n", + " `- 0.001 function.map concurrent/futures/_base.py:573\n", + " [8 frames hidden] concurrent, threading, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "611ff8e1", + "metadata": { + "papermill": { + "duration": 0.09562, + "end_time": "2021-12-02T04:34:38.420643", + "exception": false, + "start_time": "2021-12-02T04:34:38.325023", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=500`" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "4bcf4b42", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.617318Z", + "iopub.status.busy": "2021-12-02T04:34:38.616836Z", + "iopub.status.idle": "2021-12-02T04:34:38.618469Z", + "shell.execute_reply": "2021-12-02T04:34:38.618817Z" + }, + "papermill": { + "duration": 0.101998, + "end_time": "2021-12-02T04:34:38.618933", + "exception": false, + "start_time": "2021-12-02T04:34:38.516935", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 500" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "0bf2f21e", + "metadata": { + "execution": { + 
"iopub.execute_input": "2021-12-02T04:34:38.813589Z", + "iopub.status.busy": "2021-12-02T04:34:38.813117Z", + "iopub.status.idle": "2021-12-02T04:34:38.815386Z", + "shell.execute_reply": "2021-12-02T04:34:38.815020Z" + }, + "papermill": { + "duration": 0.100616, + "end_time": "2021-12-02T04:34:38.815484", + "exception": false, + "start_time": "2021-12-02T04:34:38.714868", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "cbde4ce6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:39.216775Z", + "iopub.status.busy": "2021-12-02T04:34:39.216014Z", + "iopub.status.idle": "2021-12-02T04:34:43.223179Z", + "shell.execute_reply": "2021-12-02T04:34:43.223640Z" + }, + "papermill": { + "duration": 4.108094, + "end_time": "2021-12-02T04:34:43.223780", + "exception": false, + "start_time": "2021-12-02T04:34:39.115686", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "34 ms ± 344 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "1250547e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.427169Z", + "iopub.status.busy": "2021-12-02T04:34:43.426487Z", + "iopub.status.idle": "2021-12-02T04:34:43.474288Z", + "shell.execute_reply": "2021-12-02T04:34:43.473832Z" + }, + "papermill": { + "duration": 0.148908, + "end_time": "2021-12-02T04:34:43.474385", + "exception": false, + "start_time": "2021-12-02T04:34:43.325477", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 20:31:13 Samples: 30\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.036 CPU time: 0.039\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-8902ea9b-48cc-4415-91ca-e3b7a7a3f867.json\n", + "\n", + "0.036 ../../../tmp/ipykernel_1934644/2991494264.py:2\n", + "`- 0.036 func ../../../tmp/ipykernel_1934644/2661685993.py:1\n", + " `- 0.036 ccc ccc/coef/impl.py:308\n", + " |- 0.025 compute_coef ccc/coef/impl.py:494\n", + " | `- 0.025 cdist_func ccc/coef/impl.py:487\n", + " | `- 0.025 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | |- 0.023 as_completed concurrent/futures/_base.py:201\n", + " | | [4 frames hidden] concurrent, threading, \n", + " | | 0.023 lock.acquire \n", + " | `- 0.002 ccc/coef/impl.py:211\n", + " `- 0.011 result_iterator concurrent/futures/_base.py:602\n", + " [5 frames hidden] concurrent, threading, \n", + " 0.010 lock.acquire \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "6853c300", + "metadata": { + "papermill": { + "duration": 0.09707, + 
"end_time": "2021-12-02T04:34:43.669776", + "exception": false, + "start_time": "2021-12-02T04:34:43.572706", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=1000`" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "f77e8490", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.871037Z", + "iopub.status.busy": "2021-12-02T04:34:43.870573Z", + "iopub.status.idle": "2021-12-02T04:34:43.872099Z", + "shell.execute_reply": "2021-12-02T04:34:43.872442Z" + }, + "papermill": { + "duration": 0.103066, + "end_time": "2021-12-02T04:34:43.872558", + "exception": false, + "start_time": "2021-12-02T04:34:43.769492", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 1000" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "c99f544a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.102128Z", + "iopub.status.busy": "2021-12-02T04:34:44.101540Z", + "iopub.status.idle": "2021-12-02T04:34:44.103376Z", + "shell.execute_reply": "2021-12-02T04:34:44.103817Z" + }, + "papermill": { + "duration": 0.13265, + "end_time": "2021-12-02T04:34:44.103967", + "exception": false, + "start_time": "2021-12-02T04:34:43.971317", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "9721b048", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.515965Z", + "iopub.status.busy": "2021-12-02T04:34:44.515462Z", + "iopub.status.idle": "2021-12-02T04:34:50.907897Z", + "shell.execute_reply": "2021-12-02T04:34:50.908362Z" + }, + "papermill": { + "duration": 6.496377, + "end_time": "2021-12-02T04:34:50.908503", + "exception": false, + "start_time": "2021-12-02T04:34:44.412126", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "56.8 ms ± 342 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "fd0f4dd6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.122723Z", + "iopub.status.busy": "2021-12-02T04:34:51.122244Z", + "iopub.status.idle": "2021-12-02T04:34:51.194219Z", + "shell.execute_reply": "2021-12-02T04:34:51.193813Z" + }, + "papermill": { + "duration": 0.179898, + "end_time": "2021-12-02T04:34:51.194319", + "exception": false, + "start_time": "2021-12-02T04:34:51.014421", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 20:31:18 Samples: 42\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.061 CPU time: 0.064\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-8902ea9b-48cc-4415-91ca-e3b7a7a3f867.json\n", + "\n", + "0.060 ../../../tmp/ipykernel_1934644/2991494264.py:2\n", + "`- 0.060 func ../../../tmp/ipykernel_1934644/2661685993.py:1\n", + " `- 0.060 ccc ccc/coef/impl.py:308\n", + " |- 0.039 compute_coef ccc/coef/impl.py:494\n", + " | `- 0.039 cdist_func ccc/coef/impl.py:487\n", + " | `- 0.039 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | `- 0.039 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 0.039 lock.acquire \n", + " `- 0.021 result_iterator concurrent/futures/_base.py:602\n", + " [4 frames hidden] concurrent, threading, \n", + " 0.021 lock.acquire \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": 
"f6b9adcf-1da4-4496-b05f-47952fd80d7f", + "metadata": { + "papermill": { + "duration": 0.099861, + "end_time": "2021-12-02T04:34:51.394504", + "exception": false, + "start_time": "2021-12-02T04:34:51.294643", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` large" + ] + }, + { + "cell_type": "markdown", + "id": "8f2e407c", + "metadata": { + "papermill": { + "duration": 0.098767, + "end_time": "2021-12-02T04:34:51.593072", + "exception": false, + "start_time": "2021-12-02T04:34:51.494305", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50000`" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "c522396e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.793258Z", + "iopub.status.busy": "2021-12-02T04:34:51.792385Z", + "iopub.status.idle": "2021-12-02T04:34:51.794710Z", + "shell.execute_reply": "2021-12-02T04:34:51.795054Z" + }, + "papermill": { + "duration": 0.103749, + "end_time": "2021-12-02T04:34:51.795172", + "exception": false, + "start_time": "2021-12-02T04:34:51.691423", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50000" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "a5e536cc", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.996666Z", + "iopub.status.busy": "2021-12-02T04:34:51.996120Z", + "iopub.status.idle": "2021-12-02T04:34:51.999329Z", + "shell.execute_reply": "2021-12-02T04:34:51.998896Z" + }, + "papermill": { + "duration": 0.104554, + "end_time": "2021-12-02T04:34:51.999423", + "exception": false, + "start_time": "2021-12-02T04:34:51.894869", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "91470f64", + "metadata": { + "execution": { + "iopub.execute_input": 
"2021-12-02T04:34:52.408864Z", + "iopub.status.busy": "2021-12-02T04:34:52.408310Z", + "iopub.status.idle": "2021-12-02T04:35:28.383597Z", + "shell.execute_reply": "2021-12-02T04:35:28.384010Z" + }, + "papermill": { + "duration": 36.078113, + "end_time": "2021-12-02T04:35:28.384125", + "exception": false, + "start_time": "2021-12-02T04:34:52.306012", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3.03 s ± 14.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "4de4e0b0", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:28.594907Z", + "iopub.status.busy": "2021-12-02T04:35:28.594395Z", + "iopub.status.idle": "2021-12-02T04:35:30.850079Z", + "shell.execute_reply": "2021-12-02T04:35:30.850466Z" + }, + "papermill": { + "duration": 2.36055, + "end_time": "2021-12-02T04:35:30.850581", + "exception": false, + "start_time": "2021-12-02T04:35:28.490031", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 20:32:07 Samples: 117\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 3.032 CPU time: 3.049\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-8902ea9b-48cc-4415-91ca-e3b7a7a3f867.json\n", + "\n", + "3.032 ../../../tmp/ipykernel_1934644/2991494264.py:2\n", + "`- 3.032 func ../../../tmp/ipykernel_1934644/2661685993.py:1\n", + " `- 3.026 ccc ccc/coef/impl.py:308\n", + " |- 2.140 compute_coef ccc/coef/impl.py:494\n", + " | `- 2.140 cdist_func ccc/coef/impl.py:487\n", + " | `- 2.140 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | `- 2.135 as_completed 
concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 2.134 lock.acquire \n", + " `- 0.879 result_iterator concurrent/futures/_base.py:602\n", + " [4 frames hidden] concurrent, threading, \n", + " 0.879 lock.acquire \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "b0c07894", + "metadata": { + "papermill": { + "duration": 0.100749, + "end_time": "2021-12-02T04:35:31.053465", + "exception": false, + "start_time": "2021-12-02T04:35:30.952716", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100000`" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "7fb2ab2e-a6de-412b-9540-d00f8641290e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.262806Z", + "iopub.status.busy": "2021-12-02T04:35:31.262280Z", + "iopub.status.idle": "2021-12-02T04:35:31.264634Z", + "shell.execute_reply": "2021-12-02T04:35:31.264124Z" + }, + "papermill": { + "duration": 0.110468, + "end_time": "2021-12-02T04:35:31.264738", + "exception": false, + "start_time": "2021-12-02T04:35:31.154270", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100000" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "81765e91", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.472530Z", + "iopub.status.busy": "2021-12-02T04:35:31.472083Z", + "iopub.status.idle": "2021-12-02T04:35:31.475862Z", + "shell.execute_reply": "2021-12-02T04:35:31.475424Z" + }, + "papermill": { + "duration": 0.107444, + "end_time": "2021-12-02T04:35:31.476016", + "exception": false, + "start_time": "2021-12-02T04:35:31.368572", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 32, + "id": "aca57100", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.892045Z", + "iopub.status.busy": "2021-12-02T04:35:31.891417Z", + "iopub.status.idle": "2021-12-02T04:36:46.681345Z", + "shell.execute_reply": "2021-12-02T04:36:46.681695Z" + }, + "papermill": { + "duration": 74.896315, + "end_time": "2021-12-02T04:36:46.681806", + "exception": false, + "start_time": "2021-12-02T04:35:31.785491", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "6.21 s ± 25.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "b9c25f30", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:36:46.889124Z", + "iopub.status.busy": "2021-12-02T04:36:46.888441Z", + "iopub.status.idle": "2021-12-02T04:36:51.544747Z", + "shell.execute_reply": "2021-12-02T04:36:51.544315Z" + }, + "papermill": { + "duration": 4.761869, + "end_time": "2021-12-02T04:36:51.544839", + "exception": false, + "start_time": "2021-12-02T04:36:46.782970", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 20:33:49 Samples: 130\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 6.094 CPU time: 6.112\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-8902ea9b-48cc-4415-91ca-e3b7a7a3f867.json\n", + "\n", + "6.094 ../../../tmp/ipykernel_1934644/2991494264.py:2\n", + "`- 6.094 func ../../../tmp/ipykernel_1934644/2661685993.py:1\n", + " `- 6.080 ccc ccc/coef/impl.py:308\n", + " |- 4.257 compute_coef ccc/coef/impl.py:494\n", + " | `- 4.257 cdist_func ccc/coef/impl.py:487\n", + " | `- 4.257 
cdist_parts_parallel ccc/coef/impl.py:193\n", + " | `- 4.247 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 4.247 lock.acquire \n", + " `- 1.810 result_iterator concurrent/futures/_base.py:602\n", + " [4 frames hidden] concurrent, threading, \n", + " 1.810 lock.acquire \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all,-execution,-papermill,-trusted" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + }, + "papermill": { + "default_parameters": {}, + "duration": 167.355469, + "end_time": "2021-12-02T04:36:52.275357", + "environment_variables": {}, + "exception": null, + "input_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.ipynb", + "output_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.run.ipynb", + "parameters": {}, + "start_time": "2021-12-02T04:34:04.919888", + "version": "2.3.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/others/10_gpu_ari_profiling/02_cpu_version_more_profilers/00_24_core_pyinstrument.ipynb b/nbs/others/10_gpu_ari_profiling/02_cpu_version_more_profilers/00_24_core_pyinstrument.ipynb new file mode 100644 index 00000000..de15fd46 --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/02_cpu_version_more_profilers/00_24_core_pyinstrument.ipynb @@ -0,0 +1,1477 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "36c65ddb-8246-4b0c-a231-68a373acc2cf", + "metadata": { + "papermill": { + "duration": 0.095646, + "end_time": 
"2021-12-02T04:34:06.384775", + "exception": false, + "start_time": "2021-12-02T04:34:06.289129", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Description" + ] + }, + { + "cell_type": "markdown", + "id": "337633a8-d03e-4509-b89d-f8daee598958", + "metadata": { + "papermill": { + "duration": 0.091407, + "end_time": "2021-12-02T04:34:06.566258", + "exception": false, + "start_time": "2021-12-02T04:34:06.474851", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Single-threaded version, force to use `cdist_parts_basic` to see the overhead of `air` " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5c010e62", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: CM_N_JOBS=1\n" + ] + } + ], + "source": [ + "# Set core count to 1\n", + "# %env CM_N_JOBS=1" + ] + }, + { + "cell_type": "markdown", + "id": "63c2b099-cc44-4fe2-93d1-40336e0a8466", + "metadata": { + "papermill": { + "duration": 0.09056, + "end_time": "2021-12-02T04:34:06.747753", + "exception": false, + "start_time": "2021-12-02T04:34:06.657193", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Remove pycache dir" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "960c4ff0-2a73-4eaa-97d6-3269102233eb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:06.942330Z", + "iopub.status.busy": "2021-12-02T04:34:06.941822Z", + "iopub.status.idle": "2021-12-02T04:34:07.534005Z", + "shell.execute_reply": "2021-12-02T04:34:07.531916Z" + }, + "papermill": { + "duration": 0.696141, + "end_time": "2021-12-02T04:34:07.534420", + "exception": false, + "start_time": "2021-12-02T04:34:06.838279", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/haoyu/_database/projs/ccc-gpu\n" + ] + } + ], + "source": [ + "!echo ${CODE_DIR}" + ] + }, + { + "cell_type": "code", + "execution_count": 
3, + "id": "e18db58a-316a-445b-a376-8b2ec18e08d8", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:07.733067Z", + "iopub.status.busy": "2021-12-02T04:34:07.732593Z", + "iopub.status.idle": "2021-12-02T04:34:08.445221Z", + "shell.execute_reply": "2021-12-02T04:34:08.443302Z" + }, + "papermill": { + "duration": 0.817887, + "end_time": "2021-12-02T04:34:08.445598", + "exception": false, + "start_time": "2021-12-02T04:34:07.627711", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/sklearn/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/scipy/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/coef/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/utils/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/pytorch/__pycache__\n" + ] + } + ], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b54099b0-a990-4bbd-bcbd-e206eb0f0f0e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:08.668700Z", + "iopub.status.busy": "2021-12-02T04:34:08.668226Z", + "iopub.status.idle": "2021-12-02T04:34:09.290588Z", + "shell.execute_reply": "2021-12-02T04:34:09.288752Z" + }, + "papermill": { + "duration": 0.719545, + "end_time": "2021-12-02T04:34:09.290958", + "exception": false, + "start_time": "2021-12-02T04:34:08.571413", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -prune -exec rm -rf {} \\;" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7a9a8098-8160-46bf-8d83-bc398cbe2382", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:09.513928Z", + "iopub.status.busy": 
"2021-12-02T04:34:09.513465Z", + "iopub.status.idle": "2021-12-02T04:34:10.120602Z", + "shell.execute_reply": "2021-12-02T04:34:10.118684Z" + }, + "papermill": { + "duration": 0.704384, + "end_time": "2021-12-02T04:34:10.120972", + "exception": false, + "start_time": "2021-12-02T04:34:09.416588", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "markdown", + "id": "c2251313-41ac-46fd-a845-0f209689ecf6", + "metadata": { + "papermill": { + "duration": 0.093051, + "end_time": "2021-12-02T04:34:10.338738", + "exception": false, + "start_time": "2021-12-02T04:34:10.245687", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Modules" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "987ef5f1-be49-4a6c-a4f4-b24a0a2094cb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:10.528319Z", + "iopub.status.busy": "2021-12-02T04:34:10.527833Z", + "iopub.status.idle": "2021-12-02T04:34:10.845421Z", + "shell.execute_reply": "2021-12-02T04:34:10.845020Z" + }, + "papermill": { + "duration": 0.414249, + "end_time": "2021-12-02T04:34:10.845518", + "exception": false, + "start_time": "2021-12-02T04:34:10.431269", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "from ccc.coef import ccc\n", + "%load_ext pyinstrument" + ] + }, + { + "cell_type": "markdown", + "id": "24399ccb-d33d-4bad-9baf-638c9c56feb2", + "metadata": { + "papermill": { + "duration": 0.095359, + "end_time": "2021-12-02T04:34:11.037941", + "exception": false, + "start_time": "2021-12-02T04:34:10.942582", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Settings" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c609cefa-f513-4cf8-9573-367744e31c5f", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.224902Z", + 
"iopub.status.busy": "2021-12-02T04:34:11.224429Z", + "iopub.status.idle": "2021-12-02T04:34:11.226352Z", + "shell.execute_reply": "2021-12-02T04:34:11.226694Z" + }, + "papermill": { + "duration": 0.097459, + "end_time": "2021-12-02T04:34:11.226809", + "exception": false, + "start_time": "2021-12-02T04:34:11.129350", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_REPS = 10\n", + "np.random.seed(0)" + ] + }, + { + "cell_type": "markdown", + "id": "6fd3067b-a4f7-475e-9575-20246934537d", + "metadata": { + "papermill": { + "duration": 0.092004, + "end_time": "2021-12-02T04:34:11.602824", + "exception": false, + "start_time": "2021-12-02T04:34:11.510820", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "02e0507c-43ff-4693-8a3b-8ccd8f23168c", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.790398Z", + "iopub.status.busy": "2021-12-02T04:34:11.789933Z", + "iopub.status.idle": "2021-12-02T04:34:20.454299Z", + "shell.execute_reply": "2021-12-02T04:34:20.453925Z" + }, + "papermill": { + "duration": 8.760307, + "end_time": "2021-12-02T04:34:20.454396", + "exception": false, + "start_time": "2021-12-02T04:34:11.694089", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.15625" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let numba compile all the code before profiling\n", + "ccc(np.random.rand(10), np.random.rand(10))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "bc5236a7", + "metadata": {}, + "outputs": [], + "source": [ + "def func(n_reps=N_REPS):\n", + " for i in range(n_reps):\n", + " ccc(x, y, n_jobs=24)" + ] + }, + { + "cell_type": "markdown", + "id": "8549179d-1517-4a40-9a51-b95dc02d0fcc", + "metadata": { + "papermill": { + "duration": 0.092955, + "end_time": 
"2021-12-02T04:34:20.640996", + "exception": false, + "start_time": "2021-12-02T04:34:20.548041", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` small" + ] + }, + { + "cell_type": "markdown", + "id": "13ba811b", + "metadata": { + "papermill": { + "duration": 0.092926, + "end_time": "2021-12-02T04:34:20.826776", + "exception": false, + "start_time": "2021-12-02T04:34:20.733850", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50`" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "68064f0b", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.026855Z", + "iopub.status.busy": "2021-12-02T04:34:21.026243Z", + "iopub.status.idle": "2021-12-02T04:34:21.028562Z", + "shell.execute_reply": "2021-12-02T04:34:21.027971Z" + }, + "papermill": { + "duration": 0.107158, + "end_time": "2021-12-02T04:34:21.028689", + "exception": false, + "start_time": "2021-12-02T04:34:20.921531", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6f2ff213-d6b7-458f-acbf-8c73dd497a2a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.225799Z", + "iopub.status.busy": "2021-12-02T04:34:21.225350Z", + "iopub.status.idle": "2021-12-02T04:34:21.226800Z", + "shell.execute_reply": "2021-12-02T04:34:21.227141Z" + }, + "papermill": { + "duration": 0.09836, + "end_time": "2021-12-02T04:34:21.227254", + "exception": false, + "start_time": "2021-12-02T04:34:21.128894", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c0abc65a-2f3c-4476-9c2a-f8d7753b75e6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.614745Z", + "iopub.status.busy": 
"2021-12-02T04:34:21.614223Z", + "iopub.status.idle": "2021-12-02T04:34:33.925655Z", + "shell.execute_reply": "2021-12-02T04:34:33.925209Z" + }, + "papermill": { + "duration": 12.410211, + "end_time": "2021-12-02T04:34:33.925764", + "exception": false, + "start_time": "2021-12-02T04:34:21.515553", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "14.8 ms ± 83.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e80a7b02-310f-4bd9-90d5-2a41186db39e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.125850Z", + "iopub.status.busy": "2021-12-02T04:34:34.125263Z", + "iopub.status.idle": "2021-12-02T04:34:34.148529Z", + "shell.execute_reply": "2021-12-02T04:34:34.148886Z" + }, + "papermill": { + "duration": 0.124845, + "end_time": "2021-12-02T04:34:34.149006", + "exception": false, + "start_time": "2021-12-02T04:34:34.024161", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 20:45:50 Samples: 16\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.017 CPU time: 0.023\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-63ceb30c-8d12-4cd8-83db-a05c2fd442ee.json\n", + "\n", + "0.017 ../../../tmp/ipykernel_2035782/2991494264.py:2\n", + "`- 0.017 func ../../../tmp/ipykernel_2035782/2287242631.py:1\n", + " `- 0.017 ccc ccc/coef/impl.py:308\n", + " |- 0.013 compute_coef ccc/coef/impl.py:494\n", + " | `- 0.013 cdist_func ccc/coef/impl.py:487\n", + " | `- 0.013 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | |- 0.009 ccc/coef/impl.py:211\n", + " | | `- 0.009 
ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " | | [7 frames hidden] concurrent, threading, , ip...\n", + " | | 0.008 lock.acquire \n", + " | `- 0.004 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 0.004 lock.acquire \n", + " |- 0.002 function.map concurrent/futures/_base.py:573\n", + " | [8 frames hidden] concurrent, threading, \n", + " |- 0.001 result_iterator concurrent/futures/_base.py:602\n", + " | [3 frames hidden] concurrent, threading\n", + " `- 0.001 zeros \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "2548440c", + "metadata": { + "papermill": { + "duration": 0.094866, + "end_time": "2021-12-02T04:34:34.338010", + "exception": false, + "start_time": "2021-12-02T04:34:34.243144", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100`" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "ddb768d7-9b74-424c-bdb9-9e52b9e5b181", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.530550Z", + "iopub.status.busy": "2021-12-02T04:34:34.530085Z", + "iopub.status.idle": "2021-12-02T04:34:34.531559Z", + "shell.execute_reply": "2021-12-02T04:34:34.531910Z" + }, + "papermill": { + "duration": 0.099465, + "end_time": "2021-12-02T04:34:34.532025", + "exception": false, + "start_time": "2021-12-02T04:34:34.432560", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "161cf922-41f7-4ced-af1f-e3d25b36b200", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.723894Z", + "iopub.status.busy": "2021-12-02T04:34:34.723405Z", + "iopub.status.idle": "2021-12-02T04:34:34.725017Z", + "shell.execute_reply": "2021-12-02T04:34:34.725362Z" + }, + "papermill": { + 
"duration": 0.099541, + "end_time": "2021-12-02T04:34:34.725479", + "exception": false, + "start_time": "2021-12-02T04:34:34.625938", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0a7f2b1f-4b87-4dde-a46b-2acec5ac93ba", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:35.120950Z", + "iopub.status.busy": "2021-12-02T04:34:35.120480Z", + "iopub.status.idle": "2021-12-02T04:34:37.984893Z", + "shell.execute_reply": "2021-12-02T04:34:37.985354Z" + }, + "papermill": { + "duration": 2.971389, + "end_time": "2021-12-02T04:34:37.985494", + "exception": false, + "start_time": "2021-12-02T04:34:35.014105", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "25.1 ms ± 189 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "74acbe27-9807-4f26-8b7e-b74d0f745b63", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.190471Z", + "iopub.status.busy": "2021-12-02T04:34:38.189361Z", + "iopub.status.idle": "2021-12-02T04:34:38.227298Z", + "shell.execute_reply": "2021-12-02T04:34:38.227692Z" + }, + "papermill": { + "duration": 0.13744, + "end_time": "2021-12-02T04:34:38.227812", + "exception": false, + "start_time": "2021-12-02T04:34:38.090372", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 20:45:52 Samples: 25\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.028 CPU time: 0.048\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f 
/home/haoyu/.local/share/jupyter/runtime/kernel-63ceb30c-8d12-4cd8-83db-a05c2fd442ee.json\n", + "\n", + "0.027 ../../../tmp/ipykernel_2035782/2991494264.py:2\n", + "`- 0.027 func ../../../tmp/ipykernel_2035782/2287242631.py:1\n", + " `- 0.027 ccc ccc/coef/impl.py:308\n", + " |- 0.021 compute_coef ccc/coef/impl.py:494\n", + " | `- 0.021 cdist_func ccc/coef/impl.py:487\n", + " | `- 0.021 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | |- 0.012 ccc/coef/impl.py:211\n", + " | | `- 0.012 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " | | [8 frames hidden] concurrent, threading, \n", + " | | 0.009 lock.acquire \n", + " | |- 0.008 as_completed concurrent/futures/_base.py:201\n", + " | | [4 frames hidden] concurrent, threading, \n", + " | | 0.008 lock.acquire \n", + " | `- 0.001 [self] ccc/coef/impl.py\n", + " |- 0.005 function.map concurrent/futures/_base.py:573\n", + " | [10 frames hidden] concurrent, threading, \n", + " `- 0.001 ThreadPoolExecutor.__exit__ concurrent/futures/_base.py:636\n", + " [5 frames hidden] concurrent, threading, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "611ff8e1", + "metadata": { + "papermill": { + "duration": 0.09562, + "end_time": "2021-12-02T04:34:38.420643", + "exception": false, + "start_time": "2021-12-02T04:34:38.325023", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=500`" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "4bcf4b42", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.617318Z", + "iopub.status.busy": "2021-12-02T04:34:38.616836Z", + "iopub.status.idle": "2021-12-02T04:34:38.618469Z", + "shell.execute_reply": "2021-12-02T04:34:38.618817Z" + }, + "papermill": { + "duration": 0.101998, + "end_time": "2021-12-02T04:34:38.618933", + "exception": false, + "start_time": 
"2021-12-02T04:34:38.516935", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 500" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "0bf2f21e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.813589Z", + "iopub.status.busy": "2021-12-02T04:34:38.813117Z", + "iopub.status.idle": "2021-12-02T04:34:38.815386Z", + "shell.execute_reply": "2021-12-02T04:34:38.815020Z" + }, + "papermill": { + "duration": 0.100616, + "end_time": "2021-12-02T04:34:38.815484", + "exception": false, + "start_time": "2021-12-02T04:34:38.714868", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "cbde4ce6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:39.216775Z", + "iopub.status.busy": "2021-12-02T04:34:39.216014Z", + "iopub.status.idle": "2021-12-02T04:34:43.223179Z", + "shell.execute_reply": "2021-12-02T04:34:43.223640Z" + }, + "papermill": { + "duration": 4.108094, + "end_time": "2021-12-02T04:34:43.223780", + "exception": false, + "start_time": "2021-12-02T04:34:39.115686", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "29.4 ms ± 264 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "1250547e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.427169Z", + "iopub.status.busy": "2021-12-02T04:34:43.426487Z", + "iopub.status.idle": "2021-12-02T04:34:43.474288Z", + "shell.execute_reply": "2021-12-02T04:34:43.473832Z" + }, + "papermill": { + "duration": 0.148908, + "end_time": "2021-12-02T04:34:43.474385", + "exception": false, + "start_time": "2021-12-02T04:34:43.325477", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 20:45:55 Samples: 31\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.034 CPU time: 0.076\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-63ceb30c-8d12-4cd8-83db-a05c2fd442ee.json\n", + "\n", + "0.033 ../../../tmp/ipykernel_2035782/2991494264.py:2\n", + "`- 0.033 func ../../../tmp/ipykernel_2035782/2287242631.py:1\n", + " |- 0.032 ccc ccc/coef/impl.py:308\n", + " | |- 0.025 compute_coef ccc/coef/impl.py:494\n", + " | | `- 0.025 cdist_func ccc/coef/impl.py:487\n", + " | | `- 0.025 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | | |- 0.015 ccc/coef/impl.py:211\n", + " | | | `- 0.015 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " | | | [9 frames hidden] concurrent, threading, , we...\n", + " | | | 0.010 lock.acquire \n", + " | | |- 0.008 as_completed concurrent/futures/_base.py:201\n", + " | | | [4 frames hidden] concurrent, threading, \n", + " | | | 0.008 lock.acquire \n", + " | | `- 0.002 [self] ccc/coef/impl.py\n", + " | |- 0.003 function.map concurrent/futures/_base.py:573\n", + " | | [10 frames hidden] concurrent, threading, \n", + " | |- 0.002 result_iterator 
concurrent/futures/_base.py:602\n", + " | | [4 frames hidden] concurrent, threading, \n", + " | |- 0.001 ThreadPoolExecutor.__exit__ concurrent/futures/_base.py:636\n", + " | | [5 frames hidden] concurrent, threading, \n", + " | `- 0.001 zeros \n", + " `- 0.001 _remove _weakrefset.py:39\n", + " [2 frames hidden] _weakrefset, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "6853c300", + "metadata": { + "papermill": { + "duration": 0.09707, + "end_time": "2021-12-02T04:34:43.669776", + "exception": false, + "start_time": "2021-12-02T04:34:43.572706", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=1000`" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "f77e8490", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.871037Z", + "iopub.status.busy": "2021-12-02T04:34:43.870573Z", + "iopub.status.idle": "2021-12-02T04:34:43.872099Z", + "shell.execute_reply": "2021-12-02T04:34:43.872442Z" + }, + "papermill": { + "duration": 0.103066, + "end_time": "2021-12-02T04:34:43.872558", + "exception": false, + "start_time": "2021-12-02T04:34:43.769492", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 1000" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "c99f544a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.102128Z", + "iopub.status.busy": "2021-12-02T04:34:44.101540Z", + "iopub.status.idle": "2021-12-02T04:34:44.103376Z", + "shell.execute_reply": "2021-12-02T04:34:44.103817Z" + }, + "papermill": { + "duration": 0.13265, + "end_time": "2021-12-02T04:34:44.103967", + "exception": false, + "start_time": "2021-12-02T04:34:43.971317", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 24, + "id": "9721b048", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.515965Z", + "iopub.status.busy": "2021-12-02T04:34:44.515462Z", + "iopub.status.idle": "2021-12-02T04:34:50.907897Z", + "shell.execute_reply": "2021-12-02T04:34:50.908362Z" + }, + "papermill": { + "duration": 6.496377, + "end_time": "2021-12-02T04:34:50.908503", + "exception": false, + "start_time": "2021-12-02T04:34:44.412126", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "34.9 ms ± 177 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "fd0f4dd6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.122723Z", + "iopub.status.busy": "2021-12-02T04:34:51.122244Z", + "iopub.status.idle": "2021-12-02T04:34:51.194219Z", + "shell.execute_reply": "2021-12-02T04:34:51.193813Z" + }, + "papermill": { + "duration": 0.179898, + "end_time": "2021-12-02T04:34:51.194319", + "exception": false, + "start_time": "2021-12-02T04:34:51.014421", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 20:45:58 Samples: 34\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.038 CPU time: 0.105\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-63ceb30c-8d12-4cd8-83db-a05c2fd442ee.json\n", + "\n", + "0.037 ../../../tmp/ipykernel_2035782/2991494264.py:2\n", + "`- 0.037 func ../../../tmp/ipykernel_2035782/2287242631.py:1\n", + " `- 0.037 ccc ccc/coef/impl.py:308\n", + " |- 0.030 compute_coef ccc/coef/impl.py:494\n", + " | `- 0.030 cdist_func 
ccc/coef/impl.py:487\n", + " | `- 0.030 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | |- 0.015 ccc/coef/impl.py:211\n", + " | | |- 0.014 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " | | | [9 frames hidden] concurrent, threading, \n", + " | | | 0.011 lock.acquire \n", + " | | `- 0.001 [self] ccc/coef/impl.py\n", + " | |- 0.011 as_completed concurrent/futures/_base.py:201\n", + " | | [4 frames hidden] concurrent, threading, \n", + " | | 0.011 lock.acquire \n", + " | |- 0.002 [self] ccc/coef/impl.py\n", + " | `- 0.001 ccc/utils/utility_functions.py:117\n", + " |- 0.004 result_iterator concurrent/futures/_base.py:602\n", + " | [4 frames hidden] concurrent, threading, \n", + " |- 0.003 function.map concurrent/futures/_base.py:573\n", + " | [9 frames hidden] concurrent, threading, \n", + " `- 0.001 [self] ccc/coef/impl.py\n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "f6b9adcf-1da4-4496-b05f-47952fd80d7f", + "metadata": { + "papermill": { + "duration": 0.099861, + "end_time": "2021-12-02T04:34:51.394504", + "exception": false, + "start_time": "2021-12-02T04:34:51.294643", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` large" + ] + }, + { + "cell_type": "markdown", + "id": "8f2e407c", + "metadata": { + "papermill": { + "duration": 0.098767, + "end_time": "2021-12-02T04:34:51.593072", + "exception": false, + "start_time": "2021-12-02T04:34:51.494305", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50000`" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "c522396e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.793258Z", + "iopub.status.busy": "2021-12-02T04:34:51.792385Z", + "iopub.status.idle": "2021-12-02T04:34:51.794710Z", + "shell.execute_reply": "2021-12-02T04:34:51.795054Z" + }, + 
"papermill": { + "duration": 0.103749, + "end_time": "2021-12-02T04:34:51.795172", + "exception": false, + "start_time": "2021-12-02T04:34:51.691423", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50000" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "a5e536cc", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.996666Z", + "iopub.status.busy": "2021-12-02T04:34:51.996120Z", + "iopub.status.idle": "2021-12-02T04:34:51.999329Z", + "shell.execute_reply": "2021-12-02T04:34:51.998896Z" + }, + "papermill": { + "duration": 0.104554, + "end_time": "2021-12-02T04:34:51.999423", + "exception": false, + "start_time": "2021-12-02T04:34:51.894869", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "91470f64", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:52.408864Z", + "iopub.status.busy": "2021-12-02T04:34:52.408310Z", + "iopub.status.idle": "2021-12-02T04:35:28.383597Z", + "shell.execute_reply": "2021-12-02T04:35:28.384010Z" + }, + "papermill": { + "duration": 36.078113, + "end_time": "2021-12-02T04:35:28.384125", + "exception": false, + "start_time": "2021-12-02T04:34:52.306012", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "827 ms ± 4.45 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "4de4e0b0", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:28.594907Z", + "iopub.status.busy": "2021-12-02T04:35:28.594395Z", + "iopub.status.idle": "2021-12-02T04:35:30.850079Z", + "shell.execute_reply": "2021-12-02T04:35:30.850466Z" + }, + "papermill": { + "duration": 2.36055, + "end_time": "2021-12-02T04:35:30.850581", + "exception": false, + "start_time": "2021-12-02T04:35:28.490031", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 20:46:12 Samples: 136\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.838 CPU time: 3.210\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-63ceb30c-8d12-4cd8-83db-a05c2fd442ee.json\n", + "\n", + "0.838 ../../../tmp/ipykernel_2035782/2991494264.py:2\n", + "`- 0.838 func ../../../tmp/ipykernel_2035782/2287242631.py:1\n", + " |- 0.827 ccc ccc/coef/impl.py:308\n", + " | |- 0.464 result_iterator concurrent/futures/_base.py:602\n", + " | | [4 frames hidden] concurrent, threading, \n", + " | | 0.464 lock.acquire \n", + " | |- 0.344 compute_coef ccc/coef/impl.py:494\n", + " | | `- 0.344 cdist_func ccc/coef/impl.py:487\n", + " | | `- 0.344 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | | |- 0.330 as_completed concurrent/futures/_base.py:201\n", + " | | | [4 frames hidden] concurrent, threading, \n", + " | | | 0.329 lock.acquire \n", + " | | `- 0.014 ccc/coef/impl.py:211\n", + " | | `- 0.012 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " | | [3 frames hidden] concurrent, threading\n", + " | `- 0.014 [self] ccc/coef/impl.py\n", + " `- 0.011 [self] 
../../../tmp/ipykernel_2035782/2287242631.py\n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "b0c07894", + "metadata": { + "papermill": { + "duration": 0.100749, + "end_time": "2021-12-02T04:35:31.053465", + "exception": false, + "start_time": "2021-12-02T04:35:30.952716", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100000`" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "7fb2ab2e-a6de-412b-9540-d00f8641290e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.262806Z", + "iopub.status.busy": "2021-12-02T04:35:31.262280Z", + "iopub.status.idle": "2021-12-02T04:35:31.264634Z", + "shell.execute_reply": "2021-12-02T04:35:31.264124Z" + }, + "papermill": { + "duration": 0.110468, + "end_time": "2021-12-02T04:35:31.264738", + "exception": false, + "start_time": "2021-12-02T04:35:31.154270", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100000" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "81765e91", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.472530Z", + "iopub.status.busy": "2021-12-02T04:35:31.472083Z", + "iopub.status.idle": "2021-12-02T04:35:31.475862Z", + "shell.execute_reply": "2021-12-02T04:35:31.475424Z" + }, + "papermill": { + "duration": 0.107444, + "end_time": "2021-12-02T04:35:31.476016", + "exception": false, + "start_time": "2021-12-02T04:35:31.368572", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "aca57100", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.892045Z", + "iopub.status.busy": "2021-12-02T04:35:31.891417Z", + "iopub.status.idle": 
"2021-12-02T04:36:46.681345Z", + "shell.execute_reply": "2021-12-02T04:36:46.681695Z" + }, + "papermill": { + "duration": 74.896315, + "end_time": "2021-12-02T04:36:46.681806", + "exception": false, + "start_time": "2021-12-02T04:35:31.785491", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.67 s ± 7.68 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "b9c25f30", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:36:46.889124Z", + "iopub.status.busy": "2021-12-02T04:36:46.888441Z", + "iopub.status.idle": "2021-12-02T04:36:51.544747Z", + "shell.execute_reply": "2021-12-02T04:36:51.544315Z" + }, + "papermill": { + "duration": 4.761869, + "end_time": "2021-12-02T04:36:51.544839", + "exception": false, + "start_time": "2021-12-02T04:36:46.782970", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 20:46:40 Samples: 148\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 1.674 CPU time: 6.470\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-63ceb30c-8d12-4cd8-83db-a05c2fd442ee.json\n", + "\n", + "1.673 ../../../tmp/ipykernel_2035782/2991494264.py:2\n", + "`- 1.673 func ../../../tmp/ipykernel_2035782/2287242631.py:1\n", + " |- 1.654 ccc ccc/coef/impl.py:308\n", + " | |- 0.952 result_iterator concurrent/futures/_base.py:602\n", + " | | [4 frames hidden] concurrent, threading, \n", + " | | 0.952 lock.acquire \n", + " | |- 0.668 compute_coef ccc/coef/impl.py:494\n", + " | | `- 0.668 cdist_func ccc/coef/impl.py:487\n", + " | | `- 0.668 cdist_parts_parallel ccc/coef/impl.py:193\n", + " 
| | |- 0.650 as_completed concurrent/futures/_base.py:201\n", + " | | | [4 frames hidden] concurrent, threading, \n", + " | | | 0.650 lock.acquire \n", + " | | `- 0.018 ccc/coef/impl.py:211\n", + " | `- 0.020 [self] ccc/coef/impl.py\n", + " `- 0.019 [self] ../../../tmp/ipykernel_2035782/2287242631.py\n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aa76596-f126-4ba8-8bba-4d31225e0e5d", + "metadata": { + "papermill": { + "duration": 0.102208, + "end_time": "2021-12-02T04:36:51.764154", + "exception": false, + "start_time": "2021-12-02T04:36:51.661946", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all,-execution,-papermill,-trusted" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "papermill": { + "default_parameters": {}, + "duration": 167.355469, + "end_time": "2021-12-02T04:36:52.275357", + "environment_variables": {}, + "exception": null, + "input_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.ipynb", + "output_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.run.ipynb", + "parameters": {}, + "start_time": "2021-12-02T04:34:04.919888", + "version": "2.3.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/others/10_gpu_ari_profiling/02_cpu_version_more_profilers/00_6_core_pyinstrument.ipynb b/nbs/others/10_gpu_ari_profiling/02_cpu_version_more_profilers/00_6_core_pyinstrument.ipynb new file mode 100644 index 00000000..4fc26b4f --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/02_cpu_version_more_profilers/00_6_core_pyinstrument.ipynb @@ -0,0 +1,1453 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "36c65ddb-8246-4b0c-a231-68a373acc2cf", + "metadata": { + 
"papermill": { + "duration": 0.095646, + "end_time": "2021-12-02T04:34:06.384775", + "exception": false, + "start_time": "2021-12-02T04:34:06.289129", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Description" + ] + }, + { + "cell_type": "markdown", + "id": "337633a8-d03e-4509-b89d-f8daee598958", + "metadata": { + "papermill": { + "duration": 0.091407, + "end_time": "2021-12-02T04:34:06.566258", + "exception": false, + "start_time": "2021-12-02T04:34:06.474851", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Single-threaded version, force to use `cdist_parts_basic` to see the overhead of `air` " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5c010e62", + "metadata": {}, + "outputs": [], + "source": [ + "# Set core count to 1\n", + "# %env CM_N_JOBS=1" + ] + }, + { + "cell_type": "markdown", + "id": "63c2b099-cc44-4fe2-93d1-40336e0a8466", + "metadata": { + "papermill": { + "duration": 0.09056, + "end_time": "2021-12-02T04:34:06.747753", + "exception": false, + "start_time": "2021-12-02T04:34:06.657193", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Remove pycache dir" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "960c4ff0-2a73-4eaa-97d6-3269102233eb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:06.942330Z", + "iopub.status.busy": "2021-12-02T04:34:06.941822Z", + "iopub.status.idle": "2021-12-02T04:34:07.534005Z", + "shell.execute_reply": "2021-12-02T04:34:07.531916Z" + }, + "papermill": { + "duration": 0.696141, + "end_time": "2021-12-02T04:34:07.534420", + "exception": false, + "start_time": "2021-12-02T04:34:06.838279", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/haoyu/_database/projs/ccc-gpu\n" + ] + } + ], + "source": [ + "!echo ${CODE_DIR}" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": 
"e18db58a-316a-445b-a376-8b2ec18e08d8", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:07.733067Z", + "iopub.status.busy": "2021-12-02T04:34:07.732593Z", + "iopub.status.idle": "2021-12-02T04:34:08.445221Z", + "shell.execute_reply": "2021-12-02T04:34:08.443302Z" + }, + "papermill": { + "duration": 0.817887, + "end_time": "2021-12-02T04:34:08.445598", + "exception": false, + "start_time": "2021-12-02T04:34:07.627711", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/sklearn/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/scipy/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/coef/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/utils/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/pytorch/__pycache__\n" + ] + } + ], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b54099b0-a990-4bbd-bcbd-e206eb0f0f0e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:08.668700Z", + "iopub.status.busy": "2021-12-02T04:34:08.668226Z", + "iopub.status.idle": "2021-12-02T04:34:09.290588Z", + "shell.execute_reply": "2021-12-02T04:34:09.288752Z" + }, + "papermill": { + "duration": 0.719545, + "end_time": "2021-12-02T04:34:09.290958", + "exception": false, + "start_time": "2021-12-02T04:34:08.571413", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -prune -exec rm -rf {} \\;" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7a9a8098-8160-46bf-8d83-bc398cbe2382", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:09.513928Z", + "iopub.status.busy": 
"2021-12-02T04:34:09.513465Z", + "iopub.status.idle": "2021-12-02T04:34:10.120602Z", + "shell.execute_reply": "2021-12-02T04:34:10.118684Z" + }, + "papermill": { + "duration": 0.704384, + "end_time": "2021-12-02T04:34:10.120972", + "exception": false, + "start_time": "2021-12-02T04:34:09.416588", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "markdown", + "id": "c2251313-41ac-46fd-a845-0f209689ecf6", + "metadata": { + "papermill": { + "duration": 0.093051, + "end_time": "2021-12-02T04:34:10.338738", + "exception": false, + "start_time": "2021-12-02T04:34:10.245687", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Modules" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "987ef5f1-be49-4a6c-a4f4-b24a0a2094cb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:10.528319Z", + "iopub.status.busy": "2021-12-02T04:34:10.527833Z", + "iopub.status.idle": "2021-12-02T04:34:10.845421Z", + "shell.execute_reply": "2021-12-02T04:34:10.845020Z" + }, + "papermill": { + "duration": 0.414249, + "end_time": "2021-12-02T04:34:10.845518", + "exception": false, + "start_time": "2021-12-02T04:34:10.431269", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "from ccc.coef import ccc\n", + "%load_ext pyinstrument" + ] + }, + { + "cell_type": "markdown", + "id": "24399ccb-d33d-4bad-9baf-638c9c56feb2", + "metadata": { + "papermill": { + "duration": 0.095359, + "end_time": "2021-12-02T04:34:11.037941", + "exception": false, + "start_time": "2021-12-02T04:34:10.942582", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Settings" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c609cefa-f513-4cf8-9573-367744e31c5f", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.224902Z", + 
"iopub.status.busy": "2021-12-02T04:34:11.224429Z", + "iopub.status.idle": "2021-12-02T04:34:11.226352Z", + "shell.execute_reply": "2021-12-02T04:34:11.226694Z" + }, + "papermill": { + "duration": 0.097459, + "end_time": "2021-12-02T04:34:11.226809", + "exception": false, + "start_time": "2021-12-02T04:34:11.129350", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_REPS = 10\n", + "np.random.seed(0)" + ] + }, + { + "cell_type": "markdown", + "id": "6fd3067b-a4f7-475e-9575-20246934537d", + "metadata": { + "papermill": { + "duration": 0.092004, + "end_time": "2021-12-02T04:34:11.602824", + "exception": false, + "start_time": "2021-12-02T04:34:11.510820", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "02e0507c-43ff-4693-8a3b-8ccd8f23168c", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.790398Z", + "iopub.status.busy": "2021-12-02T04:34:11.789933Z", + "iopub.status.idle": "2021-12-02T04:34:20.454299Z", + "shell.execute_reply": "2021-12-02T04:34:20.453925Z" + }, + "papermill": { + "duration": 8.760307, + "end_time": "2021-12-02T04:34:20.454396", + "exception": false, + "start_time": "2021-12-02T04:34:11.694089", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.15625" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let numba compile all the code before profiling\n", + "ccc(np.random.rand(10), np.random.rand(10))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "bc5236a7", + "metadata": {}, + "outputs": [], + "source": [ + "def func(n_reps=N_REPS):\n", + " for i in range(n_reps):\n", + " ccc(x, y, n_jobs=6)" + ] + }, + { + "cell_type": "markdown", + "id": "8549179d-1517-4a40-9a51-b95dc02d0fcc", + "metadata": { + "papermill": { + "duration": 0.092955, + "end_time": 
"2021-12-02T04:34:20.640996", + "exception": false, + "start_time": "2021-12-02T04:34:20.548041", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` small" + ] + }, + { + "cell_type": "markdown", + "id": "13ba811b", + "metadata": { + "papermill": { + "duration": 0.092926, + "end_time": "2021-12-02T04:34:20.826776", + "exception": false, + "start_time": "2021-12-02T04:34:20.733850", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50`" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "68064f0b", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.026855Z", + "iopub.status.busy": "2021-12-02T04:34:21.026243Z", + "iopub.status.idle": "2021-12-02T04:34:21.028562Z", + "shell.execute_reply": "2021-12-02T04:34:21.027971Z" + }, + "papermill": { + "duration": 0.107158, + "end_time": "2021-12-02T04:34:21.028689", + "exception": false, + "start_time": "2021-12-02T04:34:20.921531", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6f2ff213-d6b7-458f-acbf-8c73dd497a2a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.225799Z", + "iopub.status.busy": "2021-12-02T04:34:21.225350Z", + "iopub.status.idle": "2021-12-02T04:34:21.226800Z", + "shell.execute_reply": "2021-12-02T04:34:21.227141Z" + }, + "papermill": { + "duration": 0.09836, + "end_time": "2021-12-02T04:34:21.227254", + "exception": false, + "start_time": "2021-12-02T04:34:21.128894", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c0abc65a-2f3c-4476-9c2a-f8d7753b75e6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.614745Z", + "iopub.status.busy": 
"2021-12-02T04:34:21.614223Z", + "iopub.status.idle": "2021-12-02T04:34:33.925655Z", + "shell.execute_reply": "2021-12-02T04:34:33.925209Z" + }, + "papermill": { + "duration": 12.410211, + "end_time": "2021-12-02T04:34:33.925764", + "exception": false, + "start_time": "2021-12-02T04:34:21.515553", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "14.7 ms ± 36.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e80a7b02-310f-4bd9-90d5-2a41186db39e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.125850Z", + "iopub.status.busy": "2021-12-02T04:34:34.125263Z", + "iopub.status.idle": "2021-12-02T04:34:34.148529Z", + "shell.execute_reply": "2021-12-02T04:34:34.148886Z" + }, + "papermill": { + "duration": 0.124845, + "end_time": "2021-12-02T04:34:34.149006", + "exception": false, + "start_time": "2021-12-02T04:34:34.024161", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 10:15:03 Samples: 17\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.018 CPU time: 0.024\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-725a4d21-ead1-453d-ad0d-e5e7411475f1.json\n", + "\n", + "0.017 ../../../tmp/ipykernel_2706800/2991494264.py:2\n", + "`- 0.017 func ../../../tmp/ipykernel_2706800/1687822962.py:1\n", + " `- 0.017 ccc ccc/coef/impl.py:308\n", + " |- 0.014 compute_coef ccc/coef/impl.py:494\n", + " | `- 0.014 cdist_func ccc/coef/impl.py:487\n", + " | `- 0.014 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | |- 0.010 ccc/coef/impl.py:211\n", + " | | `- 0.010 
ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " | | [9 frames hidden] concurrent, threading, , ip...\n", + " | | 0.008 lock.acquire \n", + " | `- 0.004 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 0.004 lock.acquire \n", + " |- 0.002 function.map concurrent/futures/_base.py:573\n", + " | [11 frames hidden] concurrent, threading, , ip...\n", + " `- 0.001 ThreadPoolExecutor.__exit__ concurrent/futures/_base.py:636\n", + " [5 frames hidden] concurrent, threading, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "2548440c", + "metadata": { + "papermill": { + "duration": 0.094866, + "end_time": "2021-12-02T04:34:34.338010", + "exception": false, + "start_time": "2021-12-02T04:34:34.243144", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100`" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "ddb768d7-9b74-424c-bdb9-9e52b9e5b181", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.530550Z", + "iopub.status.busy": "2021-12-02T04:34:34.530085Z", + "iopub.status.idle": "2021-12-02T04:34:34.531559Z", + "shell.execute_reply": "2021-12-02T04:34:34.531910Z" + }, + "papermill": { + "duration": 0.099465, + "end_time": "2021-12-02T04:34:34.532025", + "exception": false, + "start_time": "2021-12-02T04:34:34.432560", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "161cf922-41f7-4ced-af1f-e3d25b36b200", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.723894Z", + "iopub.status.busy": "2021-12-02T04:34:34.723405Z", + "iopub.status.idle": "2021-12-02T04:34:34.725017Z", + "shell.execute_reply": "2021-12-02T04:34:34.725362Z" + }, + "papermill": { + "duration": 
0.099541, + "end_time": "2021-12-02T04:34:34.725479", + "exception": false, + "start_time": "2021-12-02T04:34:34.625938", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0a7f2b1f-4b87-4dde-a46b-2acec5ac93ba", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:35.120950Z", + "iopub.status.busy": "2021-12-02T04:34:35.120480Z", + "iopub.status.idle": "2021-12-02T04:34:37.984893Z", + "shell.execute_reply": "2021-12-02T04:34:37.985354Z" + }, + "papermill": { + "duration": 2.971389, + "end_time": "2021-12-02T04:34:37.985494", + "exception": false, + "start_time": "2021-12-02T04:34:35.014105", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "23.6 ms ± 224 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "74acbe27-9807-4f26-8b7e-b74d0f745b63", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.190471Z", + "iopub.status.busy": "2021-12-02T04:34:38.189361Z", + "iopub.status.idle": "2021-12-02T04:34:38.227298Z", + "shell.execute_reply": "2021-12-02T04:34:38.227692Z" + }, + "papermill": { + "duration": 0.13744, + "end_time": "2021-12-02T04:34:38.227812", + "exception": false, + "start_time": "2021-12-02T04:34:38.090372", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 10:15:05 Samples: 24\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.027 CPU time: 0.045\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f 
/home/haoyu/.local/share/jupyter/runtime/kernel-725a4d21-ead1-453d-ad0d-e5e7411475f1.json\n", + "\n", + "0.026 ../../../tmp/ipykernel_2706800/2991494264.py:2\n", + "`- 0.026 func ../../../tmp/ipykernel_2706800/1687822962.py:1\n", + " `- 0.026 ccc ccc/coef/impl.py:308\n", + " |- 0.022 compute_coef ccc/coef/impl.py:494\n", + " | `- 0.022 cdist_func ccc/coef/impl.py:487\n", + " | `- 0.022 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | |- 0.011 ccc/coef/impl.py:211\n", + " | | `- 0.011 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " | | [8 frames hidden] concurrent, threading, \n", + " | | 0.009 lock.acquire \n", + " | |- 0.010 as_completed concurrent/futures/_base.py:201\n", + " | | [7 frames hidden] concurrent, threading, \n", + " | | 0.009 lock.acquire \n", + " | `- 0.001 [self] ccc/coef/impl.py\n", + " |- 0.003 function.map concurrent/futures/_base.py:573\n", + " | [9 frames hidden] concurrent, threading, \n", + " `- 0.001 [self] ccc/coef/impl.py\n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "611ff8e1", + "metadata": { + "papermill": { + "duration": 0.09562, + "end_time": "2021-12-02T04:34:38.420643", + "exception": false, + "start_time": "2021-12-02T04:34:38.325023", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=500`" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "4bcf4b42", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.617318Z", + "iopub.status.busy": "2021-12-02T04:34:38.616836Z", + "iopub.status.idle": "2021-12-02T04:34:38.618469Z", + "shell.execute_reply": "2021-12-02T04:34:38.618817Z" + }, + "papermill": { + "duration": 0.101998, + "end_time": "2021-12-02T04:34:38.618933", + "exception": false, + "start_time": "2021-12-02T04:34:38.516935", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + 
"N_SAMPLES = 500" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "0bf2f21e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.813589Z", + "iopub.status.busy": "2021-12-02T04:34:38.813117Z", + "iopub.status.idle": "2021-12-02T04:34:38.815386Z", + "shell.execute_reply": "2021-12-02T04:34:38.815020Z" + }, + "papermill": { + "duration": 0.100616, + "end_time": "2021-12-02T04:34:38.815484", + "exception": false, + "start_time": "2021-12-02T04:34:38.714868", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "cbde4ce6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:39.216775Z", + "iopub.status.busy": "2021-12-02T04:34:39.216014Z", + "iopub.status.idle": "2021-12-02T04:34:43.223179Z", + "shell.execute_reply": "2021-12-02T04:34:43.223640Z" + }, + "papermill": { + "duration": 4.108094, + "end_time": "2021-12-02T04:34:43.223780", + "exception": false, + "start_time": "2021-12-02T04:34:39.115686", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "27.6 ms ± 229 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "1250547e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.427169Z", + "iopub.status.busy": "2021-12-02T04:34:43.426487Z", + "iopub.status.idle": "2021-12-02T04:34:43.474288Z", + "shell.execute_reply": "2021-12-02T04:34:43.473832Z" + }, + "papermill": { + "duration": 0.148908, + "end_time": "2021-12-02T04:34:43.474385", + "exception": false, + "start_time": "2021-12-02T04:34:43.325477", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 10:15:08 Samples: 29\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.031 CPU time: 0.073\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-725a4d21-ead1-453d-ad0d-e5e7411475f1.json\n", + "\n", + "0.031 ../../../tmp/ipykernel_2706800/2991494264.py:2\n", + "`- 0.031 func ../../../tmp/ipykernel_2706800/1687822962.py:1\n", + " |- 0.030 ccc ccc/coef/impl.py:308\n", + " | |- 0.023 compute_coef ccc/coef/impl.py:494\n", + " | | `- 0.023 cdist_func ccc/coef/impl.py:487\n", + " | | `- 0.023 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | | |- 0.015 as_completed concurrent/futures/_base.py:201\n", + " | | | [8 frames hidden] concurrent, threading, \n", + " | | | 0.014 lock.acquire \n", + " | | `- 0.008 ccc/coef/impl.py:211\n", + " | | `- 0.008 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " | | [13 frames hidden] concurrent, threading, , ip...\n", + " | |- 0.002 result_iterator concurrent/futures/_base.py:602\n", + " | | [4 frames hidden] concurrent, threading, \n", + " | |- 0.002 [self] ccc/coef/impl.py\n", + " | |- 0.001 function.map concurrent/futures/_base.py:573\n", + " | 
| [8 frames hidden] concurrent, threading, \n", + " | `- 0.001 ThreadPoolExecutor.__exit__ concurrent/futures/_base.py:636\n", + " | [3 frames hidden] concurrent, threading\n", + " `- 0.001 [self] ../../../tmp/ipykernel_2706800/1687822962.py\n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "6853c300", + "metadata": { + "papermill": { + "duration": 0.09707, + "end_time": "2021-12-02T04:34:43.669776", + "exception": false, + "start_time": "2021-12-02T04:34:43.572706", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=1000`" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "f77e8490", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.871037Z", + "iopub.status.busy": "2021-12-02T04:34:43.870573Z", + "iopub.status.idle": "2021-12-02T04:34:43.872099Z", + "shell.execute_reply": "2021-12-02T04:34:43.872442Z" + }, + "papermill": { + "duration": 0.103066, + "end_time": "2021-12-02T04:34:43.872558", + "exception": false, + "start_time": "2021-12-02T04:34:43.769492", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 1000" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "c99f544a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.102128Z", + "iopub.status.busy": "2021-12-02T04:34:44.101540Z", + "iopub.status.idle": "2021-12-02T04:34:44.103376Z", + "shell.execute_reply": "2021-12-02T04:34:44.103817Z" + }, + "papermill": { + "duration": 0.13265, + "end_time": "2021-12-02T04:34:44.103967", + "exception": false, + "start_time": "2021-12-02T04:34:43.971317", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "9721b048", + 
"metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.515965Z", + "iopub.status.busy": "2021-12-02T04:34:44.515462Z", + "iopub.status.idle": "2021-12-02T04:34:50.907897Z", + "shell.execute_reply": "2021-12-02T04:34:50.908362Z" + }, + "papermill": { + "duration": 6.496377, + "end_time": "2021-12-02T04:34:50.908503", + "exception": false, + "start_time": "2021-12-02T04:34:44.412126", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "32.4 ms ± 153 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "fd0f4dd6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.122723Z", + "iopub.status.busy": "2021-12-02T04:34:51.122244Z", + "iopub.status.idle": "2021-12-02T04:34:51.194219Z", + "shell.execute_reply": "2021-12-02T04:34:51.193813Z" + }, + "papermill": { + "duration": 0.179898, + "end_time": "2021-12-02T04:34:51.194319", + "exception": false, + "start_time": "2021-12-02T04:34:51.014421", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 10:15:11 Samples: 31\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.036 CPU time: 0.098\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-725a4d21-ead1-453d-ad0d-e5e7411475f1.json\n", + "\n", + "0.035 ../../../tmp/ipykernel_2706800/2991494264.py:2\n", + "`- 0.035 func ../../../tmp/ipykernel_2706800/1687822962.py:1\n", + " `- 0.035 ccc ccc/coef/impl.py:308\n", + " |- 0.023 compute_coef ccc/coef/impl.py:494\n", + " | `- 0.023 cdist_func ccc/coef/impl.py:487\n", + " | `- 0.023 cdist_parts_parallel ccc/coef/impl.py:193\n", 
+ " | |- 0.018 as_completed concurrent/futures/_base.py:201\n", + " | | [4 frames hidden] concurrent, threading, \n", + " | | 0.018 lock.acquire \n", + " | `- 0.005 ccc/coef/impl.py:211\n", + " | |- 0.004 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " | | [10 frames hidden] concurrent, threading, \n", + " | `- 0.001 [self] ccc/coef/impl.py\n", + " |- 0.011 result_iterator concurrent/futures/_base.py:602\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 0.011 lock.acquire \n", + " `- 0.001 function.map concurrent/futures/_base.py:573\n", + " [8 frames hidden] concurrent, ipykernel, threading\n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "f6b9adcf-1da4-4496-b05f-47952fd80d7f", + "metadata": { + "papermill": { + "duration": 0.099861, + "end_time": "2021-12-02T04:34:51.394504", + "exception": false, + "start_time": "2021-12-02T04:34:51.294643", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` large" + ] + }, + { + "cell_type": "markdown", + "id": "8f2e407c", + "metadata": { + "papermill": { + "duration": 0.098767, + "end_time": "2021-12-02T04:34:51.593072", + "exception": false, + "start_time": "2021-12-02T04:34:51.494305", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50000`" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "c522396e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.793258Z", + "iopub.status.busy": "2021-12-02T04:34:51.792385Z", + "iopub.status.idle": "2021-12-02T04:34:51.794710Z", + "shell.execute_reply": "2021-12-02T04:34:51.795054Z" + }, + "papermill": { + "duration": 0.103749, + "end_time": "2021-12-02T04:34:51.795172", + "exception": false, + "start_time": "2021-12-02T04:34:51.691423", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + 
"N_SAMPLES = 50000" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "a5e536cc", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.996666Z", + "iopub.status.busy": "2021-12-02T04:34:51.996120Z", + "iopub.status.idle": "2021-12-02T04:34:51.999329Z", + "shell.execute_reply": "2021-12-02T04:34:51.998896Z" + }, + "papermill": { + "duration": 0.104554, + "end_time": "2021-12-02T04:34:51.999423", + "exception": false, + "start_time": "2021-12-02T04:34:51.894869", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "91470f64", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:52.408864Z", + "iopub.status.busy": "2021-12-02T04:34:52.408310Z", + "iopub.status.idle": "2021-12-02T04:35:28.383597Z", + "shell.execute_reply": "2021-12-02T04:35:28.384010Z" + }, + "papermill": { + "duration": 36.078113, + "end_time": "2021-12-02T04:35:28.384125", + "exception": false, + "start_time": "2021-12-02T04:34:52.306012", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.02 s ± 2.82 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "4de4e0b0", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:28.594907Z", + "iopub.status.busy": "2021-12-02T04:35:28.594395Z", + "iopub.status.idle": "2021-12-02T04:35:30.850079Z", + "shell.execute_reply": "2021-12-02T04:35:30.850466Z" + }, + "papermill": { + "duration": 2.36055, + "end_time": "2021-12-02T04:35:30.850581", + "exception": false, + "start_time": "2021-12-02T04:35:28.490031", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 10:15:27 Samples: 125\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 1.034 CPU time: 3.193\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-725a4d21-ead1-453d-ad0d-e5e7411475f1.json\n", + "\n", + "1.033 ../../../tmp/ipykernel_2706800/2991494264.py:2\n", + "`- 1.033 func ../../../tmp/ipykernel_2706800/1687822962.py:1\n", + " `- 1.022 ccc ccc/coef/impl.py:308\n", + " |- 0.539 compute_coef ccc/coef/impl.py:494\n", + " | `- 0.539 cdist_func ccc/coef/impl.py:487\n", + " | `- 0.539 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | `- 0.533 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 0.531 lock.acquire \n", + " `- 0.471 result_iterator concurrent/futures/_base.py:602\n", + " [4 frames hidden] concurrent, threading, \n", + " 0.470 lock.acquire \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "b0c07894", + "metadata": { + "papermill": { + "duration": 0.100749, + "end_time": "2021-12-02T04:35:31.053465", + 
"exception": false, + "start_time": "2021-12-02T04:35:30.952716", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100000`" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "7fb2ab2e-a6de-412b-9540-d00f8641290e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.262806Z", + "iopub.status.busy": "2021-12-02T04:35:31.262280Z", + "iopub.status.idle": "2021-12-02T04:35:31.264634Z", + "shell.execute_reply": "2021-12-02T04:35:31.264124Z" + }, + "papermill": { + "duration": 0.110468, + "end_time": "2021-12-02T04:35:31.264738", + "exception": false, + "start_time": "2021-12-02T04:35:31.154270", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100000" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "81765e91", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.472530Z", + "iopub.status.busy": "2021-12-02T04:35:31.472083Z", + "iopub.status.idle": "2021-12-02T04:35:31.475862Z", + "shell.execute_reply": "2021-12-02T04:35:31.475424Z" + }, + "papermill": { + "duration": 0.107444, + "end_time": "2021-12-02T04:35:31.476016", + "exception": false, + "start_time": "2021-12-02T04:35:31.368572", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "aca57100", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.892045Z", + "iopub.status.busy": "2021-12-02T04:35:31.891417Z", + "iopub.status.idle": "2021-12-02T04:36:46.681345Z", + "shell.execute_reply": "2021-12-02T04:36:46.681695Z" + }, + "papermill": { + "duration": 74.896315, + "end_time": "2021-12-02T04:36:46.681806", + "exception": false, + "start_time": "2021-12-02T04:35:31.785491", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "2.07 s ± 8.15 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "b9c25f30", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:36:46.889124Z", + "iopub.status.busy": "2021-12-02T04:36:46.888441Z", + "iopub.status.idle": "2021-12-02T04:36:51.544747Z", + "shell.execute_reply": "2021-12-02T04:36:51.544315Z" + }, + "papermill": { + "duration": 4.761869, + "end_time": "2021-12-02T04:36:51.544839", + "exception": false, + "start_time": "2021-12-02T04:36:46.782970", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 10:16:02 Samples: 141\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 2.064 CPU time: 6.393\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-725a4d21-ead1-453d-ad0d-e5e7411475f1.json\n", + "\n", + "2.063 ../../../tmp/ipykernel_2706800/2991494264.py:2\n", + "`- 2.063 func ../../../tmp/ipykernel_2706800/1687822962.py:1\n", + " `- 2.046 ccc ccc/coef/impl.py:308\n", + " |- 1.058 compute_coef ccc/coef/impl.py:494\n", + " | `- 1.058 cdist_func ccc/coef/impl.py:487\n", + " | `- 1.058 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | `- 1.048 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 1.048 lock.acquire \n", + " `- 0.953 result_iterator concurrent/futures/_base.py:602\n", + " [4 frames hidden] concurrent, threading, \n", + " 0.953 lock.acquire \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"2aa76596-f126-4ba8-8bba-4d31225e0e5d", + "metadata": { + "papermill": { + "duration": 0.102208, + "end_time": "2021-12-02T04:36:51.764154", + "exception": false, + "start_time": "2021-12-02T04:36:51.661946", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all,-execution,-papermill,-trusted" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "papermill": { + "default_parameters": {}, + "duration": 167.355469, + "end_time": "2021-12-02T04:36:52.275357", + "environment_variables": {}, + "exception": null, + "input_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.ipynb", + "output_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.run.ipynb", + "parameters": {}, + "start_time": "2021-12-02T04:34:04.919888", + "version": "2.3.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/others/10_gpu_ari_profiling/03_gpu_version_more_profilers/00_cuda_ari_12_cpu_Core copy 3.ipynb b/nbs/others/10_gpu_ari_profiling/03_gpu_version_more_profilers/00_cuda_ari_12_cpu_Core copy 3.ipynb new file mode 100644 index 00000000..6f28ec6f --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/03_gpu_version_more_profilers/00_cuda_ari_12_cpu_Core copy 3.ipynb @@ -0,0 +1,1503 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "36c65ddb-8246-4b0c-a231-68a373acc2cf", + "metadata": { + "papermill": { + "duration": 0.095646, + "end_time": "2021-12-02T04:34:06.384775", + "exception": false, + "start_time": "2021-12-02T04:34:06.289129", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Description" + ] + }, + { + "cell_type": "markdown", + "id": "337633a8-d03e-4509-b89d-f8daee598958", + "metadata": { + "papermill": { + "duration": 0.091407, + "end_time": "2021-12-02T04:34:06.566258", + "exception": false, + "start_time": 
"2021-12-02T04:34:06.474851", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Single-threaded version, force to use `cdist_parts_basic` to see the overhead of `ari` " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5c010e62", + "metadata": {}, + "outputs": [], + "source": [ + "# Set core count to 1\n", + "# %env CM_N_JOBS=1" + ] + }, + { + "cell_type": "markdown", + "id": "63c2b099-cc44-4fe2-93d1-40336e0a8466", + "metadata": { + "papermill": { + "duration": 0.09056, + "end_time": "2021-12-02T04:34:06.747753", + "exception": false, + "start_time": "2021-12-02T04:34:06.657193", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Remove pycache dir" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "960c4ff0-2a73-4eaa-97d6-3269102233eb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:06.942330Z", + "iopub.status.busy": "2021-12-02T04:34:06.941822Z", + "iopub.status.idle": "2021-12-02T04:34:07.534005Z", + "shell.execute_reply": "2021-12-02T04:34:07.531916Z" + }, + "papermill": { + "duration": 0.696141, + "end_time": "2021-12-02T04:34:07.534420", + "exception": false, + "start_time": "2021-12-02T04:34:06.838279", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/haoyu/_database/projs/ccc-gpu\n" + ] + } + ], + "source": [ + "!echo ${CODE_DIR}" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e18db58a-316a-445b-a376-8b2ec18e08d8", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:07.733067Z", + "iopub.status.busy": "2021-12-02T04:34:07.732593Z", + "iopub.status.idle": "2021-12-02T04:34:08.445221Z", + "shell.execute_reply": "2021-12-02T04:34:08.443302Z" + }, + "papermill": { + "duration": 0.817887, + "end_time": "2021-12-02T04:34:08.445598", + "exception": false, + "start_time": "2021-12-02T04:34:07.627711", + "status": "completed" + }, + "tags": []
+ }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/sklearn/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/scipy/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/coef/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/utils/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/pytorch/__pycache__\n" + ] + } + ], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b54099b0-a990-4bbd-bcbd-e206eb0f0f0e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:08.668700Z", + "iopub.status.busy": "2021-12-02T04:34:08.668226Z", + "iopub.status.idle": "2021-12-02T04:34:09.290588Z", + "shell.execute_reply": "2021-12-02T04:34:09.288752Z" + }, + "papermill": { + "duration": 0.719545, + "end_time": "2021-12-02T04:34:09.290958", + "exception": false, + "start_time": "2021-12-02T04:34:08.571413", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -prune -exec rm -rf {} \\;" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7a9a8098-8160-46bf-8d83-bc398cbe2382", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:09.513928Z", + "iopub.status.busy": "2021-12-02T04:34:09.513465Z", + "iopub.status.idle": "2021-12-02T04:34:10.120602Z", + "shell.execute_reply": "2021-12-02T04:34:10.118684Z" + }, + "papermill": { + "duration": 0.704384, + "end_time": "2021-12-02T04:34:10.120972", + "exception": false, + "start_time": "2021-12-02T04:34:09.416588", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "markdown", + "id": 
"c2251313-41ac-46fd-a845-0f209689ecf6", + "metadata": { + "papermill": { + "duration": 0.093051, + "end_time": "2021-12-02T04:34:10.338738", + "exception": false, + "start_time": "2021-12-02T04:34:10.245687", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Modules" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "987ef5f1-be49-4a6c-a4f4-b24a0a2094cb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:10.528319Z", + "iopub.status.busy": "2021-12-02T04:34:10.527833Z", + "iopub.status.idle": "2021-12-02T04:34:10.845421Z", + "shell.execute_reply": "2021-12-02T04:34:10.845020Z" + }, + "papermill": { + "duration": 0.414249, + "end_time": "2021-12-02T04:34:10.845518", + "exception": false, + "start_time": "2021-12-02T04:34:10.431269", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2024-06-28 11:41:36,716 - numba.cuda.cudadrv.driver] INFO: init\n", + "[2024-06-28 11:41:36,811 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:41:36,812 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:41:36,812 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:41:36,813 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:41:36,813 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 16 bytes\n", + "[2024-06-28 11:41:36,889 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 16 bytes\n", + "[2024-06-28 11:41:36,890 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-28 11:41:36,892 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-28 11:41:36,892 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 
11:41:36,892 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:41:36,893 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:41:36,893 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:41:36,893 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:41:36,893 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:41:36,894 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:41:36,894 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 16 bytes\n", + "[2024-06-28 11:41:36,894 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 16 bytes\n", + "[2024-06-28 11:41:36,895 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-28 11:41:36,895 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-28 11:41:36,896 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:41:36,896 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:41:36,896 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:41:36,896 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:41:36,897 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:41:36,897 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-28 11:41:36,899 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:41:36,899 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-28 11:41:36,899 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:41:36,900 - numba.cuda.cudadrv.driver] INFO: 
add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:41:36,900 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:41:36,900 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:41:36,901 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-28 11:41:36,902 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-28 11:41:36,902 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:41:36,902 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:41:36,902 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-28 11:41:36,902 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:41:36,903 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-28 11:41:36,903 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:41:36,903 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:41:36,903 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:41:36,904 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:41:36,904 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-28 11:41:36,904 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-28 11:41:36,904 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-28 11:41:36,904 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:41:36,905 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:41:36,905 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 36 bytes\n", + 
"[2024-06-28 11:41:36,905 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 36 bytes\n", + "[2024-06-28 11:41:36,906 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 8 bytes\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from ccc.coef import ccc\n", + "%load_ext pyinstrument" + ] + }, + { + "cell_type": "markdown", + "id": "24399ccb-d33d-4bad-9baf-638c9c56feb2", + "metadata": { + "papermill": { + "duration": 0.095359, + "end_time": "2021-12-02T04:34:11.037941", + "exception": false, + "start_time": "2021-12-02T04:34:10.942582", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Settings" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c609cefa-f513-4cf8-9573-367744e31c5f", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.224902Z", + "iopub.status.busy": "2021-12-02T04:34:11.224429Z", + "iopub.status.idle": "2021-12-02T04:34:11.226352Z", + "shell.execute_reply": "2021-12-02T04:34:11.226694Z" + }, + "papermill": { + "duration": 0.097459, + "end_time": "2021-12-02T04:34:11.226809", + "exception": false, + "start_time": "2021-12-02T04:34:11.129350", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_REPS = 10\n", + "np.random.seed(0)\n", + "\n", + "# Disable numba logging\n", + "import logging\n", + "\n", + "numba_logger = logging.getLogger('numba')\n", + "numba_logger.setLevel(logging.WARNING)" + ] + }, + { + "cell_type": "markdown", + "id": "6fd3067b-a4f7-475e-9575-20246934537d", + "metadata": { + "papermill": { + "duration": 0.092004, + "end_time": "2021-12-02T04:34:11.602824", + "exception": false, + "start_time": "2021-12-02T04:34:11.510820", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "02e0507c-43ff-4693-8a3b-8ccd8f23168c", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.790398Z", + 
"iopub.status.busy": "2021-12-02T04:34:11.789933Z", + "iopub.status.idle": "2021-12-02T04:34:20.454299Z", + "shell.execute_reply": "2021-12-02T04:34:20.453925Z" + }, + "papermill": { + "duration": 8.760307, + "end_time": "2021-12-02T04:34:20.454396", + "exception": false, + "start_time": "2021-12-02T04:34:11.694089", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.15625" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let numba compile all the code before profiling\n", + "ccc(np.random.rand(10), np.random.rand(10))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "bc5236a7", + "metadata": {}, + "outputs": [], + "source": [ + "def func(n_reps=N_REPS):\n", + " for i in range(n_reps):\n", + " ccc(x, y, n_jobs=12)" + ] + }, + { + "cell_type": "markdown", + "id": "8549179d-1517-4a40-9a51-b95dc02d0fcc", + "metadata": { + "papermill": { + "duration": 0.092955, + "end_time": "2021-12-02T04:34:20.640996", + "exception": false, + "start_time": "2021-12-02T04:34:20.548041", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` small" + ] + }, + { + "cell_type": "markdown", + "id": "13ba811b", + "metadata": { + "papermill": { + "duration": 0.092926, + "end_time": "2021-12-02T04:34:20.826776", + "exception": false, + "start_time": "2021-12-02T04:34:20.733850", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50`" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "68064f0b", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.026855Z", + "iopub.status.busy": "2021-12-02T04:34:21.026243Z", + "iopub.status.idle": "2021-12-02T04:34:21.028562Z", + "shell.execute_reply": "2021-12-02T04:34:21.027971Z" + }, + "papermill": { + "duration": 0.107158, + "end_time": "2021-12-02T04:34:21.028689", + "exception": false, + "start_time": 
"2021-12-02T04:34:20.921531", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6f2ff213-d6b7-458f-acbf-8c73dd497a2a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.225799Z", + "iopub.status.busy": "2021-12-02T04:34:21.225350Z", + "iopub.status.idle": "2021-12-02T04:34:21.226800Z", + "shell.execute_reply": "2021-12-02T04:34:21.227141Z" + }, + "papermill": { + "duration": 0.09836, + "end_time": "2021-12-02T04:34:21.227254", + "exception": false, + "start_time": "2021-12-02T04:34:21.128894", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c0abc65a-2f3c-4476-9c2a-f8d7753b75e6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.614745Z", + "iopub.status.busy": "2021-12-02T04:34:21.614223Z", + "iopub.status.idle": "2021-12-02T04:34:33.925655Z", + "shell.execute_reply": "2021-12-02T04:34:33.925209Z" + }, + "papermill": { + "duration": 12.410211, + "end_time": "2021-12-02T04:34:33.925764", + "exception": false, + "start_time": "2021-12-02T04:34:21.515553", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "851 ms ± 23 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e80a7b02-310f-4bd9-90d5-2a41186db39e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.125850Z", + "iopub.status.busy": "2021-12-02T04:34:34.125263Z", + "iopub.status.idle": "2021-12-02T04:34:34.148529Z", + "shell.execute_reply": "2021-12-02T04:34:34.148886Z" + }, + "papermill": { + "duration": 0.124845, + "end_time": "2021-12-02T04:34:34.149006", + "exception": false, + "start_time": "2021-12-02T04:34:34.024161", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:41:50 Samples: 50\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.877 CPU time: 1.054\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-be61aca2-fe7c-48f0-8dd2-214e4d957c13.json\n", + "\n", + "0.877 ../../../tmp/ipykernel_2854891/2991494264.py:2\n", + "`- 0.877 func ../../../tmp/ipykernel_2854891/2380024278.py:1\n", + " `- 0.876 ccc ccc/coef/impl.py:308\n", + " `- 0.871 compute_coef ccc/coef/impl.py:494\n", + " `- 0.871 cdist_func ccc/coef/impl.py:487\n", + " `- 0.870 cdist_parts_parallel ccc/coef/impl.py:193\n", + " |- 0.844 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 0.844 lock.acquire \n", + " `- 0.026 ccc/coef/impl.py:211\n", + " `- 0.026 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " [6 frames hidden] concurrent, threading, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "2548440c", + "metadata": { + "papermill": { + "duration": 0.094866, + 
"end_time": "2021-12-02T04:34:34.338010", + "exception": false, + "start_time": "2021-12-02T04:34:34.243144", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100`" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "ddb768d7-9b74-424c-bdb9-9e52b9e5b181", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.530550Z", + "iopub.status.busy": "2021-12-02T04:34:34.530085Z", + "iopub.status.idle": "2021-12-02T04:34:34.531559Z", + "shell.execute_reply": "2021-12-02T04:34:34.531910Z" + }, + "papermill": { + "duration": 0.099465, + "end_time": "2021-12-02T04:34:34.532025", + "exception": false, + "start_time": "2021-12-02T04:34:34.432560", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "161cf922-41f7-4ced-af1f-e3d25b36b200", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.723894Z", + "iopub.status.busy": "2021-12-02T04:34:34.723405Z", + "iopub.status.idle": "2021-12-02T04:34:34.725017Z", + "shell.execute_reply": "2021-12-02T04:34:34.725362Z" + }, + "papermill": { + "duration": 0.099541, + "end_time": "2021-12-02T04:34:34.725479", + "exception": false, + "start_time": "2021-12-02T04:34:34.625938", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0a7f2b1f-4b87-4dde-a46b-2acec5ac93ba", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:35.120950Z", + "iopub.status.busy": "2021-12-02T04:34:35.120480Z", + "iopub.status.idle": "2021-12-02T04:34:37.984893Z", + "shell.execute_reply": "2021-12-02T04:34:37.985354Z" + }, + "papermill": { + "duration": 2.971389, + "end_time": "2021-12-02T04:34:37.985494", + "exception": false, + "start_time": "2021-12-02T04:34:35.014105", + 
"status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.96 s ± 20.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "74acbe27-9807-4f26-8b7e-b74d0f745b63", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.190471Z", + "iopub.status.busy": "2021-12-02T04:34:38.189361Z", + "iopub.status.idle": "2021-12-02T04:34:38.227298Z", + "shell.execute_reply": "2021-12-02T04:34:38.227692Z" + }, + "papermill": { + "duration": 0.13744, + "end_time": "2021-12-02T04:34:38.227812", + "exception": false, + "start_time": "2021-12-02T04:34:38.090372", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:42:23 Samples: 81\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 2.064 CPU time: 2.494\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-be61aca2-fe7c-48f0-8dd2-214e4d957c13.json\n", + "\n", + "2.064 ../../../tmp/ipykernel_2854891/2991494264.py:2\n", + "`- 2.064 func ../../../tmp/ipykernel_2854891/2380024278.py:1\n", + " `- 2.064 ccc ccc/coef/impl.py:308\n", + " `- 2.058 compute_coef ccc/coef/impl.py:494\n", + " `- 2.058 cdist_func ccc/coef/impl.py:487\n", + " `- 2.058 cdist_parts_parallel ccc/coef/impl.py:193\n", + " |- 2.009 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 2.009 lock.acquire \n", + " `- 0.048 ccc/coef/impl.py:211\n", + " `- 0.048 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " [6 frames hidden] concurrent, threading, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + 
"source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "611ff8e1", + "metadata": { + "papermill": { + "duration": 0.09562, + "end_time": "2021-12-02T04:34:38.420643", + "exception": false, + "start_time": "2021-12-02T04:34:38.325023", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=500`" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "4bcf4b42", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.617318Z", + "iopub.status.busy": "2021-12-02T04:34:38.616836Z", + "iopub.status.idle": "2021-12-02T04:34:38.618469Z", + "shell.execute_reply": "2021-12-02T04:34:38.618817Z" + }, + "papermill": { + "duration": 0.101998, + "end_time": "2021-12-02T04:34:38.618933", + "exception": false, + "start_time": "2021-12-02T04:34:38.516935", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 500" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "0bf2f21e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.813589Z", + "iopub.status.busy": "2021-12-02T04:34:38.813117Z", + "iopub.status.idle": "2021-12-02T04:34:38.815386Z", + "shell.execute_reply": "2021-12-02T04:34:38.815020Z" + }, + "papermill": { + "duration": 0.100616, + "end_time": "2021-12-02T04:34:38.815484", + "exception": false, + "start_time": "2021-12-02T04:34:38.714868", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "cbde4ce6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:39.216775Z", + "iopub.status.busy": "2021-12-02T04:34:39.216014Z", + "iopub.status.idle": "2021-12-02T04:34:43.223179Z", + "shell.execute_reply": "2021-12-02T04:34:43.223640Z" + }, + "papermill": { + "duration": 4.108094, + "end_time": 
"2021-12-02T04:34:43.223780", + "exception": false, + "start_time": "2021-12-02T04:34:39.115686", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.96 s ± 46.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "1250547e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.427169Z", + "iopub.status.busy": "2021-12-02T04:34:43.426487Z", + "iopub.status.idle": "2021-12-02T04:34:43.474288Z", + "shell.execute_reply": "2021-12-02T04:34:43.473832Z" + }, + "papermill": { + "duration": 0.148908, + "end_time": "2021-12-02T04:34:43.474385", + "exception": false, + "start_time": "2021-12-02T04:34:43.325477", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:42:57 Samples: 83\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 2.112 CPU time: 2.552\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-be61aca2-fe7c-48f0-8dd2-214e4d957c13.json\n", + "\n", + "2.112 ../../../tmp/ipykernel_2854891/2991494264.py:2\n", + "`- 2.112 func ../../../tmp/ipykernel_2854891/2380024278.py:1\n", + " `- 2.112 ccc ccc/coef/impl.py:308\n", + " `- 2.102 compute_coef ccc/coef/impl.py:494\n", + " `- 2.101 cdist_func ccc/coef/impl.py:487\n", + " `- 2.101 cdist_parts_parallel ccc/coef/impl.py:193\n", + " |- 2.057 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 2.056 lock.acquire \n", + " `- 0.043 ccc/coef/impl.py:211\n", + " `- 0.043 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " [6 frames hidden] concurrent, threading, \n", + "\n" 
+ ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "6853c300", + "metadata": { + "papermill": { + "duration": 0.09707, + "end_time": "2021-12-02T04:34:43.669776", + "exception": false, + "start_time": "2021-12-02T04:34:43.572706", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=1000`" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "f77e8490", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.871037Z", + "iopub.status.busy": "2021-12-02T04:34:43.870573Z", + "iopub.status.idle": "2021-12-02T04:34:43.872099Z", + "shell.execute_reply": "2021-12-02T04:34:43.872442Z" + }, + "papermill": { + "duration": 0.103066, + "end_time": "2021-12-02T04:34:43.872558", + "exception": false, + "start_time": "2021-12-02T04:34:43.769492", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 1000" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "c99f544a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.102128Z", + "iopub.status.busy": "2021-12-02T04:34:44.101540Z", + "iopub.status.idle": "2021-12-02T04:34:44.103376Z", + "shell.execute_reply": "2021-12-02T04:34:44.103817Z" + }, + "papermill": { + "duration": 0.13265, + "end_time": "2021-12-02T04:34:44.103967", + "exception": false, + "start_time": "2021-12-02T04:34:43.971317", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "9721b048", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.515965Z", + "iopub.status.busy": "2021-12-02T04:34:44.515462Z", + "iopub.status.idle": "2021-12-02T04:34:50.907897Z", + "shell.execute_reply": "2021-12-02T04:34:50.908362Z" + }, + 
"papermill": { + "duration": 6.496377, + "end_time": "2021-12-02T04:34:50.908503", + "exception": false, + "start_time": "2021-12-02T04:34:44.412126", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.99 s ± 18.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "fd0f4dd6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.122723Z", + "iopub.status.busy": "2021-12-02T04:34:51.122244Z", + "iopub.status.idle": "2021-12-02T04:34:51.194219Z", + "shell.execute_reply": "2021-12-02T04:34:51.193813Z" + }, + "papermill": { + "duration": 0.179898, + "end_time": "2021-12-02T04:34:51.194319", + "exception": false, + "start_time": "2021-12-02T04:34:51.014421", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:43:31 Samples: 84\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 2.073 CPU time: 2.539\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-be61aca2-fe7c-48f0-8dd2-214e4d957c13.json\n", + "\n", + "2.073 ../../../tmp/ipykernel_2854891/2991494264.py:2\n", + "`- 2.073 func ../../../tmp/ipykernel_2854891/2380024278.py:1\n", + " `- 2.073 ccc ccc/coef/impl.py:308\n", + " `- 2.062 compute_coef ccc/coef/impl.py:494\n", + " `- 2.062 cdist_func ccc/coef/impl.py:487\n", + " `- 2.062 cdist_parts_parallel ccc/coef/impl.py:193\n", + " |- 2.019 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 2.019 lock.acquire \n", + " `- 0.042 ccc/coef/impl.py:211\n", + " `- 0.042 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " 
[6 frames hidden] concurrent, threading, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "f6b9adcf-1da4-4496-b05f-47952fd80d7f", + "metadata": { + "papermill": { + "duration": 0.099861, + "end_time": "2021-12-02T04:34:51.394504", + "exception": false, + "start_time": "2021-12-02T04:34:51.294643", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` large" + ] + }, + { + "cell_type": "markdown", + "id": "8f2e407c", + "metadata": { + "papermill": { + "duration": 0.098767, + "end_time": "2021-12-02T04:34:51.593072", + "exception": false, + "start_time": "2021-12-02T04:34:51.494305", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50000`" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "c522396e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.793258Z", + "iopub.status.busy": "2021-12-02T04:34:51.792385Z", + "iopub.status.idle": "2021-12-02T04:34:51.794710Z", + "shell.execute_reply": "2021-12-02T04:34:51.795054Z" + }, + "papermill": { + "duration": 0.103749, + "end_time": "2021-12-02T04:34:51.795172", + "exception": false, + "start_time": "2021-12-02T04:34:51.691423", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50000" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "a5e536cc", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.996666Z", + "iopub.status.busy": "2021-12-02T04:34:51.996120Z", + "iopub.status.idle": "2021-12-02T04:34:51.999329Z", + "shell.execute_reply": "2021-12-02T04:34:51.998896Z" + }, + "papermill": { + "duration": 0.104554, + "end_time": "2021-12-02T04:34:51.999423", + "exception": false, + "start_time": "2021-12-02T04:34:51.894869", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x 
= np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "91470f64", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:52.408864Z", + "iopub.status.busy": "2021-12-02T04:34:52.408310Z", + "iopub.status.idle": "2021-12-02T04:35:28.383597Z", + "shell.execute_reply": "2021-12-02T04:35:28.384010Z" + }, + "papermill": { + "duration": 36.078113, + "end_time": "2021-12-02T04:35:28.384125", + "exception": false, + "start_time": "2021-12-02T04:34:52.306012", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3.88 s ± 15.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "4de4e0b0", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:28.594907Z", + "iopub.status.busy": "2021-12-02T04:35:28.594395Z", + "iopub.status.idle": "2021-12-02T04:35:30.850079Z", + "shell.execute_reply": "2021-12-02T04:35:30.850466Z" + }, + "papermill": { + "duration": 2.36055, + "end_time": "2021-12-02T04:35:30.850581", + "exception": false, + "start_time": "2021-12-02T04:35:28.490031", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:44:36 Samples: 100\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 3.886 CPU time: 6.037\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-be61aca2-fe7c-48f0-8dd2-214e4d957c13.json\n", + "\n", + "3.885 ../../../tmp/ipykernel_2854891/2991494264.py:2\n", + "`- 3.885 func ../../../tmp/ipykernel_2854891/2380024278.py:1\n", + " `- 3.876 ccc ccc/coef/impl.py:308\n", + " |- 3.402 
compute_coef ccc/coef/impl.py:494\n", + " | `- 3.401 cdist_func ccc/coef/impl.py:487\n", + " | `- 3.400 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | `- 3.372 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 3.371 lock.acquire \n", + " `- 0.464 result_iterator concurrent/futures/_base.py:602\n", + " [4 frames hidden] concurrent, threading, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "b0c07894", + "metadata": { + "papermill": { + "duration": 0.100749, + "end_time": "2021-12-02T04:35:31.053465", + "exception": false, + "start_time": "2021-12-02T04:35:30.952716", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100000`" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "7fb2ab2e-a6de-412b-9540-d00f8641290e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.262806Z", + "iopub.status.busy": "2021-12-02T04:35:31.262280Z", + "iopub.status.idle": "2021-12-02T04:35:31.264634Z", + "shell.execute_reply": "2021-12-02T04:35:31.264124Z" + }, + "papermill": { + "duration": 0.110468, + "end_time": "2021-12-02T04:35:31.264738", + "exception": false, + "start_time": "2021-12-02T04:35:31.154270", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100000" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "81765e91", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.472530Z", + "iopub.status.busy": "2021-12-02T04:35:31.472083Z", + "iopub.status.idle": "2021-12-02T04:35:31.475862Z", + "shell.execute_reply": "2021-12-02T04:35:31.475424Z" + }, + "papermill": { + "duration": 0.107444, + "end_time": "2021-12-02T04:35:31.476016", + "exception": false, + "start_time": "2021-12-02T04:35:31.368572", + "status": "completed" + }, + "tags": 
[] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "aca57100", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.892045Z", + "iopub.status.busy": "2021-12-02T04:35:31.891417Z", + "iopub.status.idle": "2021-12-02T04:36:46.681345Z", + "shell.execute_reply": "2021-12-02T04:36:46.681695Z" + }, + "papermill": { + "duration": 74.896315, + "end_time": "2021-12-02T04:36:46.681806", + "exception": false, + "start_time": "2021-12-02T04:35:31.785491", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "6.42 s ± 51 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "b9c25f30", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:36:46.889124Z", + "iopub.status.busy": "2021-12-02T04:36:46.888441Z", + "iopub.status.idle": "2021-12-02T04:36:51.544747Z", + "shell.execute_reply": "2021-12-02T04:36:51.544315Z" + }, + "papermill": { + "duration": 4.761869, + "end_time": "2021-12-02T04:36:51.544839", + "exception": false, + "start_time": "2021-12-02T04:36:46.782970", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:46:22 Samples: 113\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 6.543 CPU time: 10.126\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-be61aca2-fe7c-48f0-8dd2-214e4d957c13.json\n", + "\n", + "6.543 ../../../tmp/ipykernel_2854891/2991494264.py:2\n", + "`- 6.543 func ../../../tmp/ipykernel_2854891/2380024278.py:1\n", + " `- 6.532 
ccc ccc/coef/impl.py:308\n", + " |- 5.548 compute_coef ccc/coef/impl.py:494\n", + " | `- 5.547 cdist_func ccc/coef/impl.py:487\n", + " | `- 5.547 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | `- 5.530 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 5.527 lock.acquire \n", + " `- 0.960 result_iterator concurrent/futures/_base.py:602\n", + " [4 frames hidden] concurrent, threading, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aa76596-f126-4ba8-8bba-4d31225e0e5d", + "metadata": { + "papermill": { + "duration": 0.102208, + "end_time": "2021-12-02T04:36:51.764154", + "exception": false, + "start_time": "2021-12-02T04:36:51.661946", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all,-execution,-papermill,-trusted" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + }, + "papermill": { + "default_parameters": {}, + "duration": 167.355469, + "end_time": "2021-12-02T04:36:52.275357", + "environment_variables": {}, + "exception": null, + "input_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.ipynb", + "output_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.run.ipynb", + "parameters": {}, + "start_time": "2021-12-02T04:34:04.919888", + "version": "2.3.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git 
a/nbs/others/10_gpu_ari_profiling/03_gpu_version_more_profilers/00_cuda_ari_1_cpu_Core.ipynb b/nbs/others/10_gpu_ari_profiling/03_gpu_version_more_profilers/00_cuda_ari_1_cpu_Core.ipynb new file mode 100644 index 00000000..05d983a5 --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/03_gpu_version_more_profilers/00_cuda_ari_1_cpu_Core.ipynb @@ -0,0 +1,1424 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "36c65ddb-8246-4b0c-a231-68a373acc2cf", + "metadata": { + "papermill": { + "duration": 0.095646, + "end_time": "2021-12-02T04:34:06.384775", + "exception": false, + "start_time": "2021-12-02T04:34:06.289129", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Description" + ] + }, + { + "cell_type": "markdown", + "id": "337633a8-d03e-4509-b89d-f8daee598958", + "metadata": { + "papermill": { + "duration": 0.091407, + "end_time": "2021-12-02T04:34:06.566258", + "exception": false, + "start_time": "2021-12-02T04:34:06.474851", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Single-threaded version, force to use `cdist_parts_basic` to see the overhead of `air` " + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "5c010e62", + "metadata": {}, + "outputs": [], + "source": [ + "# Set core count to 1\n", + "# %env CM_N_JOBS=1" + ] + }, + { + "cell_type": "markdown", + "id": "63c2b099-cc44-4fe2-93d1-40336e0a8466", + "metadata": { + "papermill": { + "duration": 0.09056, + "end_time": "2021-12-02T04:34:06.747753", + "exception": false, + "start_time": "2021-12-02T04:34:06.657193", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Remove pycache dir" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "960c4ff0-2a73-4eaa-97d6-3269102233eb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:06.942330Z", + "iopub.status.busy": "2021-12-02T04:34:06.941822Z", + "iopub.status.idle": "2021-12-02T04:34:07.534005Z", + "shell.execute_reply": "2021-12-02T04:34:07.531916Z" 
+ }, + "papermill": { + "duration": 0.696141, + "end_time": "2021-12-02T04:34:07.534420", + "exception": false, + "start_time": "2021-12-02T04:34:06.838279", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/haoyu/_database/projs/ccc-gpu\n" + ] + } + ], + "source": [ + "!echo ${CODE_DIR}" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "e18db58a-316a-445b-a376-8b2ec18e08d8", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:07.733067Z", + "iopub.status.busy": "2021-12-02T04:34:07.732593Z", + "iopub.status.idle": "2021-12-02T04:34:08.445221Z", + "shell.execute_reply": "2021-12-02T04:34:08.443302Z" + }, + "papermill": { + "duration": 0.817887, + "end_time": "2021-12-02T04:34:08.445598", + "exception": false, + "start_time": "2021-12-02T04:34:07.627711", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "b54099b0-a990-4bbd-bcbd-e206eb0f0f0e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:08.668700Z", + "iopub.status.busy": "2021-12-02T04:34:08.668226Z", + "iopub.status.idle": "2021-12-02T04:34:09.290588Z", + "shell.execute_reply": "2021-12-02T04:34:09.288752Z" + }, + "papermill": { + "duration": 0.719545, + "end_time": "2021-12-02T04:34:09.290958", + "exception": false, + "start_time": "2021-12-02T04:34:08.571413", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -prune -exec rm -rf {} \\;" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "7a9a8098-8160-46bf-8d83-bc398cbe2382", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:09.513928Z", + "iopub.status.busy": "2021-12-02T04:34:09.513465Z", + "iopub.status.idle": 
"2021-12-02T04:34:10.120602Z", + "shell.execute_reply": "2021-12-02T04:34:10.118684Z" + }, + "papermill": { + "duration": 0.704384, + "end_time": "2021-12-02T04:34:10.120972", + "exception": false, + "start_time": "2021-12-02T04:34:09.416588", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "markdown", + "id": "c2251313-41ac-46fd-a845-0f209689ecf6", + "metadata": { + "papermill": { + "duration": 0.093051, + "end_time": "2021-12-02T04:34:10.338738", + "exception": false, + "start_time": "2021-12-02T04:34:10.245687", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Modules" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "987ef5f1-be49-4a6c-a4f4-b24a0a2094cb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:10.528319Z", + "iopub.status.busy": "2021-12-02T04:34:10.527833Z", + "iopub.status.idle": "2021-12-02T04:34:10.845421Z", + "shell.execute_reply": "2021-12-02T04:34:10.845020Z" + }, + "papermill": { + "duration": 0.414249, + "end_time": "2021-12-02T04:34:10.845518", + "exception": false, + "start_time": "2021-12-02T04:34:10.431269", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The pyinstrument extension is already loaded. 
To reload it, use:\n", + " %reload_ext pyinstrument\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from ccc.coef import ccc\n", + "%load_ext pyinstrument" + ] + }, + { + "cell_type": "markdown", + "id": "24399ccb-d33d-4bad-9baf-638c9c56feb2", + "metadata": { + "papermill": { + "duration": 0.095359, + "end_time": "2021-12-02T04:34:11.037941", + "exception": false, + "start_time": "2021-12-02T04:34:10.942582", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Settings" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "c609cefa-f513-4cf8-9573-367744e31c5f", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.224902Z", + "iopub.status.busy": "2021-12-02T04:34:11.224429Z", + "iopub.status.idle": "2021-12-02T04:34:11.226352Z", + "shell.execute_reply": "2021-12-02T04:34:11.226694Z" + }, + "papermill": { + "duration": 0.097459, + "end_time": "2021-12-02T04:34:11.226809", + "exception": false, + "start_time": "2021-12-02T04:34:11.129350", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_REPS = 10\n", + "np.random.seed(0)\n", + "\n", + "# Disable numba logging\n", + "import logging\n", + "\n", + "numba_logger = logging.getLogger('numba')\n", + "numba_logger.setLevel(logging.WARNING)" + ] + }, + { + "cell_type": "markdown", + "id": "6fd3067b-a4f7-475e-9575-20246934537d", + "metadata": { + "papermill": { + "duration": 0.092004, + "end_time": "2021-12-02T04:34:11.602824", + "exception": false, + "start_time": "2021-12-02T04:34:11.510820", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "02e0507c-43ff-4693-8a3b-8ccd8f23168c", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.790398Z", + "iopub.status.busy": "2021-12-02T04:34:11.789933Z", + "iopub.status.idle": "2021-12-02T04:34:20.454299Z", + "shell.execute_reply": "2021-12-02T04:34:20.453925Z" + }, + 
"papermill": { + "duration": 8.760307, + "end_time": "2021-12-02T04:34:20.454396", + "exception": false, + "start_time": "2021-12-02T04:34:11.694089", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.15625" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let numba compile all the code before profiling\n", + "ccc(np.random.rand(10), np.random.rand(10))" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "bc5236a7", + "metadata": {}, + "outputs": [], + "source": [ + "def func(n_reps=N_REPS):\n", + " for i in range(n_reps):\n", + " ccc(x, y, n_jobs=1)" + ] + }, + { + "cell_type": "markdown", + "id": "8549179d-1517-4a40-9a51-b95dc02d0fcc", + "metadata": { + "papermill": { + "duration": 0.092955, + "end_time": "2021-12-02T04:34:20.640996", + "exception": false, + "start_time": "2021-12-02T04:34:20.548041", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` small" + ] + }, + { + "cell_type": "markdown", + "id": "13ba811b", + "metadata": { + "papermill": { + "duration": 0.092926, + "end_time": "2021-12-02T04:34:20.826776", + "exception": false, + "start_time": "2021-12-02T04:34:20.733850", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50`" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "68064f0b", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.026855Z", + "iopub.status.busy": "2021-12-02T04:34:21.026243Z", + "iopub.status.idle": "2021-12-02T04:34:21.028562Z", + "shell.execute_reply": "2021-12-02T04:34:21.027971Z" + }, + "papermill": { + "duration": 0.107158, + "end_time": "2021-12-02T04:34:21.028689", + "exception": false, + "start_time": "2021-12-02T04:34:20.921531", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50" + ] + }, + { + "cell_type": "code", + "execution_count": 
66, + "id": "6f2ff213-d6b7-458f-acbf-8c73dd497a2a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.225799Z", + "iopub.status.busy": "2021-12-02T04:34:21.225350Z", + "iopub.status.idle": "2021-12-02T04:34:21.226800Z", + "shell.execute_reply": "2021-12-02T04:34:21.227141Z" + }, + "papermill": { + "duration": 0.09836, + "end_time": "2021-12-02T04:34:21.227254", + "exception": false, + "start_time": "2021-12-02T04:34:21.128894", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "c0abc65a-2f3c-4476-9c2a-f8d7753b75e6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.614745Z", + "iopub.status.busy": "2021-12-02T04:34:21.614223Z", + "iopub.status.idle": "2021-12-02T04:34:33.925655Z", + "shell.execute_reply": "2021-12-02T04:34:33.925209Z" + }, + "papermill": { + "duration": 12.410211, + "end_time": "2021-12-02T04:34:33.925764", + "exception": false, + "start_time": "2021-12-02T04:34:21.515553", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "347 ms ± 4.68 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "e80a7b02-310f-4bd9-90d5-2a41186db39e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.125850Z", + "iopub.status.busy": "2021-12-02T04:34:34.125263Z", + "iopub.status.idle": "2021-12-02T04:34:34.148529Z", + "shell.execute_reply": "2021-12-02T04:34:34.148886Z" + }, + "papermill": { + "duration": 0.124845, + "end_time": "2021-12-02T04:34:34.149006", + "exception": false, + "start_time": "2021-12-02T04:34:34.024161", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:08:29 Samples: 64\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.357 CPU time: 0.361\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-7232d267-3e1d-4c9d-aeeb-5254ca0449dc.json\n", + "\n", + "0.357 ../../../tmp/ipykernel_2806375/2991494264.py:2\n", + "`- 0.357 func ../../../tmp/ipykernel_2806375/2661685993.py:1\n", + " `- 0.357 ccc ccc/coef/impl.py:308\n", + " `- 0.357 compute_coef ccc/coef/impl.py:494\n", + " `- 0.357 cdist_func ccc/coef/impl.py:487\n", + " `- 0.357 cdist_parts_parallel ccc/coef/impl.py:193\n", + " |- 0.353 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 0.353 lock.acquire \n", + " `- 0.004 ccc/coef/impl.py:211\n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "2548440c", + "metadata": { + "papermill": { + "duration": 0.094866, + "end_time": "2021-12-02T04:34:34.338010", + "exception": false, + "start_time": "2021-12-02T04:34:34.243144", + "status": "completed" 
+ }, + "tags": [] + }, + "source": [ + "## `n_samples=100`" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "ddb768d7-9b74-424c-bdb9-9e52b9e5b181", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.530550Z", + "iopub.status.busy": "2021-12-02T04:34:34.530085Z", + "iopub.status.idle": "2021-12-02T04:34:34.531559Z", + "shell.execute_reply": "2021-12-02T04:34:34.531910Z" + }, + "papermill": { + "duration": 0.099465, + "end_time": "2021-12-02T04:34:34.532025", + "exception": false, + "start_time": "2021-12-02T04:34:34.432560", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "161cf922-41f7-4ced-af1f-e3d25b36b200", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.723894Z", + "iopub.status.busy": "2021-12-02T04:34:34.723405Z", + "iopub.status.idle": "2021-12-02T04:34:34.725017Z", + "shell.execute_reply": "2021-12-02T04:34:34.725362Z" + }, + "papermill": { + "duration": 0.099541, + "end_time": "2021-12-02T04:34:34.725479", + "exception": false, + "start_time": "2021-12-02T04:34:34.625938", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "0a7f2b1f-4b87-4dde-a46b-2acec5ac93ba", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:35.120950Z", + "iopub.status.busy": "2021-12-02T04:34:35.120480Z", + "iopub.status.idle": "2021-12-02T04:34:37.984893Z", + "shell.execute_reply": "2021-12-02T04:34:37.985354Z" + }, + "papermill": { + "duration": 2.971389, + "end_time": "2021-12-02T04:34:37.985494", + "exception": false, + "start_time": "2021-12-02T04:34:35.014105", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "767 ms 
± 7.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "74acbe27-9807-4f26-8b7e-b74d0f745b63", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.190471Z", + "iopub.status.busy": "2021-12-02T04:34:38.189361Z", + "iopub.status.idle": "2021-12-02T04:34:38.227298Z", + "shell.execute_reply": "2021-12-02T04:34:38.227692Z" + }, + "papermill": { + "duration": 0.13744, + "end_time": "2021-12-02T04:34:38.227812", + "exception": false, + "start_time": "2021-12-02T04:34:38.090372", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:08:42 Samples: 96\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.783 CPU time: 0.788\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-7232d267-3e1d-4c9d-aeeb-5254ca0449dc.json\n", + "\n", + "0.783 ../../../tmp/ipykernel_2806375/2991494264.py:2\n", + "`- 0.783 func ../../../tmp/ipykernel_2806375/2661685993.py:1\n", + " `- 0.783 ccc ccc/coef/impl.py:308\n", + " `- 0.779 compute_coef ccc/coef/impl.py:494\n", + " `- 0.779 cdist_func ccc/coef/impl.py:487\n", + " `- 0.779 cdist_parts_parallel ccc/coef/impl.py:193\n", + " `- 0.778 as_completed concurrent/futures/_base.py:201\n", + " [4 frames hidden] concurrent, threading, \n", + " 0.777 lock.acquire \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "611ff8e1", + "metadata": { + "papermill": { + "duration": 0.09562, + "end_time": "2021-12-02T04:34:38.420643", + "exception": false, + "start_time": "2021-12-02T04:34:38.325023", + "status": "completed" + }, + 
"tags": [] + }, + "source": [ + "## `n_samples=500`" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "4bcf4b42", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.617318Z", + "iopub.status.busy": "2021-12-02T04:34:38.616836Z", + "iopub.status.idle": "2021-12-02T04:34:38.618469Z", + "shell.execute_reply": "2021-12-02T04:34:38.618817Z" + }, + "papermill": { + "duration": 0.101998, + "end_time": "2021-12-02T04:34:38.618933", + "exception": false, + "start_time": "2021-12-02T04:34:38.516935", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 500" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "0bf2f21e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.813589Z", + "iopub.status.busy": "2021-12-02T04:34:38.813117Z", + "iopub.status.idle": "2021-12-02T04:34:38.815386Z", + "shell.execute_reply": "2021-12-02T04:34:38.815020Z" + }, + "papermill": { + "duration": 0.100616, + "end_time": "2021-12-02T04:34:38.815484", + "exception": false, + "start_time": "2021-12-02T04:34:38.714868", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "cbde4ce6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:39.216775Z", + "iopub.status.busy": "2021-12-02T04:34:39.216014Z", + "iopub.status.idle": "2021-12-02T04:34:43.223179Z", + "shell.execute_reply": "2021-12-02T04:34:43.223640Z" + }, + "papermill": { + "duration": 4.108094, + "end_time": "2021-12-02T04:34:43.223780", + "exception": false, + "start_time": "2021-12-02T04:34:39.115686", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "786 ms ± 9.29 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "1250547e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.427169Z", + "iopub.status.busy": "2021-12-02T04:34:43.426487Z", + "iopub.status.idle": "2021-12-02T04:34:43.474288Z", + "shell.execute_reply": "2021-12-02T04:34:43.473832Z" + }, + "papermill": { + "duration": 0.148908, + "end_time": "2021-12-02T04:34:43.474385", + "exception": false, + "start_time": "2021-12-02T04:34:43.325477", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:08:55 Samples: 100\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.805 CPU time: 0.810\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-7232d267-3e1d-4c9d-aeeb-5254ca0449dc.json\n", + "\n", + "0.805 ../../../tmp/ipykernel_2806375/2991494264.py:2\n", + "`- 0.805 func ../../../tmp/ipykernel_2806375/2661685993.py:1\n", + " `- 0.805 ccc ccc/coef/impl.py:308\n", + " |- 0.793 compute_coef ccc/coef/impl.py:494\n", + " | `- 0.793 cdist_func ccc/coef/impl.py:487\n", + " | `- 0.793 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | `- 0.791 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 0.791 lock.acquire \n", + " `- 0.012 result_iterator concurrent/futures/_base.py:602\n", + " [4 frames hidden] concurrent, threading, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "6853c300", + "metadata": { + "papermill": { + "duration": 0.09707, + "end_time": "2021-12-02T04:34:43.669776", + "exception": false, + "start_time": 
"2021-12-02T04:34:43.572706", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=1000`" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "f77e8490", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.871037Z", + "iopub.status.busy": "2021-12-02T04:34:43.870573Z", + "iopub.status.idle": "2021-12-02T04:34:43.872099Z", + "shell.execute_reply": "2021-12-02T04:34:43.872442Z" + }, + "papermill": { + "duration": 0.103066, + "end_time": "2021-12-02T04:34:43.872558", + "exception": false, + "start_time": "2021-12-02T04:34:43.769492", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 1000" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "c99f544a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.102128Z", + "iopub.status.busy": "2021-12-02T04:34:44.101540Z", + "iopub.status.idle": "2021-12-02T04:34:44.103376Z", + "shell.execute_reply": "2021-12-02T04:34:44.103817Z" + }, + "papermill": { + "duration": 0.13265, + "end_time": "2021-12-02T04:34:44.103967", + "exception": false, + "start_time": "2021-12-02T04:34:43.971317", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "9721b048", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.515965Z", + "iopub.status.busy": "2021-12-02T04:34:44.515462Z", + "iopub.status.idle": "2021-12-02T04:34:50.907897Z", + "shell.execute_reply": "2021-12-02T04:34:50.908362Z" + }, + "papermill": { + "duration": 6.496377, + "end_time": "2021-12-02T04:34:50.908503", + "exception": false, + "start_time": "2021-12-02T04:34:44.412126", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "809 ms ± 3.7 ms per loop (mean ± 
std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "fd0f4dd6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.122723Z", + "iopub.status.busy": "2021-12-02T04:34:51.122244Z", + "iopub.status.idle": "2021-12-02T04:34:51.194219Z", + "shell.execute_reply": "2021-12-02T04:34:51.193813Z" + }, + "papermill": { + "duration": 0.179898, + "end_time": "2021-12-02T04:34:51.194319", + "exception": false, + "start_time": "2021-12-02T04:34:51.014421", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:09:09 Samples: 100\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.825 CPU time: 0.828\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-7232d267-3e1d-4c9d-aeeb-5254ca0449dc.json\n", + "\n", + "0.824 ../../../tmp/ipykernel_2806375/2991494264.py:2\n", + "`- 0.824 func ../../../tmp/ipykernel_2806375/2661685993.py:1\n", + " `- 0.824 ccc ccc/coef/impl.py:308\n", + " |- 0.809 compute_coef ccc/coef/impl.py:494\n", + " | `- 0.809 cdist_func ccc/coef/impl.py:487\n", + " | `- 0.809 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | `- 0.809 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 0.809 lock.acquire \n", + " `- 0.016 result_iterator concurrent/futures/_base.py:602\n", + " [4 frames hidden] concurrent, threading, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "f6b9adcf-1da4-4496-b05f-47952fd80d7f", + "metadata": { + "papermill": { + "duration": 0.099861, + "end_time": 
"2021-12-02T04:34:51.394504", + "exception": false, + "start_time": "2021-12-02T04:34:51.294643", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` large" + ] + }, + { + "cell_type": "markdown", + "id": "8f2e407c", + "metadata": { + "papermill": { + "duration": 0.098767, + "end_time": "2021-12-02T04:34:51.593072", + "exception": false, + "start_time": "2021-12-02T04:34:51.494305", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50000`" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "c522396e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.793258Z", + "iopub.status.busy": "2021-12-02T04:34:51.792385Z", + "iopub.status.idle": "2021-12-02T04:34:51.794710Z", + "shell.execute_reply": "2021-12-02T04:34:51.795054Z" + }, + "papermill": { + "duration": 0.103749, + "end_time": "2021-12-02T04:34:51.795172", + "exception": false, + "start_time": "2021-12-02T04:34:51.691423", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50000" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "a5e536cc", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.996666Z", + "iopub.status.busy": "2021-12-02T04:34:51.996120Z", + "iopub.status.idle": "2021-12-02T04:34:51.999329Z", + "shell.execute_reply": "2021-12-02T04:34:51.998896Z" + }, + "papermill": { + "duration": 0.104554, + "end_time": "2021-12-02T04:34:51.999423", + "exception": false, + "start_time": "2021-12-02T04:34:51.894869", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "91470f64", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:52.408864Z", + "iopub.status.busy": "2021-12-02T04:34:52.408310Z", + "iopub.status.idle": 
"2021-12-02T04:35:28.383597Z", + "shell.execute_reply": "2021-12-02T04:35:28.384010Z" + }, + "papermill": { + "duration": 36.078113, + "end_time": "2021-12-02T04:35:28.384125", + "exception": false, + "start_time": "2021-12-02T04:34:52.306012", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4.16 s ± 9.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "4de4e0b0", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:28.594907Z", + "iopub.status.busy": "2021-12-02T04:35:28.594395Z", + "iopub.status.idle": "2021-12-02T04:35:30.850079Z", + "shell.execute_reply": "2021-12-02T04:35:30.850466Z" + }, + "papermill": { + "duration": 2.36055, + "end_time": "2021-12-02T04:35:30.850581", + "exception": false, + "start_time": "2021-12-02T04:35:28.490031", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:10:17 Samples: 106\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 4.176 CPU time: 4.194\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-7232d267-3e1d-4c9d-aeeb-5254ca0449dc.json\n", + "\n", + "4.176 ../../../tmp/ipykernel_2806375/2991494264.py:2\n", + "`- 4.176 func ../../../tmp/ipykernel_2806375/2661685993.py:1\n", + " `- 4.176 ccc ccc/coef/impl.py:308\n", + " |- 3.320 compute_coef ccc/coef/impl.py:494\n", + " | `- 3.320 cdist_func ccc/coef/impl.py:487\n", + " | `- 3.320 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | `- 3.320 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 3.320 lock.acquire \n", + " `- 0.849 
result_iterator concurrent/futures/_base.py:602\n", + " [4 frames hidden] concurrent, threading, \n", + " 0.849 lock.acquire \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "b0c07894", + "metadata": { + "papermill": { + "duration": 0.100749, + "end_time": "2021-12-02T04:35:31.053465", + "exception": false, + "start_time": "2021-12-02T04:35:30.952716", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100000`" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "7fb2ab2e-a6de-412b-9540-d00f8641290e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.262806Z", + "iopub.status.busy": "2021-12-02T04:35:31.262280Z", + "iopub.status.idle": "2021-12-02T04:35:31.264634Z", + "shell.execute_reply": "2021-12-02T04:35:31.264124Z" + }, + "papermill": { + "duration": 0.110468, + "end_time": "2021-12-02T04:35:31.264738", + "exception": false, + "start_time": "2021-12-02T04:35:31.154270", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100000" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "81765e91", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.472530Z", + "iopub.status.busy": "2021-12-02T04:35:31.472083Z", + "iopub.status.idle": "2021-12-02T04:35:31.475862Z", + "shell.execute_reply": "2021-12-02T04:35:31.475424Z" + }, + "papermill": { + "duration": 0.107444, + "end_time": "2021-12-02T04:35:31.476016", + "exception": false, + "start_time": "2021-12-02T04:35:31.368572", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "aca57100", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.892045Z", + 
"iopub.status.busy": "2021-12-02T04:35:31.891417Z", + "iopub.status.idle": "2021-12-02T04:36:46.681345Z", + "shell.execute_reply": "2021-12-02T04:36:46.681695Z" + }, + "papermill": { + "duration": 74.896315, + "end_time": "2021-12-02T04:36:46.681806", + "exception": false, + "start_time": "2021-12-02T04:35:31.785491", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7.73 s ± 16.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "b9c25f30", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:36:46.889124Z", + "iopub.status.busy": "2021-12-02T04:36:46.888441Z", + "iopub.status.idle": "2021-12-02T04:36:51.544747Z", + "shell.execute_reply": "2021-12-02T04:36:51.544315Z" + }, + "papermill": { + "duration": 4.761869, + "end_time": "2021-12-02T04:36:51.544839", + "exception": false, + "start_time": "2021-12-02T04:36:46.782970", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:12:25 Samples: 118\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 7.721 CPU time: 7.748\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-7232d267-3e1d-4c9d-aeeb-5254ca0449dc.json\n", + "\n", + "7.721 ../../../tmp/ipykernel_2806375/2991494264.py:2\n", + "`- 7.721 func ../../../tmp/ipykernel_2806375/2661685993.py:1\n", + " `- 7.705 ccc ccc/coef/impl.py:308\n", + " |- 5.907 compute_coef ccc/coef/impl.py:494\n", + " | `- 5.906 cdist_func ccc/coef/impl.py:487\n", + " | `- 5.906 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | `- 5.901 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] 
concurrent, threading, \n", + " | 5.901 lock.acquire \n", + " `- 1.796 result_iterator concurrent/futures/_base.py:602\n", + " [4 frames hidden] concurrent, threading, \n", + " 1.796 lock.acquire \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aa76596-f126-4ba8-8bba-4d31225e0e5d", + "metadata": { + "papermill": { + "duration": 0.102208, + "end_time": "2021-12-02T04:36:51.764154", + "exception": false, + "start_time": "2021-12-02T04:36:51.661946", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all,-execution,-papermill,-trusted" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "papermill": { + "default_parameters": {}, + "duration": 167.355469, + "end_time": "2021-12-02T04:36:52.275357", + "environment_variables": {}, + "exception": null, + "input_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.ipynb", + "output_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.run.ipynb", + "parameters": {}, + "start_time": "2021-12-02T04:34:04.919888", + "version": "2.3.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/others/10_gpu_ari_profiling/03_gpu_version_more_profilers/00_cuda_ari_24_cpu_Core copy 2.ipynb b/nbs/others/10_gpu_ari_profiling/03_gpu_version_more_profilers/00_cuda_ari_24_cpu_Core copy 2.ipynb new file mode 100644 index 00000000..5e43c37b --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/03_gpu_version_more_profilers/00_cuda_ari_24_cpu_Core copy 2.ipynb @@ -0,0 +1,1503 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "36c65ddb-8246-4b0c-a231-68a373acc2cf", + "metadata": { + "papermill": { + "duration": 0.095646, + "end_time": "2021-12-02T04:34:06.384775", + 
"exception": false, + "start_time": "2021-12-02T04:34:06.289129", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Description" + ] + }, + { + "cell_type": "markdown", + "id": "337633a8-d03e-4509-b89d-f8daee598958", + "metadata": { + "papermill": { + "duration": 0.091407, + "end_time": "2021-12-02T04:34:06.566258", + "exception": false, + "start_time": "2021-12-02T04:34:06.474851", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Single-threaded version, force to use `cdist_parts_basic` to see the overhead of `ari` " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5c010e62", + "metadata": {}, + "outputs": [], + "source": [ + "# Set core count to 1\n", + "# %env CM_N_JOBS=1" + ] + }, + { + "cell_type": "markdown", + "id": "63c2b099-cc44-4fe2-93d1-40336e0a8466", + "metadata": { + "papermill": { + "duration": 0.09056, + "end_time": "2021-12-02T04:34:06.747753", + "exception": false, + "start_time": "2021-12-02T04:34:06.657193", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Remove pycache dir" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "960c4ff0-2a73-4eaa-97d6-3269102233eb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:06.942330Z", + "iopub.status.busy": "2021-12-02T04:34:06.941822Z", + "iopub.status.idle": "2021-12-02T04:34:07.534005Z", + "shell.execute_reply": "2021-12-02T04:34:07.531916Z" + }, + "papermill": { + "duration": 0.696141, + "end_time": "2021-12-02T04:34:07.534420", + "exception": false, + "start_time": "2021-12-02T04:34:06.838279", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/haoyu/_database/projs/ccc-gpu\n" + ] + } + ], + "source": [ + "!echo ${CODE_DIR}" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e18db58a-316a-445b-a376-8b2ec18e08d8", + "metadata": { + "execution": { + "iopub.execute_input":
"2021-12-02T04:34:07.733067Z", + "iopub.status.busy": "2021-12-02T04:34:07.732593Z", + "iopub.status.idle": "2021-12-02T04:34:08.445221Z", + "shell.execute_reply": "2021-12-02T04:34:08.443302Z" + }, + "papermill": { + "duration": 0.817887, + "end_time": "2021-12-02T04:34:08.445598", + "exception": false, + "start_time": "2021-12-02T04:34:07.627711", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/sklearn/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/scipy/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/coef/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/utils/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/pytorch/__pycache__\n" + ] + } + ], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b54099b0-a990-4bbd-bcbd-e206eb0f0f0e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:08.668700Z", + "iopub.status.busy": "2021-12-02T04:34:08.668226Z", + "iopub.status.idle": "2021-12-02T04:34:09.290588Z", + "shell.execute_reply": "2021-12-02T04:34:09.288752Z" + }, + "papermill": { + "duration": 0.719545, + "end_time": "2021-12-02T04:34:09.290958", + "exception": false, + "start_time": "2021-12-02T04:34:08.571413", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -prune -exec rm -rf {} \\;" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7a9a8098-8160-46bf-8d83-bc398cbe2382", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:09.513928Z", + "iopub.status.busy": "2021-12-02T04:34:09.513465Z", + "iopub.status.idle": "2021-12-02T04:34:10.120602Z", + "shell.execute_reply": 
"2021-12-02T04:34:10.118684Z" + }, + "papermill": { + "duration": 0.704384, + "end_time": "2021-12-02T04:34:10.120972", + "exception": false, + "start_time": "2021-12-02T04:34:09.416588", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "markdown", + "id": "c2251313-41ac-46fd-a845-0f209689ecf6", + "metadata": { + "papermill": { + "duration": 0.093051, + "end_time": "2021-12-02T04:34:10.338738", + "exception": false, + "start_time": "2021-12-02T04:34:10.245687", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Modules" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "987ef5f1-be49-4a6c-a4f4-b24a0a2094cb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:10.528319Z", + "iopub.status.busy": "2021-12-02T04:34:10.527833Z", + "iopub.status.idle": "2021-12-02T04:34:10.845421Z", + "shell.execute_reply": "2021-12-02T04:34:10.845020Z" + }, + "papermill": { + "duration": 0.414249, + "end_time": "2021-12-02T04:34:10.845518", + "exception": false, + "start_time": "2021-12-02T04:34:10.431269", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2024-06-28 11:47:39,657 - numba.cuda.cudadrv.driver] INFO: init\n", + "[2024-06-28 11:47:39,755 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:47:39,755 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:47:39,755 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:47:39,755 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:47:39,756 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 16 bytes\n", + "[2024-06-28 11:47:39,828 - numba.cuda.cudadrv.driver] INFO: add pending 
dealloc: cuMemFree_v2 16 bytes\n", + "[2024-06-28 11:47:39,828 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-28 11:47:39,830 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-28 11:47:39,830 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:47:39,831 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:47:39,831 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:47:39,831 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:47:39,831 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:47:39,832 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:47:39,832 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:47:39,832 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 16 bytes\n", + "[2024-06-28 11:47:39,832 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 16 bytes\n", + "[2024-06-28 11:47:39,832 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-28 11:47:39,833 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-28 11:47:39,833 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:47:39,833 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:47:39,833 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:47:39,833 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:47:39,834 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:47:39,834 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-28 
11:47:39,835 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:47:39,836 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-28 11:47:39,836 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:47:39,836 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:47:39,836 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:47:39,837 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:47:39,837 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-28 11:47:39,838 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-28 11:47:39,838 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:47:39,839 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:47:39,839 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-28 11:47:39,839 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:47:39,839 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-28 11:47:39,839 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:47:39,840 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:47:39,840 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:47:39,840 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:47:39,840 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-28 11:47:39,840 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-28 11:47:39,841 - numba.cuda.cudadrv.driver] INFO: 
add pending dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-28 11:47:39,841 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:47:39,841 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:47:39,841 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 36 bytes\n", + "[2024-06-28 11:47:39,842 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 36 bytes\n", + "[2024-06-28 11:47:39,842 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 8 bytes\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from ccc.coef import ccc\n", + "%load_ext pyinstrument" + ] + }, + { + "cell_type": "markdown", + "id": "24399ccb-d33d-4bad-9baf-638c9c56feb2", + "metadata": { + "papermill": { + "duration": 0.095359, + "end_time": "2021-12-02T04:34:11.037941", + "exception": false, + "start_time": "2021-12-02T04:34:10.942582", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Settings" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c609cefa-f513-4cf8-9573-367744e31c5f", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.224902Z", + "iopub.status.busy": "2021-12-02T04:34:11.224429Z", + "iopub.status.idle": "2021-12-02T04:34:11.226352Z", + "shell.execute_reply": "2021-12-02T04:34:11.226694Z" + }, + "papermill": { + "duration": 0.097459, + "end_time": "2021-12-02T04:34:11.226809", + "exception": false, + "start_time": "2021-12-02T04:34:11.129350", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_REPS = 10\n", + "np.random.seed(0)\n", + "\n", + "# Disable numba logging\n", + "import logging\n", + "\n", + "numba_logger = logging.getLogger('numba')\n", + "numba_logger.setLevel(logging.WARNING)" + ] + }, + { + "cell_type": "markdown", + "id": "6fd3067b-a4f7-475e-9575-20246934537d", + "metadata": { + "papermill": { + "duration": 0.092004, + "end_time": 
"2021-12-02T04:34:11.602824", + "exception": false, + "start_time": "2021-12-02T04:34:11.510820", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "02e0507c-43ff-4693-8a3b-8ccd8f23168c", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.790398Z", + "iopub.status.busy": "2021-12-02T04:34:11.789933Z", + "iopub.status.idle": "2021-12-02T04:34:20.454299Z", + "shell.execute_reply": "2021-12-02T04:34:20.453925Z" + }, + "papermill": { + "duration": 8.760307, + "end_time": "2021-12-02T04:34:20.454396", + "exception": false, + "start_time": "2021-12-02T04:34:11.694089", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.15625" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let numba compile all the code before profiling\n", + "ccc(np.random.rand(10), np.random.rand(10))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "bc5236a7", + "metadata": {}, + "outputs": [], + "source": [ + "def func(n_reps=N_REPS):\n", + " for i in range(n_reps):\n", + " ccc(x, y, n_jobs=24)" + ] + }, + { + "cell_type": "markdown", + "id": "8549179d-1517-4a40-9a51-b95dc02d0fcc", + "metadata": { + "papermill": { + "duration": 0.092955, + "end_time": "2021-12-02T04:34:20.640996", + "exception": false, + "start_time": "2021-12-02T04:34:20.548041", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` small" + ] + }, + { + "cell_type": "markdown", + "id": "13ba811b", + "metadata": { + "papermill": { + "duration": 0.092926, + "end_time": "2021-12-02T04:34:20.826776", + "exception": false, + "start_time": "2021-12-02T04:34:20.733850", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50`" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "68064f0b", + "metadata": { + 
"execution": { + "iopub.execute_input": "2021-12-02T04:34:21.026855Z", + "iopub.status.busy": "2021-12-02T04:34:21.026243Z", + "iopub.status.idle": "2021-12-02T04:34:21.028562Z", + "shell.execute_reply": "2021-12-02T04:34:21.027971Z" + }, + "papermill": { + "duration": 0.107158, + "end_time": "2021-12-02T04:34:21.028689", + "exception": false, + "start_time": "2021-12-02T04:34:20.921531", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6f2ff213-d6b7-458f-acbf-8c73dd497a2a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.225799Z", + "iopub.status.busy": "2021-12-02T04:34:21.225350Z", + "iopub.status.idle": "2021-12-02T04:34:21.226800Z", + "shell.execute_reply": "2021-12-02T04:34:21.227141Z" + }, + "papermill": { + "duration": 0.09836, + "end_time": "2021-12-02T04:34:21.227254", + "exception": false, + "start_time": "2021-12-02T04:34:21.128894", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c0abc65a-2f3c-4476-9c2a-f8d7753b75e6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.614745Z", + "iopub.status.busy": "2021-12-02T04:34:21.614223Z", + "iopub.status.idle": "2021-12-02T04:34:33.925655Z", + "shell.execute_reply": "2021-12-02T04:34:33.925209Z" + }, + "papermill": { + "duration": 12.410211, + "end_time": "2021-12-02T04:34:33.925764", + "exception": false, + "start_time": "2021-12-02T04:34:21.515553", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "865 ms ± 13.5 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e80a7b02-310f-4bd9-90d5-2a41186db39e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.125850Z", + "iopub.status.busy": "2021-12-02T04:34:34.125263Z", + "iopub.status.idle": "2021-12-02T04:34:34.148529Z", + "shell.execute_reply": "2021-12-02T04:34:34.148886Z" + }, + "papermill": { + "duration": 0.124845, + "end_time": "2021-12-02T04:34:34.149006", + "exception": false, + "start_time": "2021-12-02T04:34:34.024161", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:47:54 Samples: 57\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.855 CPU time: 1.033\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-cd684189-5184-4139-b799-41e3f2597cfb.json\n", + "\n", + "0.854 ../../../tmp/ipykernel_2868492/2991494264.py:2\n", + "`- 0.854 func ../../../tmp/ipykernel_2868492/2287242631.py:1\n", + " `- 0.854 ccc ccc/coef/impl.py:308\n", + " `- 0.852 compute_coef ccc/coef/impl.py:494\n", + " `- 0.851 cdist_func ccc/coef/impl.py:487\n", + " `- 0.851 cdist_parts_parallel ccc/coef/impl.py:193\n", + " |- 0.817 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 0.814 lock.acquire \n", + " `- 0.034 ccc/coef/impl.py:211\n", + " `- 0.032 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " [6 frames hidden] concurrent, threading, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "2548440c", + "metadata": { + "papermill": { + "duration": 0.094866, + 
"end_time": "2021-12-02T04:34:34.338010", + "exception": false, + "start_time": "2021-12-02T04:34:34.243144", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100`" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "ddb768d7-9b74-424c-bdb9-9e52b9e5b181", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.530550Z", + "iopub.status.busy": "2021-12-02T04:34:34.530085Z", + "iopub.status.idle": "2021-12-02T04:34:34.531559Z", + "shell.execute_reply": "2021-12-02T04:34:34.531910Z" + }, + "papermill": { + "duration": 0.099465, + "end_time": "2021-12-02T04:34:34.532025", + "exception": false, + "start_time": "2021-12-02T04:34:34.432560", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "161cf922-41f7-4ced-af1f-e3d25b36b200", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.723894Z", + "iopub.status.busy": "2021-12-02T04:34:34.723405Z", + "iopub.status.idle": "2021-12-02T04:34:34.725017Z", + "shell.execute_reply": "2021-12-02T04:34:34.725362Z" + }, + "papermill": { + "duration": 0.099541, + "end_time": "2021-12-02T04:34:34.725479", + "exception": false, + "start_time": "2021-12-02T04:34:34.625938", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0a7f2b1f-4b87-4dde-a46b-2acec5ac93ba", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:35.120950Z", + "iopub.status.busy": "2021-12-02T04:34:35.120480Z", + "iopub.status.idle": "2021-12-02T04:34:37.984893Z", + "shell.execute_reply": "2021-12-02T04:34:37.985354Z" + }, + "papermill": { + "duration": 2.971389, + "end_time": "2021-12-02T04:34:37.985494", + "exception": false, + "start_time": "2021-12-02T04:34:35.014105", + 
"status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.96 s ± 20.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "74acbe27-9807-4f26-8b7e-b74d0f745b63", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.190471Z", + "iopub.status.busy": "2021-12-02T04:34:38.189361Z", + "iopub.status.idle": "2021-12-02T04:34:38.227298Z", + "shell.execute_reply": "2021-12-02T04:34:38.227692Z" + }, + "papermill": { + "duration": 0.13744, + "end_time": "2021-12-02T04:34:38.227812", + "exception": false, + "start_time": "2021-12-02T04:34:38.090372", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:48:26 Samples: 83\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 2.042 CPU time: 2.476\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-cd684189-5184-4139-b799-41e3f2597cfb.json\n", + "\n", + "2.042 ../../../tmp/ipykernel_2868492/2991494264.py:2\n", + "`- 2.042 func ../../../tmp/ipykernel_2868492/2287242631.py:1\n", + " `- 2.042 ccc ccc/coef/impl.py:308\n", + " `- 2.034 compute_coef ccc/coef/impl.py:494\n", + " `- 2.032 cdist_func ccc/coef/impl.py:487\n", + " `- 2.032 cdist_parts_parallel ccc/coef/impl.py:193\n", + " |- 1.986 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 1.986 lock.acquire \n", + " `- 0.046 ccc/coef/impl.py:211\n", + " `- 0.046 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " [6 frames hidden] concurrent, threading, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + 
"source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "611ff8e1", + "metadata": { + "papermill": { + "duration": 0.09562, + "end_time": "2021-12-02T04:34:38.420643", + "exception": false, + "start_time": "2021-12-02T04:34:38.325023", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=500`" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "4bcf4b42", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.617318Z", + "iopub.status.busy": "2021-12-02T04:34:38.616836Z", + "iopub.status.idle": "2021-12-02T04:34:38.618469Z", + "shell.execute_reply": "2021-12-02T04:34:38.618817Z" + }, + "papermill": { + "duration": 0.101998, + "end_time": "2021-12-02T04:34:38.618933", + "exception": false, + "start_time": "2021-12-02T04:34:38.516935", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 500" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "0bf2f21e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.813589Z", + "iopub.status.busy": "2021-12-02T04:34:38.813117Z", + "iopub.status.idle": "2021-12-02T04:34:38.815386Z", + "shell.execute_reply": "2021-12-02T04:34:38.815020Z" + }, + "papermill": { + "duration": 0.100616, + "end_time": "2021-12-02T04:34:38.815484", + "exception": false, + "start_time": "2021-12-02T04:34:38.714868", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "cbde4ce6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:39.216775Z", + "iopub.status.busy": "2021-12-02T04:34:39.216014Z", + "iopub.status.idle": "2021-12-02T04:34:43.223179Z", + "shell.execute_reply": "2021-12-02T04:34:43.223640Z" + }, + "papermill": { + "duration": 4.108094, + "end_time": 
"2021-12-02T04:34:43.223780", + "exception": false, + "start_time": "2021-12-02T04:34:39.115686", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2 s ± 17.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "1250547e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.427169Z", + "iopub.status.busy": "2021-12-02T04:34:43.426487Z", + "iopub.status.idle": "2021-12-02T04:34:43.474288Z", + "shell.execute_reply": "2021-12-02T04:34:43.473832Z" + }, + "papermill": { + "duration": 0.148908, + "end_time": "2021-12-02T04:34:43.474385", + "exception": false, + "start_time": "2021-12-02T04:34:43.325477", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:49:00 Samples: 85\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 2.008 CPU time: 2.430\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-cd684189-5184-4139-b799-41e3f2597cfb.json\n", + "\n", + "2.008 ../../../tmp/ipykernel_2868492/2991494264.py:2\n", + "`- 2.008 func ../../../tmp/ipykernel_2868492/2287242631.py:1\n", + " `- 2.007 ccc ccc/coef/impl.py:308\n", + " `- 1.996 compute_coef ccc/coef/impl.py:494\n", + " `- 1.996 cdist_func ccc/coef/impl.py:487\n", + " `- 1.996 cdist_parts_parallel ccc/coef/impl.py:193\n", + " |- 1.959 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 1.958 lock.acquire \n", + " `- 0.038 ccc/coef/impl.py:211\n", + " `- 0.037 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " [6 frames hidden] concurrent, threading, \n", + "\n" + ] 
+ }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "6853c300", + "metadata": { + "papermill": { + "duration": 0.09707, + "end_time": "2021-12-02T04:34:43.669776", + "exception": false, + "start_time": "2021-12-02T04:34:43.572706", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=1000`" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "f77e8490", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.871037Z", + "iopub.status.busy": "2021-12-02T04:34:43.870573Z", + "iopub.status.idle": "2021-12-02T04:34:43.872099Z", + "shell.execute_reply": "2021-12-02T04:34:43.872442Z" + }, + "papermill": { + "duration": 0.103066, + "end_time": "2021-12-02T04:34:43.872558", + "exception": false, + "start_time": "2021-12-02T04:34:43.769492", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 1000" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "c99f544a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.102128Z", + "iopub.status.busy": "2021-12-02T04:34:44.101540Z", + "iopub.status.idle": "2021-12-02T04:34:44.103376Z", + "shell.execute_reply": "2021-12-02T04:34:44.103817Z" + }, + "papermill": { + "duration": 0.13265, + "end_time": "2021-12-02T04:34:44.103967", + "exception": false, + "start_time": "2021-12-02T04:34:43.971317", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "9721b048", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.515965Z", + "iopub.status.busy": "2021-12-02T04:34:44.515462Z", + "iopub.status.idle": "2021-12-02T04:34:50.907897Z", + "shell.execute_reply": "2021-12-02T04:34:50.908362Z" + }, + 
"papermill": { + "duration": 6.496377, + "end_time": "2021-12-02T04:34:50.908503", + "exception": false, + "start_time": "2021-12-02T04:34:44.412126", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.01 s ± 13.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "fd0f4dd6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.122723Z", + "iopub.status.busy": "2021-12-02T04:34:51.122244Z", + "iopub.status.idle": "2021-12-02T04:34:51.194219Z", + "shell.execute_reply": "2021-12-02T04:34:51.193813Z" + }, + "papermill": { + "duration": 0.179898, + "end_time": "2021-12-02T04:34:51.194319", + "exception": false, + "start_time": "2021-12-02T04:34:51.014421", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:49:34 Samples: 89\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 1.982 CPU time: 2.440\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-cd684189-5184-4139-b799-41e3f2597cfb.json\n", + "\n", + "1.981 ../../../tmp/ipykernel_2868492/2991494264.py:2\n", + "`- 1.981 func ../../../tmp/ipykernel_2868492/2287242631.py:1\n", + " `- 1.980 ccc ccc/coef/impl.py:308\n", + " `- 1.965 compute_coef ccc/coef/impl.py:494\n", + " `- 1.965 cdist_func ccc/coef/impl.py:487\n", + " `- 1.965 cdist_parts_parallel ccc/coef/impl.py:193\n", + " |- 1.921 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 1.920 lock.acquire \n", + " `- 0.043 ccc/coef/impl.py:211\n", + " `- 0.043 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " 
[6 frames hidden] concurrent, threading, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "f6b9adcf-1da4-4496-b05f-47952fd80d7f", + "metadata": { + "papermill": { + "duration": 0.099861, + "end_time": "2021-12-02T04:34:51.394504", + "exception": false, + "start_time": "2021-12-02T04:34:51.294643", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` large" + ] + }, + { + "cell_type": "markdown", + "id": "8f2e407c", + "metadata": { + "papermill": { + "duration": 0.098767, + "end_time": "2021-12-02T04:34:51.593072", + "exception": false, + "start_time": "2021-12-02T04:34:51.494305", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50000`" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "c522396e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.793258Z", + "iopub.status.busy": "2021-12-02T04:34:51.792385Z", + "iopub.status.idle": "2021-12-02T04:34:51.794710Z", + "shell.execute_reply": "2021-12-02T04:34:51.795054Z" + }, + "papermill": { + "duration": 0.103749, + "end_time": "2021-12-02T04:34:51.795172", + "exception": false, + "start_time": "2021-12-02T04:34:51.691423", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50000" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "a5e536cc", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.996666Z", + "iopub.status.busy": "2021-12-02T04:34:51.996120Z", + "iopub.status.idle": "2021-12-02T04:34:51.999329Z", + "shell.execute_reply": "2021-12-02T04:34:51.998896Z" + }, + "papermill": { + "duration": 0.104554, + "end_time": "2021-12-02T04:34:51.999423", + "exception": false, + "start_time": "2021-12-02T04:34:51.894869", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x 
= np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "91470f64", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:52.408864Z", + "iopub.status.busy": "2021-12-02T04:34:52.408310Z", + "iopub.status.idle": "2021-12-02T04:35:28.383597Z", + "shell.execute_reply": "2021-12-02T04:35:28.384010Z" + }, + "papermill": { + "duration": 36.078113, + "end_time": "2021-12-02T04:35:28.384125", + "exception": false, + "start_time": "2021-12-02T04:34:52.306012", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3.98 s ± 30.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "4de4e0b0", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:28.594907Z", + "iopub.status.busy": "2021-12-02T04:35:28.594395Z", + "iopub.status.idle": "2021-12-02T04:35:30.850079Z", + "shell.execute_reply": "2021-12-02T04:35:30.850466Z" + }, + "papermill": { + "duration": 2.36055, + "end_time": "2021-12-02T04:35:30.850581", + "exception": false, + "start_time": "2021-12-02T04:35:28.490031", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:50:40 Samples: 97\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 4.035 CPU time: 6.229\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-cd684189-5184-4139-b799-41e3f2597cfb.json\n", + "\n", + "4.035 ../../../tmp/ipykernel_2868492/2991494264.py:2\n", + "`- 4.035 func ../../../tmp/ipykernel_2868492/2287242631.py:1\n", + " `- 4.024 ccc ccc/coef/impl.py:308\n", + " |- 3.542 
compute_coef ccc/coef/impl.py:494\n", + " | `- 3.541 cdist_func ccc/coef/impl.py:487\n", + " | `- 3.541 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | `- 3.524 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 3.524 lock.acquire \n", + " `- 0.467 result_iterator concurrent/futures/_base.py:602\n", + " [4 frames hidden] concurrent, threading, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "b0c07894", + "metadata": { + "papermill": { + "duration": 0.100749, + "end_time": "2021-12-02T04:35:31.053465", + "exception": false, + "start_time": "2021-12-02T04:35:30.952716", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100000`" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "7fb2ab2e-a6de-412b-9540-d00f8641290e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.262806Z", + "iopub.status.busy": "2021-12-02T04:35:31.262280Z", + "iopub.status.idle": "2021-12-02T04:35:31.264634Z", + "shell.execute_reply": "2021-12-02T04:35:31.264124Z" + }, + "papermill": { + "duration": 0.110468, + "end_time": "2021-12-02T04:35:31.264738", + "exception": false, + "start_time": "2021-12-02T04:35:31.154270", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100000" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "81765e91", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.472530Z", + "iopub.status.busy": "2021-12-02T04:35:31.472083Z", + "iopub.status.idle": "2021-12-02T04:35:31.475862Z", + "shell.execute_reply": "2021-12-02T04:35:31.475424Z" + }, + "papermill": { + "duration": 0.107444, + "end_time": "2021-12-02T04:35:31.476016", + "exception": false, + "start_time": "2021-12-02T04:35:31.368572", + "status": "completed" + }, + "tags": 
[] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "aca57100", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.892045Z", + "iopub.status.busy": "2021-12-02T04:35:31.891417Z", + "iopub.status.idle": "2021-12-02T04:36:46.681345Z", + "shell.execute_reply": "2021-12-02T04:36:46.681695Z" + }, + "papermill": { + "duration": 74.896315, + "end_time": "2021-12-02T04:36:46.681806", + "exception": false, + "start_time": "2021-12-02T04:35:31.785491", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "6.61 s ± 50.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "b9c25f30", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:36:46.889124Z", + "iopub.status.busy": "2021-12-02T04:36:46.888441Z", + "iopub.status.idle": "2021-12-02T04:36:51.544747Z", + "shell.execute_reply": "2021-12-02T04:36:51.544315Z" + }, + "papermill": { + "duration": 4.761869, + "end_time": "2021-12-02T04:36:51.544839", + "exception": false, + "start_time": "2021-12-02T04:36:46.782970", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:52:30 Samples: 97\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 6.613 CPU time: 10.182\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-cd684189-5184-4139-b799-41e3f2597cfb.json\n", + "\n", + "6.613 ../../../tmp/ipykernel_2868492/2991494264.py:2\n", + "`- 6.613 func ../../../tmp/ipykernel_2868492/2287242631.py:1\n", + " `- 
6.597 ccc ccc/coef/impl.py:308\n", + " |- 5.609 compute_coef ccc/coef/impl.py:494\n", + " | `- 5.609 cdist_func ccc/coef/impl.py:487\n", + " | `- 5.609 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | `- 5.593 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 5.592 lock.acquire \n", + " `- 0.969 result_iterator concurrent/futures/_base.py:602\n", + " [4 frames hidden] concurrent, threading, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aa76596-f126-4ba8-8bba-4d31225e0e5d", + "metadata": { + "papermill": { + "duration": 0.102208, + "end_time": "2021-12-02T04:36:51.764154", + "exception": false, + "start_time": "2021-12-02T04:36:51.661946", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all,-execution,-papermill,-trusted" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + }, + "papermill": { + "default_parameters": {}, + "duration": 167.355469, + "end_time": "2021-12-02T04:36:52.275357", + "environment_variables": {}, + "exception": null, + "input_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.ipynb", + "output_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.run.ipynb", + "parameters": {}, + "start_time": "2021-12-02T04:34:04.919888", + "version": "2.3.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git 
a/nbs/others/10_gpu_ari_profiling/03_gpu_version_more_profilers/00_cuda_ari_6_cpu_Core copy.ipynb b/nbs/others/10_gpu_ari_profiling/03_gpu_version_more_profilers/00_cuda_ari_6_cpu_Core copy.ipynb new file mode 100644 index 00000000..e1e05403 --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/03_gpu_version_more_profilers/00_cuda_ari_6_cpu_Core copy.ipynb @@ -0,0 +1,1503 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "36c65ddb-8246-4b0c-a231-68a373acc2cf", + "metadata": { + "papermill": { + "duration": 0.095646, + "end_time": "2021-12-02T04:34:06.384775", + "exception": false, + "start_time": "2021-12-02T04:34:06.289129", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Description" + ] + }, + { + "cell_type": "markdown", + "id": "337633a8-d03e-4509-b89d-f8daee598958", + "metadata": { + "papermill": { + "duration": 0.091407, + "end_time": "2021-12-02T04:34:06.566258", + "exception": false, + "start_time": "2021-12-02T04:34:06.474851", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Single-threaded version, force to use `cdist_parts_basic` to see the overhead of `ari` " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5c010e62", + "metadata": {}, + "outputs": [], + "source": [ + "# Set core count to 1\n", + "# %env CM_N_JOBS=1" + ] + }, + { + "cell_type": "markdown", + "id": "63c2b099-cc44-4fe2-93d1-40336e0a8466", + "metadata": { + "papermill": { + "duration": 0.09056, + "end_time": "2021-12-02T04:34:06.747753", + "exception": false, + "start_time": "2021-12-02T04:34:06.657193", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Remove pycache dir" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "960c4ff0-2a73-4eaa-97d6-3269102233eb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:06.942330Z", + "iopub.status.busy": "2021-12-02T04:34:06.941822Z", + "iopub.status.idle": "2021-12-02T04:34:07.534005Z", + "shell.execute_reply": 
"2021-12-02T04:34:07.531916Z" + }, + "papermill": { + "duration": 0.696141, + "end_time": "2021-12-02T04:34:07.534420", + "exception": false, + "start_time": "2021-12-02T04:34:06.838279", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/haoyu/_database/projs/ccc-gpu\n" + ] + } + ], + "source": [ + "!echo ${CODE_DIR}" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e18db58a-316a-445b-a376-8b2ec18e08d8", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:07.733067Z", + "iopub.status.busy": "2021-12-02T04:34:07.732593Z", + "iopub.status.idle": "2021-12-02T04:34:08.445221Z", + "shell.execute_reply": "2021-12-02T04:34:08.443302Z" + }, + "papermill": { + "duration": 0.817887, + "end_time": "2021-12-02T04:34:08.445598", + "exception": false, + "start_time": "2021-12-02T04:34:07.627711", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/sklearn/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/scipy/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/coef/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/utils/__pycache__\n", + "/home/haoyu/_database/projs/ccc-gpu/libs/ccc/pytorch/__pycache__\n" + ] + } + ], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b54099b0-a990-4bbd-bcbd-e206eb0f0f0e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:08.668700Z", + "iopub.status.busy": "2021-12-02T04:34:08.668226Z", + "iopub.status.idle": "2021-12-02T04:34:09.290588Z", + "shell.execute_reply": "2021-12-02T04:34:09.288752Z" + }, + "papermill": { + "duration": 0.719545, + "end_time": "2021-12-02T04:34:09.290958", 
+ "exception": false, + "start_time": "2021-12-02T04:34:08.571413", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -prune -exec rm -rf {} \\;" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7a9a8098-8160-46bf-8d83-bc398cbe2382", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:09.513928Z", + "iopub.status.busy": "2021-12-02T04:34:09.513465Z", + "iopub.status.idle": "2021-12-02T04:34:10.120602Z", + "shell.execute_reply": "2021-12-02T04:34:10.118684Z" + }, + "papermill": { + "duration": 0.704384, + "end_time": "2021-12-02T04:34:10.120972", + "exception": false, + "start_time": "2021-12-02T04:34:09.416588", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!find ${CODE_DIR} -regex '^.*\\(__pycache__\\)$' -print" + ] + }, + { + "cell_type": "markdown", + "id": "c2251313-41ac-46fd-a845-0f209689ecf6", + "metadata": { + "papermill": { + "duration": 0.093051, + "end_time": "2021-12-02T04:34:10.338738", + "exception": false, + "start_time": "2021-12-02T04:34:10.245687", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Modules" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "987ef5f1-be49-4a6c-a4f4-b24a0a2094cb", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:10.528319Z", + "iopub.status.busy": "2021-12-02T04:34:10.527833Z", + "iopub.status.idle": "2021-12-02T04:34:10.845421Z", + "shell.execute_reply": "2021-12-02T04:34:10.845020Z" + }, + "papermill": { + "duration": 0.414249, + "end_time": "2021-12-02T04:34:10.845518", + "exception": false, + "start_time": "2021-12-02T04:34:10.431269", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2024-06-28 11:32:00,811 - numba.cuda.cudadrv.driver] INFO: init\n", + "[2024-06-28 11:32:00,930 - numba.cuda.cudadrv.driver] INFO: add 
pending dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:32:00,931 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:32:00,931 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:32:00,931 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:32:00,931 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 16 bytes\n", + "[2024-06-28 11:32:01,005 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 16 bytes\n", + "[2024-06-28 11:32:01,005 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-28 11:32:01,007 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-28 11:32:01,007 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:32:01,007 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:32:01,007 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:32:01,007 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:32:01,008 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:32:01,008 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:32:01,008 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:32:01,008 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 16 bytes\n", + "[2024-06-28 11:32:01,008 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 16 bytes\n", + "[2024-06-28 11:32:01,009 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-28 11:32:01,009 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-28 11:32:01,009 - numba.cuda.cudadrv.driver] INFO: dealloc: 
cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:32:01,009 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:32:01,009 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:32:01,009 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:32:01,010 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:32:01,010 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-28 11:32:01,012 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:32:01,012 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-28 11:32:01,012 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:32:01,013 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:32:01,013 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:32:01,014 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:32:01,014 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-28 11:32:01,015 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-28 11:32:01,015 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:32:01,015 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:32:01,015 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-28 11:32:01,016 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 4 bytes\n", + "[2024-06-28 11:32:01,016 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-28 11:32:01,016 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 
20 bytes\n", + "[2024-06-28 11:32:01,016 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:32:01,016 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:32:01,016 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 24 bytes\n", + "[2024-06-28 11:32:01,017 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 8 bytes\n", + "[2024-06-28 11:32:01,017 - numba.cuda.cudadrv.driver] INFO: dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-28 11:32:01,017 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 6 bytes\n", + "[2024-06-28 11:32:01,017 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:32:01,018 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 20 bytes\n", + "[2024-06-28 11:32:01,018 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 36 bytes\n", + "[2024-06-28 11:32:01,019 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 36 bytes\n", + "[2024-06-28 11:32:01,019 - numba.cuda.cudadrv.driver] INFO: add pending dealloc: cuMemFree_v2 8 bytes\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from ccc.coef import ccc\n", + "%load_ext pyinstrument" + ] + }, + { + "cell_type": "markdown", + "id": "24399ccb-d33d-4bad-9baf-638c9c56feb2", + "metadata": { + "papermill": { + "duration": 0.095359, + "end_time": "2021-12-02T04:34:11.037941", + "exception": false, + "start_time": "2021-12-02T04:34:10.942582", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Settings" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c609cefa-f513-4cf8-9573-367744e31c5f", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.224902Z", + "iopub.status.busy": "2021-12-02T04:34:11.224429Z", + "iopub.status.idle": "2021-12-02T04:34:11.226352Z", + "shell.execute_reply": "2021-12-02T04:34:11.226694Z" + }, + "papermill": { + "duration": 
0.097459, + "end_time": "2021-12-02T04:34:11.226809", + "exception": false, + "start_time": "2021-12-02T04:34:11.129350", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_REPS = 10\n", + "np.random.seed(0)\n", + "\n", + "# Disable numba logging\n", + "import logging\n", + "\n", + "numba_logger = logging.getLogger('numba')\n", + "numba_logger.setLevel(logging.WARNING)" + ] + }, + { + "cell_type": "markdown", + "id": "6fd3067b-a4f7-475e-9575-20246934537d", + "metadata": { + "papermill": { + "duration": 0.092004, + "end_time": "2021-12-02T04:34:11.602824", + "exception": false, + "start_time": "2021-12-02T04:34:11.510820", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "02e0507c-43ff-4693-8a3b-8ccd8f23168c", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:11.790398Z", + "iopub.status.busy": "2021-12-02T04:34:11.789933Z", + "iopub.status.idle": "2021-12-02T04:34:20.454299Z", + "shell.execute_reply": "2021-12-02T04:34:20.453925Z" + }, + "papermill": { + "duration": 8.760307, + "end_time": "2021-12-02T04:34:20.454396", + "exception": false, + "start_time": "2021-12-02T04:34:11.694089", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.15625" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let numba compile all the code before profiling\n", + "ccc(np.random.rand(10), np.random.rand(10))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "bc5236a7", + "metadata": {}, + "outputs": [], + "source": [ + "def func(n_reps=N_REPS):\n", + " for i in range(n_reps):\n", + " ccc(x, y, n_jobs=6)" + ] + }, + { + "cell_type": "markdown", + "id": "8549179d-1517-4a40-9a51-b95dc02d0fcc", + "metadata": { + "papermill": { + "duration": 0.092955, + "end_time": "2021-12-02T04:34:20.640996", + "exception": 
false, + "start_time": "2021-12-02T04:34:20.548041", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` small" + ] + }, + { + "cell_type": "markdown", + "id": "13ba811b", + "metadata": { + "papermill": { + "duration": 0.092926, + "end_time": "2021-12-02T04:34:20.826776", + "exception": false, + "start_time": "2021-12-02T04:34:20.733850", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50`" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "68064f0b", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.026855Z", + "iopub.status.busy": "2021-12-02T04:34:21.026243Z", + "iopub.status.idle": "2021-12-02T04:34:21.028562Z", + "shell.execute_reply": "2021-12-02T04:34:21.027971Z" + }, + "papermill": { + "duration": 0.107158, + "end_time": "2021-12-02T04:34:21.028689", + "exception": false, + "start_time": "2021-12-02T04:34:20.921531", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6f2ff213-d6b7-458f-acbf-8c73dd497a2a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.225799Z", + "iopub.status.busy": "2021-12-02T04:34:21.225350Z", + "iopub.status.idle": "2021-12-02T04:34:21.226800Z", + "shell.execute_reply": "2021-12-02T04:34:21.227141Z" + }, + "papermill": { + "duration": 0.09836, + "end_time": "2021-12-02T04:34:21.227254", + "exception": false, + "start_time": "2021-12-02T04:34:21.128894", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c0abc65a-2f3c-4476-9c2a-f8d7753b75e6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:21.614745Z", + "iopub.status.busy": "2021-12-02T04:34:21.614223Z", + "iopub.status.idle": 
"2021-12-02T04:34:33.925655Z", + "shell.execute_reply": "2021-12-02T04:34:33.925209Z" + }, + "papermill": { + "duration": 12.410211, + "end_time": "2021-12-02T04:34:33.925764", + "exception": false, + "start_time": "2021-12-02T04:34:21.515553", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "847 ms ± 24.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e80a7b02-310f-4bd9-90d5-2a41186db39e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.125850Z", + "iopub.status.busy": "2021-12-02T04:34:34.125263Z", + "iopub.status.idle": "2021-12-02T04:34:34.148529Z", + "shell.execute_reply": "2021-12-02T04:34:34.148886Z" + }, + "papermill": { + "duration": 0.124845, + "end_time": "2021-12-02T04:34:34.149006", + "exception": false, + "start_time": "2021-12-02T04:34:34.024161", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:32:14 Samples: 68\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 0.863 CPU time: 1.045\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-03331e06-fc00-41d5-a7eb-285b28b98172.json\n", + "\n", + "0.862 ../../../tmp/ipykernel_2841216/2991494264.py:2\n", + "`- 0.862 func ../../../tmp/ipykernel_2841216/1687822962.py:1\n", + " `- 0.862 ccc ccc/coef/impl.py:308\n", + " `- 0.860 compute_coef ccc/coef/impl.py:494\n", + " `- 0.859 cdist_func ccc/coef/impl.py:487\n", + " `- 0.859 cdist_parts_parallel ccc/coef/impl.py:193\n", + " |- 0.808 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 0.807 lock.acquire 
\n", + " `- 0.051 ccc/coef/impl.py:211\n", + " `- 0.051 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " [6 frames hidden] concurrent, threading, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "2548440c", + "metadata": { + "papermill": { + "duration": 0.094866, + "end_time": "2021-12-02T04:34:34.338010", + "exception": false, + "start_time": "2021-12-02T04:34:34.243144", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100`" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "ddb768d7-9b74-424c-bdb9-9e52b9e5b181", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.530550Z", + "iopub.status.busy": "2021-12-02T04:34:34.530085Z", + "iopub.status.idle": "2021-12-02T04:34:34.531559Z", + "shell.execute_reply": "2021-12-02T04:34:34.531910Z" + }, + "papermill": { + "duration": 0.099465, + "end_time": "2021-12-02T04:34:34.532025", + "exception": false, + "start_time": "2021-12-02T04:34:34.432560", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "161cf922-41f7-4ced-af1f-e3d25b36b200", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:34.723894Z", + "iopub.status.busy": "2021-12-02T04:34:34.723405Z", + "iopub.status.idle": "2021-12-02T04:34:34.725017Z", + "shell.execute_reply": "2021-12-02T04:34:34.725362Z" + }, + "papermill": { + "duration": 0.099541, + "end_time": "2021-12-02T04:34:34.725479", + "exception": false, + "start_time": "2021-12-02T04:34:34.625938", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0a7f2b1f-4b87-4dde-a46b-2acec5ac93ba", + 
"metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:35.120950Z", + "iopub.status.busy": "2021-12-02T04:34:35.120480Z", + "iopub.status.idle": "2021-12-02T04:34:37.984893Z", + "shell.execute_reply": "2021-12-02T04:34:37.985354Z" + }, + "papermill": { + "duration": 2.971389, + "end_time": "2021-12-02T04:34:37.985494", + "exception": false, + "start_time": "2021-12-02T04:34:35.014105", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.71 s ± 28.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "74acbe27-9807-4f26-8b7e-b74d0f745b63", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.190471Z", + "iopub.status.busy": "2021-12-02T04:34:38.189361Z", + "iopub.status.idle": "2021-12-02T04:34:38.227298Z", + "shell.execute_reply": "2021-12-02T04:34:38.227692Z" + }, + "papermill": { + "duration": 0.13744, + "end_time": "2021-12-02T04:34:38.227812", + "exception": false, + "start_time": "2021-12-02T04:34:38.090372", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:32:43 Samples: 103\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 1.793 CPU time: 2.167\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-03331e06-fc00-41d5-a7eb-285b28b98172.json\n", + "\n", + "1.792 ../../../tmp/ipykernel_2841216/2991494264.py:2\n", + "`- 1.792 func ../../../tmp/ipykernel_2841216/1687822962.py:1\n", + " `- 1.792 ccc ccc/coef/impl.py:308\n", + " `- 1.787 compute_coef ccc/coef/impl.py:494\n", + " `- 1.785 cdist_func ccc/coef/impl.py:487\n", + " `- 1.785 cdist_parts_parallel 
ccc/coef/impl.py:193\n", + " |- 1.744 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 1.743 lock.acquire \n", + " `- 0.041 ccc/coef/impl.py:211\n", + " `- 0.040 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " [6 frames hidden] concurrent, threading, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "611ff8e1", + "metadata": { + "papermill": { + "duration": 0.09562, + "end_time": "2021-12-02T04:34:38.420643", + "exception": false, + "start_time": "2021-12-02T04:34:38.325023", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=500`" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "4bcf4b42", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.617318Z", + "iopub.status.busy": "2021-12-02T04:34:38.616836Z", + "iopub.status.idle": "2021-12-02T04:34:38.618469Z", + "shell.execute_reply": "2021-12-02T04:34:38.618817Z" + }, + "papermill": { + "duration": 0.101998, + "end_time": "2021-12-02T04:34:38.618933", + "exception": false, + "start_time": "2021-12-02T04:34:38.516935", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 500" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "0bf2f21e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:38.813589Z", + "iopub.status.busy": "2021-12-02T04:34:38.813117Z", + "iopub.status.idle": "2021-12-02T04:34:38.815386Z", + "shell.execute_reply": "2021-12-02T04:34:38.815020Z" + }, + "papermill": { + "duration": 0.100616, + "end_time": "2021-12-02T04:34:38.815484", + "exception": false, + "start_time": "2021-12-02T04:34:38.714868", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 20, + "id": "cbde4ce6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:39.216775Z", + "iopub.status.busy": "2021-12-02T04:34:39.216014Z", + "iopub.status.idle": "2021-12-02T04:34:43.223179Z", + "shell.execute_reply": "2021-12-02T04:34:43.223640Z" + }, + "papermill": { + "duration": 4.108094, + "end_time": "2021-12-02T04:34:43.223780", + "exception": false, + "start_time": "2021-12-02T04:34:39.115686", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.73 s ± 25.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "1250547e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.427169Z", + "iopub.status.busy": "2021-12-02T04:34:43.426487Z", + "iopub.status.idle": "2021-12-02T04:34:43.474288Z", + "shell.execute_reply": "2021-12-02T04:34:43.473832Z" + }, + "papermill": { + "duration": 0.148908, + "end_time": "2021-12-02T04:34:43.474385", + "exception": false, + "start_time": "2021-12-02T04:34:43.325477", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:33:13 Samples: 102\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 1.743 CPU time: 2.118\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-03331e06-fc00-41d5-a7eb-285b28b98172.json\n", + "\n", + "1.742 ../../../tmp/ipykernel_2841216/2991494264.py:2\n", + "`- 1.742 func ../../../tmp/ipykernel_2841216/1687822962.py:1\n", + " `- 1.742 ccc ccc/coef/impl.py:308\n", + " `- 1.731 compute_coef ccc/coef/impl.py:494\n", + " `- 1.730 cdist_func 
ccc/coef/impl.py:487\n", + " `- 1.730 cdist_parts_parallel ccc/coef/impl.py:193\n", + " |- 1.696 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 1.696 lock.acquire \n", + " `- 0.030 ccc/coef/impl.py:211\n", + " `- 0.030 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " [6 frames hidden] concurrent, threading, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "6853c300", + "metadata": { + "papermill": { + "duration": 0.09707, + "end_time": "2021-12-02T04:34:43.669776", + "exception": false, + "start_time": "2021-12-02T04:34:43.572706", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=1000`" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "f77e8490", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:43.871037Z", + "iopub.status.busy": "2021-12-02T04:34:43.870573Z", + "iopub.status.idle": "2021-12-02T04:34:43.872099Z", + "shell.execute_reply": "2021-12-02T04:34:43.872442Z" + }, + "papermill": { + "duration": 0.103066, + "end_time": "2021-12-02T04:34:43.872558", + "exception": false, + "start_time": "2021-12-02T04:34:43.769492", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 1000" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "c99f544a", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.102128Z", + "iopub.status.busy": "2021-12-02T04:34:44.101540Z", + "iopub.status.idle": "2021-12-02T04:34:44.103376Z", + "shell.execute_reply": "2021-12-02T04:34:44.103817Z" + }, + "papermill": { + "duration": 0.13265, + "end_time": "2021-12-02T04:34:44.103967", + "exception": false, + "start_time": "2021-12-02T04:34:43.971317", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = 
np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "9721b048", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:44.515965Z", + "iopub.status.busy": "2021-12-02T04:34:44.515462Z", + "iopub.status.idle": "2021-12-02T04:34:50.907897Z", + "shell.execute_reply": "2021-12-02T04:34:50.908362Z" + }, + "papermill": { + "duration": 6.496377, + "end_time": "2021-12-02T04:34:50.908503", + "exception": false, + "start_time": "2021-12-02T04:34:44.412126", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.76 s ± 18.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "fd0f4dd6", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.122723Z", + "iopub.status.busy": "2021-12-02T04:34:51.122244Z", + "iopub.status.idle": "2021-12-02T04:34:51.194219Z", + "shell.execute_reply": "2021-12-02T04:34:51.193813Z" + }, + "papermill": { + "duration": 0.179898, + "end_time": "2021-12-02T04:34:51.194319", + "exception": false, + "start_time": "2021-12-02T04:34:51.014421", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:33:42 Samples: 99\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 1.772 CPU time: 2.171\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-03331e06-fc00-41d5-a7eb-285b28b98172.json\n", + "\n", + "1.771 ../../../tmp/ipykernel_2841216/2991494264.py:2\n", + "`- 1.771 func ../../../tmp/ipykernel_2841216/1687822962.py:1\n", + " `- 1.771 ccc ccc/coef/impl.py:308\n", + " `- 1.759 
compute_coef ccc/coef/impl.py:494\n", + " `- 1.759 cdist_func ccc/coef/impl.py:487\n", + " `- 1.759 cdist_parts_parallel ccc/coef/impl.py:193\n", + " |- 1.738 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 1.736 lock.acquire \n", + " `- 0.021 ccc/coef/impl.py:211\n", + " `- 0.021 ThreadPoolExecutor.submit concurrent/futures/thread.py:161\n", + " [6 frames hidden] concurrent, threading, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "f6b9adcf-1da4-4496-b05f-47952fd80d7f", + "metadata": { + "papermill": { + "duration": 0.099861, + "end_time": "2021-12-02T04:34:51.394504", + "exception": false, + "start_time": "2021-12-02T04:34:51.294643", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run with `n_samples` large" + ] + }, + { + "cell_type": "markdown", + "id": "8f2e407c", + "metadata": { + "papermill": { + "duration": 0.098767, + "end_time": "2021-12-02T04:34:51.593072", + "exception": false, + "start_time": "2021-12-02T04:34:51.494305", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=50000`" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "c522396e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:51.793258Z", + "iopub.status.busy": "2021-12-02T04:34:51.792385Z", + "iopub.status.idle": "2021-12-02T04:34:51.794710Z", + "shell.execute_reply": "2021-12-02T04:34:51.795054Z" + }, + "papermill": { + "duration": 0.103749, + "end_time": "2021-12-02T04:34:51.795172", + "exception": false, + "start_time": "2021-12-02T04:34:51.691423", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 50000" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "a5e536cc", + "metadata": { + "execution": { + "iopub.execute_input": 
"2021-12-02T04:34:51.996666Z", + "iopub.status.busy": "2021-12-02T04:34:51.996120Z", + "iopub.status.idle": "2021-12-02T04:34:51.999329Z", + "shell.execute_reply": "2021-12-02T04:34:51.998896Z" + }, + "papermill": { + "duration": 0.104554, + "end_time": "2021-12-02T04:34:51.999423", + "exception": false, + "start_time": "2021-12-02T04:34:51.894869", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "91470f64", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:34:52.408864Z", + "iopub.status.busy": "2021-12-02T04:34:52.408310Z", + "iopub.status.idle": "2021-12-02T04:35:28.383597Z", + "shell.execute_reply": "2021-12-02T04:35:28.384010Z" + }, + "papermill": { + "duration": 36.078113, + "end_time": "2021-12-02T04:35:28.384125", + "exception": false, + "start_time": "2021-12-02T04:34:52.306012", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3.9 s ± 19.5 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "4de4e0b0", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:28.594907Z", + "iopub.status.busy": "2021-12-02T04:35:28.594395Z", + "iopub.status.idle": "2021-12-02T04:35:30.850079Z", + "shell.execute_reply": "2021-12-02T04:35:30.850466Z" + }, + "papermill": { + "duration": 2.36055, + "end_time": "2021-12-02T04:35:30.850581", + "exception": false, + "start_time": "2021-12-02T04:35:28.490031", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:34:47 Samples: 115\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 3.939 CPU time: 5.797\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-03331e06-fc00-41d5-a7eb-285b28b98172.json\n", + "\n", + "3.938 ../../../tmp/ipykernel_2841216/2991494264.py:2\n", + "`- 3.938 func ../../../tmp/ipykernel_2841216/1687822962.py:1\n", + " `- 3.929 ccc ccc/coef/impl.py:308\n", + " |- 3.458 compute_coef ccc/coef/impl.py:494\n", + " | `- 3.457 cdist_func ccc/coef/impl.py:487\n", + " | `- 3.457 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | `- 3.445 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 3.442 lock.acquire \n", + " `- 0.459 result_iterator concurrent/futures/_base.py:602\n", + " [4 frames hidden] concurrent, threading, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "markdown", + "id": "b0c07894", + "metadata": { + "papermill": { + "duration": 0.100749, + "end_time": "2021-12-02T04:35:31.053465", + "exception": false, + "start_time": 
"2021-12-02T04:35:30.952716", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## `n_samples=100000`" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "7fb2ab2e-a6de-412b-9540-d00f8641290e", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.262806Z", + "iopub.status.busy": "2021-12-02T04:35:31.262280Z", + "iopub.status.idle": "2021-12-02T04:35:31.264634Z", + "shell.execute_reply": "2021-12-02T04:35:31.264124Z" + }, + "papermill": { + "duration": 0.110468, + "end_time": "2021-12-02T04:35:31.264738", + "exception": false, + "start_time": "2021-12-02T04:35:31.154270", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "N_SAMPLES = 100000" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "81765e91", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.472530Z", + "iopub.status.busy": "2021-12-02T04:35:31.472083Z", + "iopub.status.idle": "2021-12-02T04:35:31.475862Z", + "shell.execute_reply": "2021-12-02T04:35:31.475424Z" + }, + "papermill": { + "duration": 0.107444, + "end_time": "2021-12-02T04:35:31.476016", + "exception": false, + "start_time": "2021-12-02T04:35:31.368572", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "aca57100", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:35:31.892045Z", + "iopub.status.busy": "2021-12-02T04:35:31.891417Z", + "iopub.status.idle": "2021-12-02T04:36:46.681345Z", + "shell.execute_reply": "2021-12-02T04:36:46.681695Z" + }, + "papermill": { + "duration": 74.896315, + "end_time": "2021-12-02T04:36:46.681806", + "exception": false, + "start_time": "2021-12-02T04:35:31.785491", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"6.43 s ± 33.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit func()\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "b9c25f30", + "metadata": { + "execution": { + "iopub.execute_input": "2021-12-02T04:36:46.889124Z", + "iopub.status.busy": "2021-12-02T04:36:46.888441Z", + "iopub.status.idle": "2021-12-02T04:36:51.544747Z", + "shell.execute_reply": "2021-12-02T04:36:51.544315Z" + }, + "papermill": { + "duration": 4.761869, + "end_time": "2021-12-02T04:36:51.544839", + "exception": false, + "start_time": "2021-12-02T04:36:46.782970", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 11:36:34 Samples: 111\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 6.452 CPU time: 9.550\n", + "/ _/ v4.6.2\n", + "\n", + "Program: /home/haoyu/.conda/envs/ccc/lib/python3.9/site-packages/ipykernel_launcher.py -f /home/haoyu/.local/share/jupyter/runtime/kernel-03331e06-fc00-41d5-a7eb-285b28b98172.json\n", + "\n", + "6.451 ../../../tmp/ipykernel_2841216/2991494264.py:2\n", + "`- 6.451 func ../../../tmp/ipykernel_2841216/1687822962.py:1\n", + " `- 6.437 ccc ccc/coef/impl.py:308\n", + " |- 5.452 compute_coef ccc/coef/impl.py:494\n", + " | `- 5.452 cdist_func ccc/coef/impl.py:487\n", + " | `- 5.452 cdist_parts_parallel ccc/coef/impl.py:193\n", + " | `- 5.444 as_completed concurrent/futures/_base.py:201\n", + " | [4 frames hidden] concurrent, threading, \n", + " | 5.443 lock.acquire \n", + " `- 0.965 result_iterator concurrent/futures/_base.py:602\n", + " [4 frames hidden] concurrent, threading, \n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%pyinstrument\n", + "func()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aa76596-f126-4ba8-8bba-4d31225e0e5d", + "metadata": { + "papermill": { + 
"duration": 0.102208, + "end_time": "2021-12-02T04:36:51.764154", + "exception": false, + "start_time": "2021-12-02T04:36:51.661946", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all,-execution,-papermill,-trusted" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + }, + "papermill": { + "default_parameters": {}, + "duration": 167.355469, + "end_time": "2021-12-02T04:36:52.275357", + "environment_variables": {}, + "exception": null, + "input_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.ipynb", + "output_path": "nbs/others/05_clustermatch_profiling/10_cm_optimized/09-cdist_parts_v04.run.ipynb", + "parameters": {}, + "start_time": "2021-12-02T04:34:04.919888", + "version": "2.3.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/others/10_gpu_ari_profiling/99_scratch/profile0.ipynb b/nbs/others/10_gpu_ari_profiling/99_scratch/profile0.ipynb new file mode 100644 index 00000000..b067b02b --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/99_scratch/profile0.ipynb @@ -0,0 +1,239 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "from pathlib import Path\n", + "from IPython import get_ipython\n", + "\n", + "root_dir = Path(os.path.abspath('')) / 'nbs/others/10_gpu_ari_profiling'\n", + "common_dir = root_dir / 'common'\n", + "sys.path.append(str(common_dir))\n", + "\n", + "this_name = \"profile0\"\n", + "this_path = root_dir / '99_scratch' / this_name" + ] + }, + { + "cell_type": "code", + 
"execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "from utils import generate_bench_filename\n", + "from ccc.coef import ccc" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "# Disable Numba cuda info\n", + "import logging\n", + "\n", + "numba_logger = logging.getLogger('numba')\n", + "numba_logger.setLevel(logging.WARNING)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(0)\n", + "\n", + "N_REPS = 10\n", + "def func(n_reps=N_REPS):\n", + " for i in range(n_reps):\n", + " ccc(x, y, n_jobs=1)\n", + " \n", + "N_SAMPLES = 500000\n", + "x = np.random.rand(N_SAMPLES)\n", + "y = np.random.rand(N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "outfile = generate_bench_filename(this_path, \"n_samples\", N_SAMPLES)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "*** Profile printout saved to text file '/home/haoyu/_database/projs/ccc-gpu/nbs/others/10_gpu_ari_profiling/99_scratch/profile0_n_samples_500000.txt'. 
\n", + " " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 665 function calls in 39.215 seconds\n", + "\n", + " Ordered by: internal time\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 39.215 39.215 39.215 39.215 {method 'enable' of '_lsprof.Profiler' objects}\n", + " 7 0.000 0.000 0.000 0.000 shlex.py:133(read_token)\n", + " 1 0.000 0.000 39.215 39.215 {built-in method builtins.exec}\n", + " 6 0.000 0.000 0.000 0.000 tokenize.py:429(_tokenize)\n", + " 22 0.000 0.000 0.000 0.000 traitlets.py:676(__get__)\n", + " 22 0.000 0.000 0.000 0.000 traitlets.py:629(get)\n", + " 1 0.000 0.000 0.000 0.000 inspect.py:2909(_bind)\n", + " 1 0.000 0.000 0.000 0.000 magic.py:621(parse_options)\n", + " 149 0.000 0.000 0.000 0.000 shlex.py:68(punctuation_chars)\n", + " 1 0.000 0.000 0.000 0.000 inputtransformer2.py:462(make_tokens_by_line)\n", + " 1 0.000 0.000 0.000 0.000 prefilter.py:255(find_handler)\n", + " 1 0.000 0.000 0.000 0.000 getopt.py:56(getopt)\n", + " 1 0.000 0.000 0.000 0.000 ipstruct.py:273(merge)\n", + " 1 0.000 0.000 0.000 0.000 interactiveshell.py:3275(transform_cell)\n", + " 1 0.000 0.000 0.000 0.000 inputtransformer2.py:579(transform_cell)\n", + " 134 0.000 0.000 0.000 0.000 {method 'read' of '_io.StringIO' objects}\n", + " 7 0.000 0.000 0.000 0.000 shlex.py:101(get_token)\n", + " 8 0.000 0.000 0.000 0.000 {method 'match' of 're.Pattern' objects}\n", + " 3 0.000 0.000 0.000 0.000 getopt.py:192(do_shorts)\n", + " 1 0.000 0.000 39.215 39.215 interactiveshell.py:2430(run_cell_magic)\n", + " 1 0.000 0.000 0.000 0.000 inspect.py:2648(args)\n", + " 15 0.000 0.000 0.000 0.000 {built-in method builtins.next}\n", + " 7 0.000 0.000 0.000 0.000 shlex.py:299(__next__)\n", + " 1 0.000 0.000 0.000 0.000 inputtransformer2.py:538(do_one_token_transform)\n", + " 1 0.000 0.000 39.215 39.215 execution.py:195(prun)\n", + " 1 0.000 0.000 0.000 0.000 prefilter.py:271(prefilter_line)\n", + " 1 0.000 0.000 0.000 
0.000 inspect.py:2701(apply_defaults)\n", + " 1 0.000 0.000 39.215 39.215 execution.py:319(_run_with_profiler)\n", + " 1 0.000 0.000 0.000 0.000 splitinput.py:53(split_user_input)\n", + " 3 0.000 0.000 0.000 0.000 getopt.py:207(short_has_arg)\n", + " 1 0.000 0.000 0.000 0.000 _process_common.py:177(arg_split)\n", + " 4 0.000 0.000 0.000 0.000 re.py:289(_compile)\n", + " 1 0.000 0.000 0.000 0.000 decorator.py:199(fix)\n", + " 5 0.000 0.000 0.000 0.000 :1()\n", + " 4 0.000 0.000 0.000 0.000 inputtransformer2.py:108(_find_assign_op)\n", + " 1 0.000 0.000 0.000 0.000 shlex.py:21(__init__)\n", + " 1 0.000 0.000 0.000 0.000 inspect.py:2671(kwargs)\n", + " 1 0.000 0.000 0.000 0.000 getipython.py:17(get_ipython)\n", + " 4 0.000 0.000 0.000 0.000 types.py:171(__get__)\n", + " 44 0.000 0.000 0.000 0.000 typing.py:1375(cast)\n", + " 4 0.000 0.000 0.000 0.000 tokenize.py:98(_compile)\n", + " 1 0.000 0.000 39.215 39.215 :1()\n", + " 1 0.000 0.000 0.000 0.000 prefilter.py:314(prefilter_lines)\n", + " 4 0.000 0.000 0.000 0.000 re.py:250(compile)\n", + " 1 0.000 0.000 0.000 0.000 prefilter.py:458(check)\n", + " 1 0.000 0.000 0.000 0.000 interactiveshell.py:2481(find_cell_magic)\n", + " 2 0.000 0.000 0.000 0.000 ipstruct.py:41(__init__)\n", + " 1 0.000 0.000 39.215 39.215 decorator.py:229(fun)\n", + " 1 0.000 0.000 0.000 0.000 inputtransformer2.py:23(leading_empty_lines)\n", + " 13 0.000 0.000 0.000 0.000 {method 'startswith' of 'str' objects}\n", + " 1 0.000 0.000 0.000 0.000 splitinput.py:110(__init__)\n", + " 21 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", + " 1 0.000 0.000 0.000 0.000 inputtransformer2.py:36(leading_indent)\n", + " 1 0.000 0.000 0.000 0.000 inputtransformer2.py:216(find)\n", + " 1 0.000 0.000 0.000 0.000 inputtransformer2.py:570(do_token_transforms)\n", + " 1 0.000 0.000 0.000 0.000 inputtransformer2.py:368(find)\n", + " 1 0.000 0.000 0.000 0.000 prefilter.py:414(check)\n", + " 1 0.000 0.000 0.000 0.000 prefilter.py:482(check)\n", + " 4 0.000 
0.000 0.000 0.000 ipstruct.py:66(__setitem__)\n", + " 15 0.000 0.000 0.000 0.000 inspect.py:2560(kind)\n", + " 1 0.000 0.000 0.000 0.000 prefilter.py:426(check)\n", + " 1 0.000 0.000 0.000 0.000 prefilter.py:246(prefilter_line_info)\n", + " 2 0.000 0.000 0.000 0.000 inputtransformer2.py:81(__call__)\n", + " 15 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", + " 1 0.000 0.000 0.000 0.000 interactiveshell.py:2344(_find_with_lazy_load)\n", + " 1 0.000 0.000 0.000 0.000 inputtransformer2.py:248(find)\n", + " 1 0.000 0.000 0.000 0.000 interactiveshell.py:2487(find_magic)\n", + " 1 0.000 0.000 0.000 0.000 configurable.py:597(initialized)\n", + " 11 0.000 0.000 0.000 0.000 {method 'get' of 'dict' objects}\n", + " 1 0.000 0.000 0.000 0.000 inspect.py:3040(bind)\n", + " 9 0.000 0.000 0.000 0.000 {built-in method builtins.isinstance}\n", + " 1 0.000 0.000 39.215 39.215 magic.py:187()\n", + " 1 0.000 0.000 0.000 0.000 configurable.py:553(instance)\n", + " 1 0.000 0.000 0.000 0.000 prefilter.py:264(transform_line)\n", + " 1 0.000 0.000 39.215 39.215 cProfile.py:98(runctx)\n", + " 1 0.000 0.000 0.000 0.000 inputtransformer2.py:428(find)\n", + " 2 0.000 0.000 0.000 0.000 builtin_trap.py:39(__enter__)\n", + " 2 0.000 0.000 0.000 0.000 {method 'split' of 'str' objects}\n", + " 5 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x5d1ba0e40380}\n", + " 2 0.000 0.000 0.000 0.000 prefilter.py:234(get_handler_by_name)\n", + " 3 0.000 0.000 0.000 0.000 {method 'splitlines' of 'str' objects}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method fromkeys}\n", + " 6 0.000 0.000 0.000 0.000 inspect.py:2548(name)\n", + " 1 0.000 0.000 0.000 0.000 encoding.py:21(get_stream_enc)\n", + " 1 0.000 0.000 0.000 0.000 inputtransformer2.py:96(cell_magic)\n", + " 4 0.000 0.000 0.000 0.000 enum.py:792(value)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 3 0.000 0.000 0.000 0.000 ipstruct.py:364()\n", + " 1 0.000 0.000 0.000 0.000 
prefilter.py:183(checkers)\n", + " 4 0.000 0.000 0.000 0.000 inspect.py:2865(parameters)\n", + " 4 0.000 0.000 0.000 0.000 {method 'span' of 're.Match' objects}\n", + " 1 0.000 0.000 0.000 0.000 builtin_trap.py:46(__exit__)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 3 0.000 0.000 0.000 0.000 {method 'items' of 'mappingproxy' objects}\n", + " 1 0.000 0.000 0.000 0.000 prefilter.py:549(handle)\n", + " 3 0.000 0.000 0.000 0.000 {built-in method builtins.iter}\n", + " 1 0.000 0.000 0.000 0.000 prefilter.py:440(check)\n", + " 1 0.000 0.000 0.000 0.000 {method 'groups' of 're.Match' objects}\n", + " 3 0.000 0.000 0.000 0.000 {method 'isidentifier' of 'str' objects}\n", + " 1 0.000 0.000 0.000 0.000 py3compat.py:26(cast_unicode)\n", + " 1 0.000 0.000 0.000 0.000 inspect.py:2640(__init__)\n", + " 3 0.000 0.000 0.000 0.000 {method 'join' of 'str' objects}\n", + " 1 0.000 0.000 0.000 0.000 tokenize.py:612(generate_tokens)\n", + " 2 0.000 0.000 0.000 0.000 {method 'isspace' of 'str' objects}\n", + " 3 0.000 0.000 0.000 0.000 {method 'strip' of 'str' objects}\n", + " 1 0.000 0.000 0.000 0.000 {method 'values' of 'mappingproxy' objects}\n", + " 1 0.000 0.000 0.000 0.000 {method 'endswith' of 'str' objects}\n", + " 1 0.000 0.000 0.000 0.000 prefilter.py:147(transformers)\n", + " 1 0.000 0.000 0.000 0.000 {method 'rstrip' of 'str' objects}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method sys._getframe}\n", + " 1 0.000 0.000 0.000 0.000 {method 'lstrip' of 'str' objects}" + ] + } + ], + "source": [ + "# prun_cmd = f\"%%prun -s cumulative -l 25 -T {outfile}\"\n", + "# get_ipython().run_cell_magic('prun', prun_cmd, 'func()')\n", + "# Use cProfile instead of prun magic" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + 
"name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nbs/others/10_gpu_ari_profiling/99_scratch/profile0_n_samples_500000.txt b/nbs/others/10_gpu_ari_profiling/99_scratch/profile0_n_samples_500000.txt new file mode 100644 index 00000000..aa28c801 --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/99_scratch/profile0_n_samples_500000.txt @@ -0,0 +1,31 @@ + 8334 function calls in 39.213 seconds + + Ordered by: cumulative time + List reduced from 113 to 25 due to restriction <25> + + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.000 0.000 39.213 39.213 {built-in method builtins.exec} + 1 0.000 0.000 39.213 39.213 :1() + 1 0.007 0.007 39.213 39.213 1858089180.py:4(func) + 10 0.047 0.005 39.206 3.921 impl.py:307(ccc) + 200 0.002 0.000 39.130 0.196 threading.py:280(wait) + 790 39.128 0.050 39.128 0.050 {method 'acquire' of '_thread.lock' objects} + 10 0.001 0.000 28.402 2.840 impl.py:492(compute_coef) + 10 0.000 0.000 28.401 2.840 impl.py:485(cdist_func) + 10 0.002 0.000 28.400 2.840 impl.py:192(cdist_parts_parallel) + 100 0.001 0.000 28.390 0.284 threading.py:563(wait) + 100 0.002 0.000 28.388 0.284 _base.py:201(as_completed) + 100 0.001 0.000 10.742 0.107 _base.py:418(result) + 20 0.000 0.000 10.742 0.537 _base.py:602(result_iterator) + 10 0.007 0.001 0.009 0.001 impl.py:210() + 100 0.000 0.000 0.008 0.000 thread.py:161(submit) + 100 0.000 0.000 0.007 0.000 thread.py:180(_adjust_thread_count) + 10 0.000 0.000 0.007 0.001 _base.py:573(map) + 10 0.000 0.000 0.007 0.001 _base.py:598() + 10 0.000 0.000 0.006 0.001 threading.py:880(start) + 50 0.005 0.000 0.005 0.000 {built-in method numpy.zeros} + 190 0.001 0.000 0.001 0.000 _base.py:179(_yield_finished_futures) + 20 0.001 0.000 0.001 0.000 impl.py:242(get_chunks) + 10 0.000 0.000 0.001 0.000 _base.py:636(__exit__) + 10 0.000 0.000 0.001 0.000 thread.py:216(shutdown) + 100 0.000 
0.000 0.001 0.000 threading.py:411(acquire) \ No newline at end of file diff --git a/nbs/others/10_gpu_ari_profiling/README.md b/nbs/others/10_gpu_ari_profiling/README.md new file mode 100644 index 00000000..ad92427b --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/README.md @@ -0,0 +1,23 @@ +# CCC-GPU profiling + +This folder contains profiling results (with cProfile) of different +optimizations of the clustermatch code. A brief description of each subfolder is +below. + +- `00_cpu_version_ref`: + - Contains benchmarks of the CPU version of CCC (nbs/others/05_clustermatch_profiling/10_cm_optimized): + 1. Numba-enabled, multi-threaded + 2. Numba-disabled, multi-threaded + - Newly added: + 3. Numba-enabled, single-threaded + 4. Numba-disabled, single-threaded + + +* `01_ari_cuda_v0`: + - Contains benchmarks of the CUDA version of CCC, functions rewritten in CUDA: + - `ari` + +The tests were run on a System76 Thelio machine with the following specifications: +- 5.3 GHz Threadripper 7960X (24 Cores - 48 Threads) +- 256 GB ECC DDR5 4800 MHz (4x64) +- 24 GB NVIDIA GeForce RTX 4090 \ No newline at end of file diff --git a/nbs/others/10_gpu_ari_profiling/common/__init__.py b/nbs/others/10_gpu_ari_profiling/common/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nbs/others/10_gpu_ari_profiling/common/utils.py b/nbs/others/10_gpu_ari_profiling/common/utils.py new file mode 100644 index 00000000..5702c3e1 --- /dev/null +++ b/nbs/others/10_gpu_ari_profiling/common/utils.py @@ -0,0 +1,2 @@ +def generate_bench_filename(filename: str, var_name: str, var_value: int): + return f"{filename}_{var_name}_{var_value}.txt" diff --git a/old_CMakeLists.txt b/old_CMakeLists.txt new file mode 100644 index 00000000..492fe882 --- /dev/null +++ b/old_CMakeLists.txt @@ -0,0 +1,26 @@ +# Require CMake 3.15+ (matching scikit-build-core) Use new versions of all +# policies up to CMake 3.27 +cmake_minimum_required(VERSION 3.15...3.27) + +# Scikit-build-core sets these values 
for you, or you can just hard-code the +# name and version. +project( + ${SKBUILD_PROJECT_NAME} + VERSION ${SKBUILD_PROJECT_VERSION} + LANGUAGES CXX) + +# Find the module development requirements (requires FindPython from 3.17 or +# scikit-build-core's built-in backport) +find_package(Python REQUIRED COMPONENTS Interpreter Development.Module) +find_package(pybind11 CONFIG REQUIRED) + +# Add a library using FindPython's tooling (pybind11 also provides a helper like +# this) +python_add_library(_core MODULE libs/cuda_ext/example_binder.cpp libs/cuda_ext/example.cpp WITH_SOABI) +target_link_libraries(_core PRIVATE pybind11::headers) + +# This is passing in the version as a define just as an example +target_compile_definitions(_core PRIVATE VERSION_INFO=${PROJECT_VERSION}) + +# The install directory is the output (wheel) directory +install(TARGETS _core DESTINATION scikit_build_example) diff --git a/setup.cfg b/old_setup.cfg similarity index 100% rename from setup.cfg rename to old_setup.cfg diff --git a/setup.py b/old_setup.py similarity index 100% rename from setup.py rename to old_setup.py diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..04b75322 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,56 @@ +[build-system] +requires = [ + "scikit-build-core>=0.7.0", + "pybind11>=2.11.0", + "cmake>=3.15", + "ninja", + "setuptools>=42", + "wheel" +] +build-backend = "scikit_build_core.build" + +[project] +name = "cccgpu" +version = "0.2.0" +description = "The Clustermatch Correlation Coefficient (CCC) with GPU acceleration" +readme = "README.md" +requires-python = ">=3.9" +license = {text = "BSD-2-Clause Plus Patent"} +authors = [ + {name = "Milton Pividori", email = "miltondp@gmail.com"}, +] +dependencies = [ + "numpy>=1.21.0", + "scipy", + "numba", +] +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Development Status :: 5 - Production/Stable", + 
"Environment :: Console", +] + +[project.optional-dependencies] +test = [ + "pytest", + "pytest-cov", +] + +[tool.scikit-build] +# Configure scikit-build-core +cmake.minimum-version = "3.15" +cmake.args = [ + "-DCMAKE_CUDA_ARCHITECTURES=75", # Adjust for your target CUDA architecture +] +cmake.verbose = true +wheel.packages = ["libs/ccc"] # Directory containing your Python packages +wheel.exclude = ["*.cpp", "*.h"] # Exclude C++ headers from wheel + +[tool.pytest.ini_options] +minversion = "6.0" +addopts = "-ra -q" +testpaths = [ + "tests", +] diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh new file mode 100644 index 00000000..26180951 --- /dev/null +++ b/scripts/run_tests.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# Run this script from the root of the repository: +# bash ./scripts/run_tests.sh + +# Setup environment +source ./scripts/setup_dev.sh + +# Install cccgpu with the cuda extension module +echo -e "\033[34mInstalling cccgpu with the cuda extension module...\033[0m" +pip install . + +# Run pytest +echo -e "\033[34mRunning Python tests...\033[0m" +pytest -rs --color=yes ./tests/ --ignore ./tests/gpu/excluded + +# Run C++ tests +echo -e "\033[34mBuilding C++ tests...\033[0m" +# Clean up build directory +rm -rf build +# TODO: fix `pip install .` for not generating the build directory +# Build the CUDA extension module +cmake -S . -B build +cmake --build build + +echo -e "\033[34mRunning C++ tests...\033[0m" +for test in ./build/test_*; do + echo "Running $test..." 
+ ./$test +done + +# Uninstall cccgpu +echo -e "\033[34mUninstalling cccgpu...\033[0m" +pip uninstall cccgpu -y diff --git a/scripts/setup_dev.sh b/scripts/setup_dev.sh new file mode 100755 index 00000000..69f8065d --- /dev/null +++ b/scripts/setup_dev.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# Used to setup the development environment for CCC +# Can be loaded by PyCharm on startup + +# Find the conda path +CONDA_PATH=$(conda info | grep -i 'base environment' | awk -F': ' '{print $2}' | awk '{print $1}') +source ${CONDA_PATH}/etc/profile.d/conda.sh + +# Activate the conda environment +conda activate ccc-gpu + +# Set the PYTHONPATH +export PYTHONPATH=`readlink -f ./libs/`:$PYTHONPATH + +# Set the CUDA_HOME and LD_LIBRARY_PATH +export LD_LIBRARY_PATH="~/anaconda3/envs/ccc-cuda/lib/:$LD_LIBRARY_PATH" +export LIBRARY_PATH="~/anaconda3/envs/ccc-cuda/lib/:$LD_LIBRARY_PATH" +export CUDA_HOME="~/anaconda3/envs/ccc-cuda" diff --git a/tests/cuda_ext/test_ari.cpp b/tests/cuda_ext/test_ari.cpp new file mode 100644 index 00000000..97ef4f63 --- /dev/null +++ b/tests/cuda_ext/test_ari.cpp @@ -0,0 +1,116 @@ +#include +#include +#include +#include "../../libs/ccc_cuda_ext/metrics.cuh" + +namespace py = pybind11; + +// Helper function to generate pairwise combinations (implement this according to your needs) + +std::vector, std::vector>> generate_pairwise_combinations(const std::vector>> &arr) +{ + std::vector, std::vector>> pairs; + size_t num_slices = arr.size(); // Number of 2D arrays in the 3D vector + for (size_t i = 0; i < num_slices; ++i) + { + for (size_t j = i + 1; j < num_slices; ++j) + { // Only consider pairs in different slices + for (const auto &row_i : arr[i]) + { // Each row in slice i + for (const auto &row_j : arr[j]) + { // Pairs with each row in slice j + pairs.emplace_back(row_i, row_j); + } + } + } + } + return pairs; +} + + +using Mat3 = std::vector>>; +using TestParamType = std::tuple; + +// Define a parameterized test fixture +class CudaAriTest : public 
::testing::TestWithParam {}; + +TEST_P(CudaAriTest, CheckSingleResult) +{ + Mat3 parts; + float expected_result; + std::tie(parts, expected_result) = GetParam(); + + // Get dimensions + int n_features = parts.size(); + int n_parts = parts[0].size(); + int n_objs = parts[0][0].size(); + int n_feature_comp = n_features * (n_features - 1) / 2; + int n_aris = n_feature_comp * n_parts * n_parts; + std::cout << "n_features: " << n_features << ", n_parts: " << n_parts << ", n_objs: " << n_objs << std::endl + << "n_feature_comps: " << n_feature_comp << ", n_aris: " << n_aris << std::endl; + + // Allocate host memory for C-style array + int *h_parts = new int[n_features * n_parts * n_objs]; + + // Copy data from vector to C-style array + for (int i = 0; i < n_features; ++i) + { + for (int j = 0; j < n_parts; ++j) + { + for (int k = 0; k < n_objs; ++k) + { + h_parts[i * (n_parts * n_objs) + j * n_objs + k] = parts[i][j][k]; + } + } + } + + auto h_out = ari_core(h_parts, n_features, n_parts, n_objs)[0]; + + // Check if the result are close + EXPECT_NEAR(h_out, expected_result, 1e-2); +} + +// Instantiate the test suite with parameter values +// These tests are taken from sklearn.metrics.adjusted_rand_score: +// https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html +INSTANTIATE_TEST_SUITE_P( + CudaAriTestInstances, + CudaAriTest, + ::testing::Values( + TestParamType( + Mat3{ + {{0, 0, 1, 2}}, + {{0, 0, 1, 1}}, + }, + 0.57f + ), + TestParamType( + Mat3{ + {{0, 0, 1, 1}}, + {{0, 1, 0, 1}}, + }, + -0.5f + ), + TestParamType( + Mat3{ + {{0, 0, 1, 1}}, + {{0, 0, 1, 1}}, + }, + 1.0f + ), + TestParamType( + Mat3{ + {{0, 0, 1, 1}}, + {{1, 1, 0, 0}}, + }, + 1.0f + ), + TestParamType( + Mat3{ + {{0, 0, 0, 0}}, + {{0, 1, 2, 3}}, + }, + 0.0f + ) + ) +); diff --git a/tests/cuda_ext/test_ari_py.cpp b/tests/cuda_ext/test_ari_py.cpp new file mode 100644 index 00000000..346739a1 --- /dev/null +++ b/tests/cuda_ext/test_ari_py.cpp @@ -0,0 +1,41 @@ +#include 
+#include +#include // everything needed for embedding +#include +#include + +namespace py = pybind11; + +int main() { + py::scoped_interpreter guard{}; // start the interpreter and keep it alive + + try { + // Define vectors in C++ + std::vector part0 = {2, 3, 6, 1, 0, 5, 4, 3, 6, 2}; + std::vector part1 = {0, 6, 2, 5, 1, 3, 4, 6, 0, 2}; + + // Import required Python modules + py::module_ np = py::module_::import("numpy"); + py::module_ ccc_module = py::module_::import("ccc.sklearn.metrics"); + + // Convert C++ vectors to numpy arrays + py::array_t np_part0 = py::cast(part0); + py::array_t np_part1 = py::cast(part1); + + // Call the ccc function + py::object result = ccc_module.attr("adjusted_rand_index")(np_part0, np_part1); + + // Convert result to C++ double + const auto correlation = result.cast(); + + std::cout << "ARI: " << correlation << std::endl; + } + catch (const py::error_already_set& e) { + std::cerr << "Python error: " << e.what() << std::endl; + } + catch (const std::exception& e) { + std::cerr << "Error: " << e.what() << std::endl; + } + + return 0; +} \ No newline at end of file diff --git a/tests/cuda_ext/test_ari_random.cpp b/tests/cuda_ext/test_ari_random.cpp new file mode 100644 index 00000000..ba74546e --- /dev/null +++ b/tests/cuda_ext/test_ari_random.cpp @@ -0,0 +1,343 @@ +/** + * @file test_ari_random.cpp + * @brief Test suite for Adjusted Rand Index (ARI) computation using CUDA + * + * This test suite validates the CUDA implementation of ARI computation against + * a reference Python implementation. It tests various input sizes and configurations + * using parameterized tests. + * + * The test compares results from: + * 1. CUDA implementation (ari_core) + * 2. 
Python reference implementation (ccc.sklearn.metrics.adjusted_rand_index) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../../libs/ccc_cuda_ext/metrics.cuh" + +namespace py = pybind11; + +/** + * @brief Helper class for generating and manipulating test data + * + * This class provides static utility functions for: + * - Generating random partition data + * - Reshaping arrays between different dimensions + * - Generating pairwise combinations of partitions + */ +class TestDataGenerator { +public: + /** + * @brief Generates random partition assignments + * + * @param n_features Number of features + * @param n_parts Number of partitions per feature + * @param n_objs Number of objects + * @param k Number of possible cluster assignments + * @param seed Random seed for reproducibility + * @return std::vector Flattened array of random partition assignments + */ + static std::vector generate_random_partitions(int n_features, int n_parts, + int n_objs, int k, unsigned seed = 42) { + std::vector parts(n_features * n_parts * n_objs); + std::mt19937 gen(seed); + std::uniform_int_distribution<> dis(0, k - 1); + + for (auto& val : parts) { + val = dis(gen); + } + return parts; + } + + /** + * @brief Reshapes a flat array into a 3D structure + * + * @param flat_array Input array + * @param n_features Number of features + * @param n_parts Number of partitions per feature + * @param n_objs Number of objects + * @return 3D vector representing [features][parts][objects] + */ + static std::vector>> reshape_to_3d( + const std::vector& flat_array, + int n_features, int n_parts, int n_objs) { + + std::vector>> parts_3d( + n_features, std::vector>( + n_parts, std::vector(n_objs))); + + for (int f = 0; f < n_features; ++f) { + for (int p = 0; p < n_parts; ++p) { + for (int o = 0; o < n_objs; ++o) { + parts_3d[f][p][o] = flat_array[f * (n_parts * n_objs) + p * n_objs + o]; + } + } + } + return parts_3d; + } + + /** + * @brief 
Generates all pairwise combinations of partitions from different features + * + * Given a 3D array of shape [n_features, n_parts, n_objs], this function generates + * all possible pairs of partitions between different features. For example, if we have + * features f0, f1, f2, it will generate pairs between: + * - f0 and f1 partitions + * - f0 and f2 partitions + * - f1 and f2 partitions + * + * @param arr A 3D vector where: + * - First dimension (arr.size()) represents different features + * - Second dimension (arr[i].size()) represents different partitions for each feature + * - Third dimension (arr[i][j].size()) represents objects in each partition + * + * @return std::vector, std::vector>> + * A vector of partition pairs where each pair contains: + * - first: vector of partition labels from one feature + * - second: vector of partition labels from another feature + * + * @example + * // For a 3D array with shape [2, 2, 4]: + * arr = { + * {{0,1,2,3}, {4,5,6,7}}, // feature 0's partitions + * {{8,9,10,11}, {12,13,14,15}} // feature 1's partitions + * } + * // Will generate pairs: + * // ({0,1,2,3}, {8,9,10,11}) + * // ({0,1,2,3}, {12,13,14,15}) + * // ({4,5,6,7}, {8,9,10,11}) + * // ({4,5,6,7}, {12,13,14,15}) + */ + static std::vector, std::vector>> + generate_pairwise_combinations(const std::vector>>& arr) { + std::vector, std::vector>> pairs; + + // Generate indices for features + auto indices = std::views::iota(0u, arr.size()); + + // For each feature index + for (auto i : indices) { + // For each subsequent feature index (avoiding duplicate pairs) + for (auto j : std::views::iota(i + 1u, arr.size())) { + // For each partition in feature i + for (const auto& row_i : arr[i]) { + // For each partition in feature j + for (const auto& row_j : arr[j]) { + // Add the pair of partitions to our result + pairs.emplace_back(row_i, row_j); + } + } + } + } + return pairs; + } +}; + +/** + * @brief Parameters for ARI test cases + * + * Encapsulates the parameters that 
define a test case for ARI computation: + * - Number of features to compare + * - Number of partitions per feature + * - Number of objects in each partition + * - Number of possible cluster assignments + * - Tolerance for floating-point comparisons + */ +struct AriTestParams { + int n_features; + int n_parts; + int n_objs; + int k; + float tolerance; // Added tolerance as a parameter + + AriTestParams(int features, int parts, int objects, int clusters, float tol = 1e-5) + : n_features(features) + , n_parts(parts) + , n_objs(objects) + , k(clusters) + , tolerance(tol) {} + + // Add string representation for better test output + friend std::ostream& operator<<(std::ostream& os, const AriTestParams& params) { + return os << "Features=" << params.n_features + << ", Parts=" << params.n_parts + << ", Objects=" << params.n_objs + << ", Clusters=" << params.k; + } +}; + +/** + * @brief Test fixture for parameterized ARI tests + * + * This fixture provides: + * 1. Python environment setup and teardown + * 2. Reference implementation through Python + * 3. 
Result validation utilities + * + * The fixture ensures that: + * - Python interpreter is initialized once for all tests + * - Required Python modules are imported + * - Resources are properly cleaned up + */ +class PairwiseAriTest : public ::testing::TestWithParam { +protected: + /** + * @brief Set up Python environment before any tests run + * + * Initializes: + * - Python interpreter + * - NumPy module + * - CCC metrics module + */ + static void SetUpTestSuite() { + if (!guard) { + guard = std::make_unique(); + try { + np = std::make_unique(py::module_::import("numpy")); + ccc_module = std::make_unique(py::module_::import("ccc.sklearn.metrics")); + } catch (const std::exception& e) { + FAIL() << "Failed to initialize Python modules: " << e.what(); + } + } + } + + /** + * @brief Clean up Python environment after all tests complete + */ + static void TearDownTestSuite() { + ccc_module.reset(); + np.reset(); + guard.reset(); + } + + /** + * @brief Compute ARI using Python reference implementation + * + * @param labels1 First partition + * @param labels2 Second partition + * @return float ARI score + * @throws Logs failure if Python computation fails + */ + float compute_ari(const std::vector& labels1, const std::vector& labels2) { + try { + py::array_t np_part0 = py::cast(labels1); + py::array_t np_part1 = py::cast(labels2); + + py::object result = ccc_module->attr("adjusted_rand_index")(np_part0, np_part1); + return result.cast(); + } catch (const py::error_already_set& e) { + ADD_FAILURE() << "Python error: " << e.what(); + return 0.0f; + } catch (const std::exception& e) { + ADD_FAILURE() << "C++ error: " << e.what(); + return 0.0f; + } + } + + /** + * @brief Validate CUDA results against reference implementation + * + * @param actual Results from CUDA implementation + * @param expected Results from reference implementation + * @param tolerance Maximum allowed difference + */ + void validate_results(const std::vector& actual, + const std::vector& expected, + 
float tolerance) { + ASSERT_EQ(actual.size(), expected.size()) ; + // << "Mismatch in result sizes"; + + for (size_t i = 0; i < actual.size(); ++i) { + EXPECT_NEAR(actual[i], expected[i], tolerance); + // << "Mismatch at index " << i; + } + } + +private: + static std::unique_ptr guard; + static std::unique_ptr np; + static std::unique_ptr ccc_module; +}; + +// Static member definitions +std::unique_ptr PairwiseAriTest::guard; +std::unique_ptr PairwiseAriTest::np; +std::unique_ptr PairwiseAriTest::ccc_module; + +/** + * @brief Test case for random partition ARI computation + * + * This test: + * 1. Generates random partition data + * 2. Computes ARI using CUDA implementation + * 3. Computes reference results using Python + * 4. Validates CUDA results against reference + * + * @param GetParam() Test parameters defining input size and configuration + */ +TEST_P(PairwiseAriTest, RandomPartitions) { + const auto params = GetParam(); + + // Generate test data + auto parts = TestDataGenerator::generate_random_partitions( + params.n_features, params.n_parts, params.n_objs, params.k); + + // Get CUDA results + auto res_aris = ari_core(parts.data(), + params.n_features, params.n_parts, params.n_objs); + + // Generate reference results + auto parts_3d = TestDataGenerator::reshape_to_3d( + parts, params.n_features, params.n_parts, params.n_objs); + auto pairs = TestDataGenerator::generate_pairwise_combinations(parts_3d); + + std::vector ref_aris; + ref_aris.reserve(pairs.size()); + + for (const auto& [part0, part1] : pairs) { + ref_aris.push_back(compute_ari(part0, part1)); + } + + // Validate results + validate_results(res_aris, ref_aris, params.tolerance); +} + +/** + * @brief Test suite instantiation with various parameter sets + * + * Current test cases: + * - Small input (2 features, 2 parts, 100 objects) + * - Medium input (5 features, 10 parts, 200 objects) + * + * Known issues: + * - Wrong results with large inputs (100 features) + * - Memory access issues with very 
large inputs + * - GPU memory limitations with extreme inputs + */ +INSTANTIATE_TEST_SUITE_P( + PairwiseAriTestInstances, + PairwiseAriTest, + ::testing::Values( + AriTestParams(2, 2, 100, 10), + AriTestParams(5, 10, 200, 10), + // AriTestParams(2, 1, 1000, 10), // FIXME: wrong results, maybe test is not correct + AriTestParams(100, 20, 100, 10) + // Document known issues + // AriTestParams(100, 20, 1000, 10), // FIXME: wrong results, maybe test is not correct + // AriTestParams(200, 20, 300, 10), // slow to run as a unit test + // AriTestParams(1000, 10, 300, 10) // slow to run as a unit test + ), + // Add test name generator for better output + [](const testing::TestParamInfo& info) { + return std::string("Features") + std::to_string(info.param.n_features) + + "_Parts" + std::to_string(info.param.n_parts) + + "_Objects" + std::to_string(info.param.n_objs); + } +); diff --git a/tests/gpu/excluded/test_coef_subroutines.py b/tests/gpu/excluded/test_coef_subroutines.py new file mode 100644 index 00000000..82bfe8bb --- /dev/null +++ b/tests/gpu/excluded/test_coef_subroutines.py @@ -0,0 +1,372 @@ +# This test file is used to verify the correctness of the GPU version of subroutine functions +# Now we fall back to the original CPU implementation of ccc, so this test file is not used for now + +# import pytest +# +# import numpy as np +# import cupy as cp +# from numpy.testing import assert_array_equal, assert_allclose +# +# from ccc.coef.impl_gpu import ( +# get_perc_from_k, +# get_range_n_percentages, +# convert_n_clusters, +# get_range_n_clusters, +# get_parts, +# ) +# +# +# def test_get_perc_from_k_with_k_less_than_two(): +# empty_array = np.empty(0) +# assert_array_equal(get_perc_from_k(1), empty_array) +# assert_array_equal(get_perc_from_k(0), empty_array) +# assert_array_equal(get_perc_from_k(-1), empty_array) +# +# +# @pytest.mark.parametrize("k, expected", [ +# (2, [0.5]), +# (3, [0.333, 0.667]), +# (4, [0.25, 0.50, 0.75]) +# ]) +# def test_get_perc_from_k(k, 
expected): +# assert_allclose(np.ndarray.round(get_perc_from_k(k), 3), expected) +# +# +# @pytest.mark.parametrize( +# "ks, expected", +# [ +# ( +# np.array([], dtype=np.int8), +# np.empty((0, 0), dtype=np.float32) +# ), +# ( +# np.array([2, 3, 4], dtype=np.int8), +# np.array([ +# [0.5, np.nan, np.nan], +# [0.33333334, 0.6666667, np.nan], +# [0.25, 0.5, 0.75] +# ], dtype=np.float32) +# ), +# ( +# np.array([2], dtype=np.int8), +# np.array([[0.5]], dtype=np.float32) +# ), +# ( +# np.array([10], dtype=np.int8), +# np.array([[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]], dtype=np.float32) +# ), +# ( +# np.array([2, 4, 6, 8], dtype=np.int8), +# np.array([ +# [0.5, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], +# [0.25, 0.5, 0.75, np.nan, np.nan, np.nan, np.nan], +# [0.16666667, 0.33333334, 0.5, 0.6666667, 0.8333333, np.nan, np.nan], +# [0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875] +# ], dtype=np.float32) +# ), +# ( +# np.array([2, 3, 4], dtype=np.int8), +# np.array([ +# [0.5, np.nan, np.nan], +# [0.33333334, 0.6666667, np.nan], +# [0.25, 0.5, 0.75], +# ], dtype=np.float32) +# ), +# ] +# ) +# def test_get_range_n_percs(ks, expected): +# result = get_range_n_percentages(ks) +# np.testing.assert_array_almost_equal(result, expected) +# +# +# @pytest.mark.parametrize( +# "ks, expected_frac, expected_perc", +# [ +# ( +# np.array([], dtype=np.int8), +# np.empty((0, 0), dtype=np.float32), +# np.empty((0, 0), dtype=np.float32) +# ), +# ( +# np.array([2, 3, 4], dtype=np.int8), +# np.array([ +# [0.5, np.nan, np.nan], +# [0.33333334, 0.6666667, np.nan], +# [0.25, 0.5, 0.75] +# ], dtype=np.float32), +# np.array([ +# [50, np.nan, np.nan], +# [33, 67, np.nan], +# [25, 50, 75] +# ], dtype=np.float32) +# ), +# ( +# np.array([2], dtype=np.int8), +# np.array([[0.5]], dtype=np.float32), +# np.array([[50]], dtype=np.float32) +# ), +# ( +# np.array([10], dtype=np.int8), +# np.array([[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]], dtype=np.float32), +# np.array([[10, 20, 30, 40, 50, 60, 
70, 80, 90]], dtype=np.float32) +# ), +# ( +# np.array([2, 4, 6, 8], dtype=np.int8), +# np.array([ +# [0.5, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], +# [0.25, 0.5, 0.75, np.nan, np.nan, np.nan, np.nan], +# [0.16666667, 0.33333334, 0.5, 0.6666667, 0.8333333, np.nan, np.nan], +# [0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875] +# ], dtype=np.float32), +# np.array([ +# [50, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], +# [25, 50, 75, np.nan, np.nan, np.nan, np.nan], +# [17, 33, 50, 67, 83, np.nan, np.nan], +# [12, 25, 38, 50, 62, 75, 88] +# ], dtype=np.float32) +# ), +# ] +# ) +# def test_get_range_n_percs_as_percentage(ks, expected_frac, expected_perc): +# # Test fractional percentiles (original behavior) +# result_frac = get_range_n_percentages(ks, as_percentage=False) +# np.testing.assert_array_almost_equal(result_frac, expected_frac) +# +# # Test percentage numbers +# result_perc = get_range_n_percentages(ks, as_percentage=True) +# np.testing.assert_array_almost_equal(result_perc, expected_perc) +# +# +# @pytest.mark.parametrize( +# "input_value, expected_output", +# [ +# (None, []), +# (2, [2]), +# (5, [2, 3, 4, 5]), +# ([1, 3, 5], [1, 3, 5]), +# ([], []), +# ((7, 8, 9), [7, 8, 9]), +# ] +# ) +# def test_convert_n_clusters(input_value, expected_output): +# assert convert_n_clusters(input_value) == expected_output +# +# +# def test_get_range_n_clusters_without_internal_n_clusters(): +# # 100 features +# range_n_clusters = get_range_n_clusters(100) +# assert range_n_clusters is not None +# np.testing.assert_array_equal( +# range_n_clusters, np.array([2, 3, 4, 5, 6, 7, 8, 9, 10]) +# ) +# +# # 25 features +# range_n_clusters = get_range_n_clusters(25) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([2, 3, 4, 5])) +# +# +# def test_get_range_n_clusters_with_internal_n_clusters_is_list(): +# # 100 features +# range_n_clusters = get_range_n_clusters( +# 100, +# internal_n_clusters=[2], +# ) +# assert 
range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([2])) +# +# # 25 features +# range_n_clusters = get_range_n_clusters( +# 25, +# internal_n_clusters=[2], +# ) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([2])) +# +# # 25 features +# range_n_clusters = get_range_n_clusters(25, internal_n_clusters=[2, 3, 4]) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([2, 3, 4])) +# +# +# def test_get_range_n_clusters_with_internal_n_clusters_none(): +# # 100 features +# range_n_clusters = get_range_n_clusters(100, internal_n_clusters=None) +# assert range_n_clusters is not None +# np.testing.assert_array_equal( +# range_n_clusters, np.array([2, 3, 4, 5, 6, 7, 8, 9, 10]) +# ) +# +# # 25 features +# range_n_clusters = get_range_n_clusters(25, internal_n_clusters=None) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([2, 3, 4, 5])) +# +# +# def test_get_range_n_clusters_with_internal_n_clusters_has_single_int(): +# # 100 features +# range_n_clusters = get_range_n_clusters(100, internal_n_clusters=[2]) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([2])) +# +# # 25 features +# range_n_clusters = get_range_n_clusters(25, internal_n_clusters=[3]) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([3])) +# +# # 5 features +# range_n_clusters = get_range_n_clusters(5, internal_n_clusters=[4]) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([4])) +# +# # 25 features but invalid number of clusters +# range_n_clusters = get_range_n_clusters(25, internal_n_clusters=[1]) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([])) +# +# # 25 features but invalid number of clusters +# range_n_clusters = 
get_range_n_clusters(25, internal_n_clusters=[25]) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([])) +# +# +# def test_get_range_n_clusters_with_internal_n_clusters_are_less_than_two(): +# # 100 features +# range_n_clusters = get_range_n_clusters(100, internal_n_clusters=[1, 2, 3, 4]) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([2, 3, 4])) +# +# range_n_clusters = get_range_n_clusters(100, internal_n_clusters=[1, 2, 1, 4]) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([2, 4])) +# +# range_n_clusters = get_range_n_clusters(100, internal_n_clusters=[1, 2, 3, 1]) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([2, 3])) +# +# range_n_clusters = get_range_n_clusters(100, internal_n_clusters=[1, 2, 0, 4]) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([2, 4])) +# +# range_n_clusters = get_range_n_clusters(100, internal_n_clusters=[1, 2, 1, -4, 6]) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([2, 6])) +# +# +# def test_get_range_n_clusters_with_internal_n_clusters_are_repeated(): +# # 100 features +# range_n_clusters = get_range_n_clusters(100, internal_n_clusters=[2, 3, 2, 4]) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([2, 3, 4])) +# +# range_n_clusters = get_range_n_clusters(100, internal_n_clusters=[2, 2, 2]) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([2])) +# +# +# def test_get_range_n_clusters_with_very_few_features(): +# # 3 features +# range_n_clusters = get_range_n_clusters(3) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([2])) +# +# # 2 features +# range_n_clusters = 
get_range_n_clusters(2) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([])) +# +# # 1 features +# range_n_clusters = get_range_n_clusters(1) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([])) +# +# # 0 features +# range_n_clusters = get_range_n_clusters(0) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([])) +# +# +# def test_get_range_n_clusters_with_larger_k_than_features(): +# # 10 features +# range_n_clusters = get_range_n_clusters(10, internal_n_clusters=[10]) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([])) +# +# # 10 features +# range_n_clusters = get_range_n_clusters(10, internal_n_clusters=[11]) +# assert range_n_clusters is not None +# np.testing.assert_array_equal(range_n_clusters, np.array([])) +# +# +# def test_get_range_n_clusters_with_default_max_k(): +# range_n_clusters = get_range_n_clusters(200) +# assert range_n_clusters is not None +# np.testing.assert_array_equal( +# range_n_clusters, np.array([2, 3, 4, 5, 6, 7, 8, 9, 10]) +# ) +# +# # get_parts +# def test_get_parts_simple(): +# np.random.seed(0) +# +# # Test with 2 clusters +# features0 = np.random.rand(100) +# parts = get_parts(features0, np.array([2], dtype=np.uint8)).get() +# assert parts is not None +# assert len(parts) == 1, "should have only one feature" +# assert len(parts[0]) == 1, "should have only one partition" +# assert len(np.unique(parts[0])) == 2, "should have 2 cluster indexes" +# +# # Test with [2, 3] clusters +# parts = get_parts(features0, np.array([2, 3], dtype=np.uint8)).get() +# assert parts is not None +# assert len(parts) == 1 +# assert len(parts[0]) == 2, "feature should have 2 clusters" +# assert len(np.unique(parts[0][0])) == 2 +# assert len(np.unique(parts[0][1])) == 3 +# +# +# def test_get_parts_with_singletons(): +# np.random.seed(0) +# +# feature0 = 
np.array([1.3] * 10) +# +# # run +# parts = get_parts(feature0, np.array([2], dtype=np.uint8)).get() +# assert parts is not None +# assert len(parts) == 1 +# assert len(parts[0]) == 1 +# # all the elements (2D) should be -2 +# np.testing.assert_array_equal(np.unique(parts[0]), np.array([-2])) +# +# parts = get_parts(feature0, np.array([2, 3], dtype=np.uint8)).get() +# assert parts is not None +# assert len(parts) == 1 +# assert len(parts[0]) == 2, "feature should have 2 clusters" +# np.testing.assert_array_equal(np.unique(parts[0][0]), np.array([-2])) +# np.testing.assert_array_equal(np.unique(parts[0][1]), np.array([-2])) +# +# +# def test_get_parts_with_categorical_feature(): +# mempool = cp.get_default_memory_pool() +# mempool.free_all_blocks() +# +# np.random.seed(0) +# +# feature0 = np.array([4] * 10) +# +# # run +# # only one partition is requested +# parts = get_parts(feature0, np.array([2], dtype=np.uint8), data_is_numerical=False).get() +# assert parts is not None +# assert len(parts) == 1 +# assert len(parts[0]) == 1 +# np.testing.assert_array_equal(np.unique(parts[0]), np.array([4])) +# +# # more partitions are requested; only the first two has valid information +# parts = get_parts(feature0, np.array([2, 3], dtype=np.uint8), data_is_numerical=False).get() +# assert parts is not None +# assert len(parts) == 1 +# assert len(parts[0]) == 2 +# np.testing.assert_array_equal(np.unique(parts[0][0]), np.array([4])) +# np.testing.assert_array_equal(np.unique(parts[0][1]), np.array([-1])) diff --git a/tests/gpu/excluded/test_cuml.py b/tests/gpu/excluded/test_cuml.py new file mode 100644 index 00000000..001a1fa2 --- /dev/null +++ b/tests/gpu/excluded/test_cuml.py @@ -0,0 +1,100 @@ +import cupy as cp +import numpy as np +import pytest +from cuml.metrics import adjusted_rand_score +from sklearn.metrics import adjusted_rand_score +from cuml.common import CumlArray +from cuml.internals.memory_utils import using_output_type +from cuml.internals.safe_imports import 
gpu_only_import +import time +from pylibraft.common import Stream, DeviceResources + + +def generate_random_labels(size, n_classes): + return cp.random.randint(0, n_classes, size=size, dtype=cp.int32) + + +def compute_ari_with_stream(handle, labels1, labels2): + with using_output_type("cupy"): + return adjusted_rand_score(labels1, labels2, handle=handle) + + +def test_stream(): + n_samples = 10000 + n_classes = 5 + n_iterations = 100 + + cupy_stream = cp.cuda.Stream() + # Create a RAFT handle + handle = DeviceResources(stream=cupy_stream.ptr) + + # Generate random labels + labels1 = [generate_random_labels(n_samples, n_classes) for _ in range(n_iterations)] + labels2 = [generate_random_labels(n_samples, n_classes) for _ in range(n_iterations)] + + # Create CUDA streams + n_streams = 4 # You can adjust this number based on your GPU + streams = [cp.cuda.Stream() for _ in range(n_streams)] + + results = [] + start_time = time.time() + + # for i in range(n_iterations): + # stream = streams[i % n_streams] + # with stream: + # handle.set_stream(stream.ptr) + # ari = compute_ari_with_stream(handle, labels1[i], labels2[i]) + # results.append(ari) + # + # # Synchronize all streams + # for stream in streams: + # stream.synchronize() + + ari = compute_ari_with_stream(handle, labels1[0], labels2[0]) + results.append(ari) + + end_time = time.time() + + # Print results + print(f"Computed {n_iterations} ARI scores") + print(f"Time taken: {end_time - start_time:.4f} seconds") + print(results) + + +def generate_data(size): + np.random.seed(42) + labels_true = np.random.randint(0, 10, size=size) + labels_pred = np.random.randint(0, 10, size=size) + return labels_true, labels_pred + + +def time_function(func, *args): + start_time = time.time() + result = func(*args) + end_time = time.time() + return result, end_time - start_time + + +@pytest.mark.parametrize("size", [1000, 10000, 100000, 1000000]) +def test_adjusted_rand_score_speedup(size): + from cuml.metrics import 
adjusted_rand_score as cuml_ari + from sklearn.metrics import adjusted_rand_score as sklearn_ari + labels_true, labels_pred = generate_data(size) + + # Sklearn (CPU) implementation + _, sklearn_time = time_function(sklearn_ari, labels_true, labels_pred) + + # cuML (GPU) implementation + labels_true_gpu = cp.asarray(labels_true) + labels_pred_gpu = cp.asarray(labels_pred) + _, cuml_time = time_function(cuml_ari, labels_true_gpu, labels_pred_gpu) + + speedup = sklearn_time / cuml_time + + print(f"\nData size: {size}") + print(f"Sklearn time: {sklearn_time:.6f} seconds") + print(f"cuML time: {cuml_time:.6f} seconds") + print(f"Speedup: {speedup:.2f}x") + + # assert speedup > 1, f"cuML should be faster than sklearn, but speedup was only {speedup:.2f}x" + diff --git a/tests/gpu/excluded/test_cuml_in_kernel.py b/tests/gpu/excluded/test_cuml_in_kernel.py new file mode 100644 index 00000000..ac1f5714 --- /dev/null +++ b/tests/gpu/excluded/test_cuml_in_kernel.py @@ -0,0 +1,153 @@ +import cupy as cp +import numpy as np +from cupyx.jit import rawkernel +from cuml.metrics import adjusted_rand_score as cu_rnd_sc +from ccc.sklearn.metrics import adjusted_rand_index as ari +from numpy.typing import NDArray + +# Assuming cu_rnd_sc is already defined as a device function +# If not, you'll need to implement it as a CUDA device function + + +@rawkernel() +def ari_kernel(x, y, res, m_x, m_y, n): + i = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x + if i < m_x * m_y: + row_x = i // m_y + row_y = i % m_y + if x[row_x, 0] >= 0 and y[row_y, 0] >= 0: + res[i] = cu_rnd_sc(x[row_x], y[row_y], n) + else: + res[i] = 0.0 + + +def cdist_parts_cuda(x: cp.ndarray, y: cp.ndarray) -> cp.ndarray: + """ + CUDA-accelerated version of cdist_parts_basic using CuPy. + Each CUDA thread compares one row of x with one row of y. + + Args: + x: a 2d array with m_x clustering partitions in rows and n objects in columns. + y: a 2d array with m_y clustering partitions in rows and n objects in columns. 
+ + Returns: + A 2d array with m_x rows and m_y columns and the ARI between each partition pair. + """ + m_x, n = x.shape + m_y, _ = y.shape + res = cp.zeros(m_x * m_y, dtype=cp.float32) + + threads_per_block = 256 + blocks = (m_x * m_y + threads_per_block - 1) // threads_per_block + + ari_kernel[blocks, threads_per_block](x, y, res, m_x, m_y, n) + + return res.reshape(m_x, m_y) + + +def cdist_parts_basic(x: NDArray, y: NDArray) -> NDArray[float]: + """ + It implements the same functionality in scipy.spatial.distance.cdist but + for clustering partitions, and instead of a distance it returns the adjusted + Rand index (ARI). In other words, it mimics this function call: + + cdist(x, y, metric=ari) + + Only partitions with positive labels (> 0) are compared. This means that + partitions marked as "singleton" or "empty" (categorical data) are not + compared. This has the effect of leaving an ARI of 0.0 (zero). + + Args: + x: a 2d array with m_x clustering partitions in rows and n objects in + columns. + y: a 2d array with m_y clustering partitions in rows and n objects in + columns. + + Returns: + A 2d array with m_x rows and m_y columns and the ARI between each + partition pair. Each ij entry is equal to ari(x[i], y[j]) for each i + and j. 
+ """ + res = np.zeros((x.shape[0], y.shape[0])) + + for i in range(res.shape[0]): + if x[i, 0] < 0: + continue + + for j in range(res.shape[1]): + if y[j, 0] < 0: + continue + + res[i, j] = ari(x[i], y[j]) + + return res + + +# Test function +def test_cdist_parts_cuda(): + # Generate sample data + np.random.seed(0) + m_x, m_y, n = 100, 80, 1000 + x = np.random.randint(0, 5, size=(m_x, n)) + y = np.random.randint(0, 5, size=(m_y, n)) + + # Convert to CuPy arrays + x_gpu = cp.asarray(x) + y_gpu = cp.asarray(y) + + # Run CUDA version + res_cuda = cdist_parts_cuda(x_gpu, y_gpu) + + # Run CPU version for comparison + res_cpu = cdist_parts_basic(x, y) + + # Compare results + cp.cuda.Stream.null.synchronize() + res_cuda_np = cp.asnumpy(res_cuda) + + assert np.allclose(res_cuda_np, res_cpu, atol=1e-6), "CUDA and CPU results do not match" + + print("CUDA implementation matches CPU implementation") + + # Performance comparison + import time + + start_time = time.time() + for _ in range(10): + cdist_parts_cuda(x_gpu, y_gpu) + cp.cuda.Stream.null.synchronize() + cuda_time = (time.time() - start_time) / 10 + + start_time = time.time() + for _ in range(10): + cdist_parts_basic(x, y) + cpu_time = (time.time() - start_time) / 10 + + print(f"CUDA time: {cuda_time:.6f} seconds") + print(f"CPU time: {cpu_time:.6f} seconds") + print(f"Speedup: {cpu_time / cuda_time:.2f}x") + + +from cupyx import jit + + +@jit.rawkernel() +def elementwise_copy(x, y, size): + tid = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x + ntid = jit.gridDim.x * jit.blockDim.x + for i in range(tid, size, ntid): + y[i] = x[i] + + +def test_elementwise(): + size = cp.uint32(2 ** 22) + x = cp.random.normal(size=(size,), dtype=cp.float32) + y = cp.empty((size,), dtype=cp.float32) + + elementwise_copy((128,), (1024,), (x, y, size)) # RawKernel style + + + assert (x == y).all() + + elementwise_copy[128, 1024](x, y, size) # Numba style + assert (x == y).all() \ No newline at end of file diff --git 
a/tests/gpu/excluded/test_cupy.py b/tests/gpu/excluded/test_cupy.py new file mode 100644 index 00000000..b36020d5 --- /dev/null +++ b/tests/gpu/excluded/test_cupy.py @@ -0,0 +1,500 @@ +import cupy as cp +import numpy as np +import matplotlib.pyplot as plt +import pytest +import re + +from ccc.sklearn.metrics import get_contingency_matrix + + +def test_raw_kernel(): + # Define a raw kernel + kernel = cp.RawKernel(r''' + extern "C" __global__ + void my_raw_kernel(float* x, float* y, int n) { + int tid = blockDim.x * blockIdx.x + threadIdx.x; + if (tid < n) { + y[tid] = x[tid] * x[tid]; + } + } + ''', 'my_raw_kernel') + + # Prepare input data + n = 10 + x = cp.arange(n, dtype=cp.float32) + + # Allocate output array + y = cp.empty_like(x) + + # Launch the kernel + kernel((n,), (1,), (x, y, n)) + + # Check the result + assert cp.all(y == x * x) + + +def test_raw_kernel_with_thrust(): + N = 100 + code = """ + #include + #include + extern "C" __global__ + void xyzw_frequency_thrust_device(int *count, char *text, int n) + { + const char letters[] { 'x','y','z','w' }; + + *count = thrust::count_if(thrust::device, text, text+n, [=](char c) { + for (const auto x : letters) + if (c == x) return true; + return false; + }); + }""" + kernel = cp.RawModule(code=code, backend='nvcc') + code = kernel.get_function("xyzw_frequency_thrust_device") + + in_str = 'xxxzzzwwax' + count = cp.zeros([1], dtype=cp.int64) + in_arr = cp.array([ord(x) for x in in_str], dtype=cp.int8) + + # count[0] == 9 Define a raw kernel + code(grid=(N,),block=(N,), args=(count, in_arr, len(in_str))) + print() + print(count) + + +def test_thrust_unique_count(): + N = 100 + code = """ + #include + #include + extern "C" __global__ + void unique_count_thrust_device(int *count, int *data, int n) + { + *count = thrust::unique_count(thrust::device, data, data + n), thrust::equal_to(); + }""" + kernel = cp.RawModule(code=code, backend='nvcc') + code = kernel.get_function("unique_count_thrust_device") + + # in_arr = 
cp.random.randint(0, 10, N) + in_arr = cp.asarray([1, 3, 3, 3, 2, 2, 1], dtype=cp.int32) + count = cp.zeros([1], dtype=cp.int32) + + # count[0] == 9 Define a raw kernel + code(grid=(1,), block=(1,), args=(count, in_arr, 7)) + print(count) + + +def test_3d_raw_kernel(): + # Define a raw kernel to increment all elements by 1 + kernel = cp.RawKernel(r''' + extern "C" __global__ + void increment_3d(float* array, int x, int y, int z) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int idy = blockIdx.y * blockDim.y + threadIdx.y; + int idz = blockIdx.z * blockDim.z + threadIdx.z; + + if (idx < x && idy < y && idz < z) { + int index = idz * y * x + idy * x + idx; + array[index] += 1.0f; + } + } + ''', 'increment_3d') + + # Define the shape of the 3D array + shape = (64, 64, 64) + + # Allocate and initialize a 3D array on the device + d_array = cp.zeros(shape, dtype=cp.float32) + + # Define grid and block dimensions + block_dim = (8, 8, 8) + grid_dim = ( + (shape[0] + block_dim[0] - 1) // block_dim[0], + (shape[1] + block_dim[1] - 1) // block_dim[1], + (shape[2] + block_dim[2] - 1) // block_dim[2] + ) + + # Launch the kernel + kernel(grid_dim, block_dim, (d_array, shape[0], shape[1], shape[2])) + + # Copy the result back to CPU for verification + h_result = cp.asnumpy(d_array) + + # Verify the result + expected = np.ones(shape, dtype=np.float32) + np.testing.assert_array_almost_equal(h_result, expected, decimal=6) + + print("Test passed successfully!") + + +def test_3d_raw_kernel_1d_grid(): + # Define a raw kernel to increment all elements by 1 using 1D grid and block + kernel = cp.RawKernel(r''' + extern "C" __global__ + void increment_3d_1d(float* array, int x, int y, int z) { + int tid = blockDim.x * blockIdx.x + threadIdx.x; + int total_size = x * y * z; + + if (tid < total_size) { + int idz = tid / (x * y); + int idy = (tid % (x * y)) / x; + int idx = tid % x; + + array[tid] += 1.0f; + } + } + ''', 'increment_3d_1d') + + # Define the shape of the 3D array + 
shape = (64, 64, 64) + + # Allocate and initialize a 3D array on the device + d_array = cp.zeros(shape, dtype=cp.float32) + + # Calculate total number of elements + total_elements = np.prod(shape) + + # Define 1D grid and block dimensions + block_dim = (256,) + grid_dim = ((total_elements + block_dim[0] - 1) // block_dim[0],) + + # Launch the kernel + kernel(grid_dim, block_dim, (d_array, shape[0], shape[1], shape[2])) + + # Copy the result back to CPU for verification + h_result = cp.asnumpy(d_array) + + # Verify the result + expected = np.ones(shape, dtype=np.float32) + np.testing.assert_array_almost_equal(h_result, expected, decimal=6) + + print("Test passed successfully!") + + +def test_ravle(): + from sklearn.metrics import confusion_matrix + y_true = [2, 0, 2, 2, 0, 1] + y_pred = [0, 0, 2, 2, 0, 2] + mat = confusion_matrix(y_true, y_pred) + print(mat) + + +def test_3d_raw_kernel_grid_stride(): + # Define a raw kernel to increment all elements by 1 using grid-stride pattern + kernel = cp.RawKernel(r''' + extern "C" __global__ + void increment_3d_grid_stride(float* array, int total_size) { + int tid = blockDim.x * blockIdx.x + threadIdx.x; + + for (int i = tid; i < total_size; i += blockDim.x * gridDim.x) { + // Memory layout: CuPy, like NumPy, stores multi-dimensional arrays in contiguous memory + // in row-major order (C-style). This means that elements are laid out sequentially in memory, + // regardless of the array's shape. 
+ array[i] += 1.0f; + } + } + ''', 'increment_3d_grid_stride') + + # Define the shape of the 3D array + shape = (64, 64, 64) + + # Allocate and initialize a 3D array on the device + d_array = cp.zeros(shape, dtype=cp.float32) + + # Calculate total number of elements + total_elements = np.prod(shape) + + # Define 1D grid and block dimensions + block_dim = (256,) + grid_dim = (min(1024, (total_elements + block_dim[0] - 1) // block_dim[0]),) + + # Launch the kernel + kernel(grid_dim, block_dim, (d_array, total_elements)) + + # Copy the result back to CPU for verification + h_result = cp.asnumpy(d_array) + + # Verify the result + expected = np.ones(shape, dtype=np.float32) + np.testing.assert_array_almost_equal(h_result, expected, decimal=6) + + print("Test passed successfully!") + + +def test_3d_raw_kernel_grid_stride_indexing(): + # Define a raw kernel to increment all elements by 1 using grid-stride pattern + # and explicit 3D indexing + kernel = cp.RawKernel(r''' + extern "C" __global__ + void increment_3d_grid_stride(float* array, int x, int y, int z) { + int tid = blockDim.x * blockIdx.x + threadIdx.x; + int total_size = x * y * z; + + for (int i = tid; i < total_size; i += blockDim.x * gridDim.x) { + int iz = i / (x * y); + int iy = (i % (x * y)) / x; + int ix = i % x; + + // Accessing the 3D array using 3D indices + array[iz * (x * y) + iy * x + ix] += 1.0f; + } + } + ''', 'increment_3d_grid_stride') + + # Define the shape of the 3D array + shape = (64, 64, 64) + + # Allocate and initialize a 3D array on the device + d_array = cp.zeros(shape, dtype=cp.float32) + + # Calculate total number of elements + total_elements = np.prod(shape) + + # Define 1D grid and block dimensions + block_dim = (256,) + grid_dim = (min(1024, (total_elements + block_dim[0] - 1) // block_dim[0]),) + + # Launch the kernel + kernel(grid_dim, block_dim, (d_array, shape[0], shape[1], shape[2])) + + # Copy the result back to CPU for verification + h_result = cp.asnumpy(d_array) + + # Verify 
the result + expected = np.ones(shape, dtype=np.float32) + np.testing.assert_array_almost_equal(h_result, expected, decimal=6) + + print("Test passed successfully!") + + +def test_raft_api(): + code = cp.RawKernel(r''' + extern "C" __global__ + #include + #include + #include + #include + + raft::handle_t handle; + + int n_samples = 5000; + int n_features = 50; + + auto input = raft::make_device_matrix(handle, n_samples, n_features); + auto labels = raft::make_device_vector(handle, n_samples); + auto output = raft::make_device_matrix(handle, n_samples, n_samples); + + raft::random::make_blobs(handle, input.view(), labels.view()); + + auto metric = raft::distance::DistanceType::L2SqrtExpanded; + raft::distance::pairwise_distance(handle, input.view(), input.view(), output.view(), metric); + ''', 'raft_test') + + +def test_pair_wise_reduction(): + # Define a 3D parts array + h_parts = np.array([ + [ + [1, 2, 3], + [0, 2, 2], + [1, 3, 3], + ], + [ + [1, 1, 1], + [3, 1, 2], + [1, 3, 3], + ], + [ + [0, 0, 3], + [2, 1, 2], + [1, 0, 1], + ], + ]) + # Host loop + n_features = h_parts.shape[0] + n_parts = h_parts.shape[1] + n_objs = h_parts.shape[2] + + n_feat_comp = n_features * (n_features - 1) // 2 + + +def test_cub_block_sort_kernel(): + kernel_code = r''' + /* + These constants can be dynamically manipulated using string formatting, providing a hack to non-type + template parameters in CUDA kernels using cupy + */ + + /* Headers */ + #include + #define BLOCK_THREADS {BLOCK_THREADS} + #define ITERM_PER_THREAD {ITERM_PER_THREAD} + + // Todo: research on how to compile these non-type template parameters using cupy + // template + extern "C" __global__ + void BlockSortKernel(int *d_in, int *d_out) + { + // extern __shared__ int tmp[]; + // tmp[threadIdx.x] = 1; + using BlockLoadT = cub::BlockLoad< + int, BLOCK_THREADS, ITERM_PER_THREAD, cub::BLOCK_LOAD_TRANSPOSE>; + using BlockStoreT = cub::BlockStore< + int, BLOCK_THREADS, ITERM_PER_THREAD, cub::BLOCK_STORE_TRANSPOSE>; + 
using BlockRadixSortT = cub::BlockRadixSort< + int, BLOCK_THREADS, ITERM_PER_THREAD>; + + __shared__ union { + typename BlockLoadT::TempStorage load; + typename BlockStoreT::TempStorage store; + typename BlockRadixSortT::TempStorage sort; + } temp_storage; + + int thread_keys[ITERM_PER_THREAD]; + int block_offset = blockIdx.x * (BLOCK_THREADS * ITERM_PER_THREAD); + BlockLoadT(temp_storage.load).Load(d_in + block_offset, thread_keys); + + __syncthreads(); + + BlockRadixSortT(temp_storage.sort).Sort(thread_keys); + + __syncthreads(); + + BlockStoreT(temp_storage.store).Store(d_out + block_offset, thread_keys); + } + + /* + extern "C" __global__ + void launch_block_sort_kernel(int *d_in, int *d_out, int num_items) + { + const int BLOCK_THREADS = 128; + const int ITEMS_PER_THREAD = 4; + const int BLOCK_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD; + + int grid_size = (num_items + BLOCK_ITEMS - 1) / BLOCK_ITEMS; + BlockSortKernel<<>>(d_in, d_out); + } + */ + ''' + + + # Set up test parameters + num_items = 1024 # Must be a multiple of BLOCK_ITEMS (128 * 4 = 512 in this case) + + # Generate random input data + np_input = np.random.randint(0, 1000, num_items, dtype=np.int32) + d_input = cp.asarray(np_input) + d_output = cp.empty_like(d_input) + + # Configure the kernel + block_threads = 128 + items_per_thread = 4 + block_items = block_threads * items_per_thread + grid_size = (num_items + block_items - 1) // block_items + # Format the kernel string + kernel_code = re.sub(r'\{BLOCK_THREADS\}', str(block_threads), kernel_code) + kernel_code = re.sub(r'\{ITERM_PER_THREAD\}', str(items_per_thread), kernel_code) + # Compile the CUDA kernel + module = cp.RawModule(code=kernel_code, backend='nvcc') + kernel = module.get_function('BlockSortKernel') + + kernel((grid_size,), (block_threads,), (d_input, d_output, 4), shared_mem=block_threads * 4 * 4) + + # Get the results back to host + cp_output = cp.asnumpy(d_output) + + # Verify the results + np_sorted = np.sort(np_input) + + # Check 
if each block is sorted + block_size = 512 # BLOCK_THREADS * ITEMS_PER_THREAD + for i in range(0, num_items, block_size): + block_end = min(i + block_size, num_items) + assert np.all(np.diff(cp_output[i:block_end]) >= 0), f"Block starting at index {i} is not sorted" + + print("All blocks are correctly sorted!") + + # Optional: Check if the entire array is sorted (it won't be, as we're only sorting within blocks) + # assert np.array_equal(cp_output, np_sorted), "The entire array is not globally sorted" + + +def contingency_matrix_cuda(part0, part1, k0, k1): + # CUDA kernel as a string + cuda_kernel = r""" + extern "C" __global__ void contingency_matrix_kernel( + const int* part0, + const int* part1, + int* cont_mat, + int n, + int k0, + int k1 + ) { + extern __shared__ int shared_mem[]; + int* shared_part0 = shared_mem; + int* shared_part1 = &shared_mem[blockDim.x]; + int tid = threadIdx.x; + int bid = blockIdx.x; + int gid = bid * blockDim.x + tid; + // Load data into shared memory + if (gid < n) { + shared_part0[tid] = part0[gid]; + shared_part1[tid] = part1[gid]; + } + __syncthreads(); + // Compute contingency matrix + for (int i = tid; i < k0 * k1; i += blockDim.x) { + int row = i / k1; + int col = i % k1; + int count = 0; + for (int j = 0; j < blockDim.x && j < n; ++j) { + if (shared_part0[j] == row && shared_part1[j] == col) { + count++; + } + } + atomicAdd(&cont_mat[row * k1 + col], count); + } + } + """ + + # Compile the CUDA kernel + module = cp.RawModule(code=cuda_kernel) + kernel = module.get_function("contingency_matrix_kernel") + + n = len(part0) + d_part0 = cp.asarray(part0) + d_part1 = cp.asarray(part1) + d_cont_mat = cp.zeros((k0, k1), dtype=np.int32) + + block_size = 256 + grid_size = (n + block_size - 1) // block_size + shared_mem_size = 2 * block_size * 4 # 4 bytes per int + + kernel( + grid=(grid_size,), + block=(block_size,), + args=(d_part0, d_part1, d_cont_mat, n, k0, k1), + shared_mem=shared_mem_size + ) + + return cp.asnumpy(d_cont_mat) + + 
+@pytest.mark.parametrize("n, k0, k1", [ + (1000, 5, 5), + (10000, 10, 8), + (100000, 20, 15), +]) +def test_contingency_matrix(n, k0, k1): + # Generate random input data + rng = np.random.default_rng(42) + part0 = rng.integers(0, k0, size=n) + part1 = rng.integers(0, k1, size=n) + + # Compute contingency matrix using CUDA + cuda_result = contingency_matrix_cuda(part0, part1, k0, k1) + + # Compute contingency matrix using NumPy + numpy_result = get_contingency_matrix(part0, part1) + + # Assert that the results are equal + np.testing.assert_array_equal(cuda_result, numpy_result) \ No newline at end of file diff --git a/tests/gpu/excluded/test_device_host_funcs.py b/tests/gpu/excluded/test_device_host_funcs.py new file mode 100644 index 00000000..c40f4003 --- /dev/null +++ b/tests/gpu/excluded/test_device_host_funcs.py @@ -0,0 +1,231 @@ +import pytest +import math +import cupy as cp +import numpy as np +from ccc.sklearn.metrics_gpu import ( + d_get_confusion_matrix_str, + d_get_coords_from_index_str, + d_unravel_index_str, + d_get_contingency_matrix_str, + k_ari_str, +) +from ccc.coef import ( + get_coords_from_index, +) +from ccc.sklearn.metrics import ( + get_contingency_matrix, + get_pair_confusion_matrix, + adjusted_rand_index, +) + + +def test_get_coords_from_index_kernel(): + test_kernel_code = """ + extern "C" __global__ + void test_kernel(int n_obj, int* indices, int* results, int num_indices) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < num_indices) { + int x, y; + get_coords_from_index(n_obj, indices[tid], &x, &y); + results[tid * 2] = x; + results[tid * 2 + 1] = y; + } + } + """ + cuda_code = d_get_coords_from_index_str + test_kernel_code + module = cp.RawModule(code=cuda_code, backend='nvcc') + kernel = module.get_function("test_kernel") + + # Test parameters + n_obj = 10 + num_indices = 45 # (n_obj * (n_obj - 1)) // 2 + + # Create input indices + indices = cp.arange(num_indices, dtype=cp.int32) + + # Allocate memory for results + 
d_results = cp.empty(num_indices * 2, dtype=cp.int32) + + # Launch the kernel + threads_per_block = 256 + blocks = (num_indices + threads_per_block - 1) // threads_per_block + kernel((blocks,), (threads_per_block,), (n_obj, indices, d_results, num_indices)) + + # Get results back to host + h_results = cp.asnumpy(d_results) + + # Compare with Python implementation + for i in range(num_indices): + x_cuda, y_cuda = h_results[i * 2], h_results[i * 2 + 1] + x_py, y_py = get_coords_from_index(n_obj, i) + + assert x_cuda == x_py, f"Mismatch in x for index {i}: CUDA={x_cuda}, Python={x_py}" + assert y_cuda == y_py, f"Mismatch in y for index {i}: CUDA={y_cuda}, Python={y_py}" + + print("All tests passed successfully!") + + +@pytest.mark.parametrize("num_cols, num_indices", [ + (10, 45), + (15, 100), + (20, 200) +]) +def test_unravel_index_device(num_cols, num_indices): + test_kernel_code = """ + extern "C" __global__ void test_unravel_index_kernel(int* flat_indices, int* rows, int* cols, int num_cols, int num_indices) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < num_indices) { + unravel_index(flat_indices[tid], num_cols, &rows[tid], &cols[tid]); + } + } + """ + + cuda_code = d_unravel_index_str + test_kernel_code + # Compile the CUDA kernel + module = cp.RawModule(code=cuda_code, backend='nvcc') + kernel = module.get_function("test_unravel_index_kernel") + + # Create test inputs + flat_indices = cp.arange(num_indices, dtype=cp.int32) + + # Allocate memory for results (rows and cols) + d_rows = cp.zeros(num_indices, dtype=cp.int32) + d_cols = cp.zeros(num_indices, dtype=cp.int32) + + # Launch the kernel + threads_per_block = 256 + blocks = (num_indices + threads_per_block - 1) // threads_per_block + kernel((blocks,), (threads_per_block,), (flat_indices, d_rows, d_cols, num_cols, num_indices)) + + # Get results back to host + h_rows = cp.asnumpy(d_rows) + h_cols = cp.asnumpy(d_cols) + + # Compare with NumPy's unravel_index implementation + for i in 
range(num_indices): + # Use numpy.unravel_index as the reference + # row_py, col_py = divmod(i, num_cols) + row_py, col_py = np.unravel_index(i, (num_cols, num_cols)) + row_cuda, col_cuda = h_rows[i], h_cols[i] + + # Assertions to ensure CUDA and NumPy match + assert row_cuda == row_py, f"Mismatch in row for index {i}: CUDA={row_cuda}, NumPy={row_py}" + assert col_cuda == col_py, f"Mismatch in col for index {i}: CUDA={col_cuda}, NumPy={col_py}" + + print("All tests passed successfully!") + + +@pytest.mark.parametrize("n_objs", [100, 1000, 10000]) +@pytest.mark.parametrize("threads_per_block", [1, 2, 64, 128, 256, 512]) +@pytest.mark.parametrize("k", [3, 5, 10]) # Max value of a cluster number + 1 +def test_get_contingency_matrix_kernel(n_objs, threads_per_block, k): + test_kernel_code = """ + extern "C" + __global__ void test_kernel(int* part0, int* part1, int n_objs, int* cont_mat, int k) { + extern __shared__ int shared_cont_mat[]; + + // Call the function to compute contingency matrix in shared memory + get_contingency_matrix(part0, part1, n_objs, shared_cont_mat, k); + + // Copy shared memory back to global memory + int tid = threadIdx.x; + int num_threads = blockDim.x; + + for (int i = tid; i < k * k; i += num_threads) { + atomicAdd(&cont_mat[i], shared_cont_mat[i]); + } + } + """ + cuda_code = d_get_contingency_matrix_str + test_kernel_code + # Compile the CUDA kernel + module = cp.RawModule(code=cuda_code, backend='nvcc') + kernel = module.get_function("test_kernel") + + # Generate random partitions + part0 = np.random.randint(0, k, size=n_objs, dtype=np.int32) + part1 = np.random.randint(0, k, size=n_objs, dtype=np.int32) + + # Transfer data to GPU + d_part0 = cp.asarray(part0) + d_part1 = cp.asarray(part1) + d_cont_mat = cp.zeros((k, k), dtype=cp.int32) + + # Launch the kernel + blocks = 1 # Each pair of partitions is handled by only one block (to fully utilize shared memory) + shared_mem_size = k * k * 4 # 4 bytes per int + kernel((blocks,), 
(threads_per_block,), + (d_part0, d_part1, n_objs, d_cont_mat, k), + shared_mem=shared_mem_size) + + # Get results back to host + h_cont_mat = cp.asnumpy(d_cont_mat) + + # Compare with reference implementation + ref_cont_mat = get_contingency_matrix(part0, part1) + + np.testing.assert_array_equal(h_cont_mat, ref_cont_mat, + err_msg=f"CUDA and reference implementations do not match for n_objs={n_objs}, threads_per_block={threads_per_block}, k={k}") + print(f"Test passed successfully for n_objs={n_objs}, threads_per_block={threads_per_block}, k={k}") + + +@pytest.mark.parametrize("n_objs", [100]) +@pytest.mark.parametrize("threads_per_block", [32]) +@pytest.mark.parametrize("k", [3]) # Max value of a cluster number + 1 +def test_get_pair_confusion_matrix_device(n_objs, threads_per_block, k): + test_kernel_code = """ + extern "C" + __global__ void test_kernel(int* part0, int* part1, int n_objs, int k, int* out) { + extern __shared__ int shared_mem[]; + + // Call the function to compute contingency matrix in shared memory + int *s_contingency = shared_mem; + get_contingency_matrix(part0, part1, n_objs, s_contingency, k); + + int *s_sum_rows = s_contingency + k * k; + int *s_sum_cols = s_sum_rows + k; + int *C = s_sum_cols + k; + + get_pair_confusion_matrix(s_contingency, s_sum_rows, s_sum_cols, n_objs, k, C); + if (threadIdx.x == 0){ + for (int i = 0; i < 4; ++i){ + out[i] = C[i]; + } + } + __syncthreads(); + } + """ + + cuda_code = d_get_contingency_matrix_str + d_get_confusion_matrix_str + test_kernel_code + # Compile the CUDA kernel + module = cp.RawModule(code=cuda_code, backend='nvcc') + kernel = module.get_function("test_kernel") + + # Generate random partitions + np.random.seed(0) + part0 = np.random.randint(0, k, size=n_objs, dtype=np.int32) + part1 = np.random.randint(0, k, size=n_objs, dtype=np.int32) + print(f"part0: {part0}") + print(f"part1: {part1}") + + # Transfer data to GPU + d_part0 = cp.asarray(part0) + d_part1 = cp.asarray(part1) + d_c = 
cp.zeros((2, 2), dtype=cp.int32) + + # Launch the kernel + blocks = 1 # Each pair of partitions is handled by only one block (to fully utilize shared memory) + shared_mem_size = k * k * 4 # 4 bytes per int for the cont matrix + shared_mem_size += 2 * k * 4 # For the internal sum arrays + shared_mem_size += 4 * 4 # For the C matrix + kernel((blocks,), (threads_per_block,), + (d_part0, d_part1, n_objs, k, d_c), + shared_mem=shared_mem_size) + + h_c = cp.asnumpy(d_c) + py_c = get_pair_confusion_matrix(part0, part1) + ari_py = adjusted_rand_index(part0, part1) + print(f"ari_py: {ari_py}") + print(f"h_c: {h_c}") + print(f"py_c: {py_c}") + np.testing.assert_array_equal(h_c, py_c) + diff --git a/tests/gpu/excluded/test_get_parts.py b/tests/gpu/excluded/test_get_parts.py new file mode 100644 index 00000000..7693ef18 --- /dev/null +++ b/tests/gpu/excluded/test_get_parts.py @@ -0,0 +1,294 @@ +# This test file is used to verify the correctness of the GPU version of get_parts function +# Now we fall back to the original CPU implementation of get_parts function, so this test file is not used for now + +# import pytest +# +# import numpy as np +# import cupy as cp +# +# from ccc.coef.impl_gpu import ( +# get_parts, +# ) +# +# from ccc.coef import get_parts as get_parts_cpu +# from ccc.coef import get_perc_from_k as get_perc_from_k_cpu +# import functools +# +# +# def clean_gpu_memory(func): +# @functools.wraps(func) +# def wrapper(*args, **kwargs): +# try: +# return func(*args, **kwargs) +# finally: +# mempool = cp.get_default_memory_pool() +# mempool.free_all_blocks() +# return wrapper +# +# +# def find_partition(value, quantiles): +# for i in range(len(quantiles)): +# if value <= quantiles[i]: +# return i +# return len(quantiles) # If value is greater than all quantiles +# +# +# def verify_partition(feature, index, n_clusters): +# """ +# Verify the partition for a specific element in the feature array. 
+# """ +# parts_cpu = get_parts_cpu(feature, (n_clusters,)) +# percentages_cpu = get_perc_from_k_cpu(n_clusters) +# quantities = np.quantile(feature, percentages_cpu) +# +# value = feature[index] +# partition = find_partition(value, quantities) +# +# print(f"\nVerifying partition for feature[{index}] = {value}") +# print(f"CPU percentages: {percentages_cpu}") +# print(f"CPU quantities: {quantities}") +# +# print("\nAll partition ranges:") +# for i in range(n_clusters): +# if i == 0: +# print(f"Partition {i} range: (-inf, {quantities[i]}]") +# elif i == n_clusters - 1: +# print(f"Partition {i} range: ({quantities[i-1]}, inf)") +# else: +# print(f"Partition {i} range: ({quantities[i-1]}, {quantities[i]}]") +# +# print(f"Data point {value} should fall in partition {partition}") +# print(f"Partition computed by CCC_CPU: {parts_cpu[0][index]}") +# +# assert partition == parts_cpu[0][index], f"Mismatch in partition for feature[{index}]" +# return partition +# +# +# @clean_gpu_memory +# @pytest.mark.parametrize("feature_size", [100, 1000, 10000, 100000]) +# @pytest.mark.parametrize("cluster_settings", [ +# ([2], (2,)), +# ([2, 3], (2, 3)), +# ([2, 3, 4], (2, 3, 4)), +# ([5], (5,)), +# ([6], (6,)), +# ([9], (9,)), +# ([2, 3, 4, 5, 6, 7, 8, 9, 10], (2, 3, 4, 5, 6, 7, 8, 9, 10)), +# ]) +# @pytest.mark.parametrize("seed, distribution, params", [ +# (0, "rand", {}), # Uniform distribution +# (42, "randn", {}), # Normal distribution +# (123, "randint", {"low": 0, "high": 100}), # Integer distribution +# (456, "exponential", {"scale": 2.0}), # Exponential distribution +# ]) +# def test_get_parts(feature_size, cluster_settings, seed, distribution, params): +# # Given FP arithmetic is not associative and the difference between GPU and CPU FP arithmetic, +# # we need to allow for some tolerance. This is a tentative value that may need to be adjusted. +# # Note that the difference between GPU and CPU results is not expected to be larger than 1. 
+# n_diff_tolerance = int(feature_size * 0.04) +# +# np.random.seed(seed) +# +# gpu_clusters, cpu_clusters = cluster_settings +# +# # Generate random features based on the specified distribution +# if distribution == "rand": +# feature = np.random.rand(feature_size) +# elif distribution == "randn": +# feature = np.random.randn(feature_size) +# elif distribution == "randint": +# feature = np.random.randint(params["low"], params["high"], feature_size) +# elif distribution == "exponential": +# feature = np.random.exponential(params["scale"], feature_size) +# elif distribution == "binomial": +# feature = np.random.binomial(params["n"], params["p"], feature_size) +# else: +# raise ValueError(f"Unsupported distribution: {distribution}") +# +# # GPU implementation +# parts_gpu = get_parts(feature, np.array(gpu_clusters, dtype=np.uint8))[0].get() +# +# # CPU implementation +# parts_cpu = get_parts_cpu(feature, cpu_clusters) +# +# print(f"\nTesting with feature_size={feature_size}, clusters={gpu_clusters}, distribution={distribution}") +# print(f"GPU output shape: {parts_gpu.shape}") +# print(f"CPU output shape: {parts_cpu.shape}") +# +# assert parts_gpu is not None, "GPU output is None" +# assert len(parts_gpu) == 1, f"Expected 1 feature, got {len(parts_gpu)}" +# assert len(parts_gpu[0]) == len(gpu_clusters), f"Expected {len(gpu_clusters)} partition(s), got {len(parts_gpu[0])}" +# +# for i, n_clusters in enumerate(gpu_clusters): +# gpu_unique = np.unique(parts_gpu[0][i]) +# cpu_unique = np.unique(parts_cpu[i]) +# +# print(f"\nPartition {i}:") +# print(f" GPU unique values (partitions): {gpu_unique}") +# print(f" CPU unique values (partitions): {cpu_unique}") +# +# assert len(gpu_unique) == n_clusters, f"Expected {n_clusters} cluster indexes, got {len(gpu_unique)}" +# +# if not np.array_equal(parts_gpu[0][i], parts_cpu[i]): +# diff_indices = np.where(parts_gpu[0][i] != parts_cpu[i])[0] +# diff_values = np.abs(parts_gpu[0][i][diff_indices] - parts_cpu[i][diff_indices]) +# 
max_diff = np.max(diff_values) +# +# print(f"\nDifferences found in partition {i}:") +# print(f" Number of differing elements: {len(diff_indices)}") +# print(f" Maximum difference: {max_diff}") +# print(f" First 10 differing indices: {diff_indices[:10]}") +# print(f" GPU values at these indices: {parts_gpu[0][i][diff_indices[:10]]}") +# print(f" CPU values at these indices: {parts_cpu[i][diff_indices[:10]]}") +# print(f" Object values at these indices: {feature[diff_indices[:10]]}") +# +# if len(diff_indices) > n_diff_tolerance or max_diff > 1: +# # Verify partitions for differing elements +# for idx in diff_indices[:10]: +# expected_partition = verify_partition(feature, idx, n_clusters) +# assert parts_gpu[0][i][idx] == expected_partition, f"GPU partition mismatch for feature[{idx}]" +# +# assert False, f"GPU and CPU results don't match for {n_clusters} clusters: " \ +# f"diff count = {len(diff_indices)}, max diff = {max_diff}" +# else: +# print(f" Differences within tolerance (count <= {n_diff_tolerance} and max diff <= 1)") +# +# # Additional checks for multi-cluster settings +# if len(gpu_clusters) > 1: +# for i in range(len(gpu_clusters)): +# for j in range(i + 1, len(gpu_clusters)): +# if np.array_equal(parts_gpu[0][i], parts_cpu[j]): +# print(f"\nUnexpected equality between partitions {i} and {j}:") +# print(f" Partition {i}: {parts_gpu[0][i]}") +# print(f" Partition {j}: {parts_cpu[j]}") +# assert False, f"Partitions {i} and {j} should not be equal" +# +# +# def test_specific_elements(): +# mempool = cp.get_default_memory_pool() +# mempool.free_all_blocks() +# +# np.random.seed(0) +# feature = np.random.rand(100) +# assert feature[77] == 0.1201965612131689 +# assert feature[78] == 0.29614019752214493 +# +# verify_partition(feature, 77, 6) +# verify_partition(feature, 78, 6) +# +# +# @clean_gpu_memory +# def test_potential_buggy_cpu_impl(): +# +# np.random.seed(0) +# feature = np.random.rand(100) +# assert feature[77] == 0.1201965612131689 +# assert 
feature[78] == 0.29614019752214493 +# parts_cpu = get_parts_cpu(feature, (6, )) +# percentages_cpu = get_perc_from_k_cpu(6) +# quantities = np.quantile(feature, percentages_cpu) +# print() +# print(f"CPU parts: \n{parts_cpu}") +# print(f"CPU percentages: \n{percentages_cpu}") +# print(f"CPU quantities: \n{quantities}") +# +# # Find which partitions feature[77] and feature[78] fall into +# value_77 = feature[77] +# value_78 = feature[78] +# partition_77 = find_partition(value_77, quantities) +# partition_78 = find_partition(value_78, quantities) +# +# print(f"feature[77] = {value_77} falls in partition {partition_77}") +# print(f"feature[78] = {value_78} falls in partition {partition_78}") +# if partition_77 > 0: +# print(f"Partition {partition_77} range: ({quantities[partition_77-1]}, {quantities[partition_77]}]") +# else: +# print(f"Partition {partition_77} range: (-inf, {quantities[partition_77]}]") +# if partition_78 > 0: +# print(f"Partition {partition_78} range: ({quantities[partition_78-1]}, {quantities[partition_78]}]") +# else: +# print(f"Partition {partition_78} range: (-inf, {quantities[partition_78]}]") +# print(f"Partition computed by CCC_CPU for feature[77]: {parts_cpu[0][77]}") +# print(f"Partition computed by CCC_CPU for feature[78]: {parts_cpu[0][78]}") +# assert partition_77 == parts_cpu[0][77] +# assert partition_78 == parts_cpu[0][78] +# +# +# @clean_gpu_memory +# def test_get_parts_with_singletons(): +# +# np.random.seed(0) +# +# feature0 = np.array([1.3] * 100) +# +# # run +# parts = get_parts(feature0, np.array([2], dtype=np.uint8))[0].get() +# parts_cpu = get_parts_cpu(feature0, (2,)) +# assert parts is not None +# assert len(parts) == 1 # 1 feature +# assert len(parts[0]) == 1 # 1 partition +# # all the elements (2D) should be -2 +# np.testing.assert_array_equal(np.unique(parts[0]), np.array([-2])) +# assert np.array_equal(parts[0], parts_cpu) +# +# parts = get_parts(feature0, np.array([2, 3], dtype=np.uint8))[0].get() +# parts_cpu = 
get_parts_cpu(feature0, (2, 3)) +# assert parts is not None +# assert len(parts) == 1 +# assert len(parts[0]) == 2, "feature should have 2 clusters" +# np.testing.assert_array_equal(np.unique(parts[0][0]), np.array([-2])) +# np.testing.assert_array_equal(np.unique(parts[0][1]), np.array([-2])) +# assert np.array_equal(parts[0][0], parts_cpu[0]) +# assert np.array_equal(parts[0][1], parts_cpu[1]) +# +# +# @clean_gpu_memory +# def test_get_parts_with_categorical_feature(): +# np.random.seed(0) +# +# feature0 = np.array([4] * 10) +# +# # run +# # only one partition is requested +# parts = get_parts(feature0, np.array([2], dtype=np.uint8), data_is_numerical=False)[0].get() +# parts_cpu = get_parts_cpu(feature0, (2,), data_is_numerical=False) +# assert parts is not None +# assert len(parts) == 1 +# assert len(parts[0]) == 1 +# np.testing.assert_array_equal(np.unique(parts[0]), np.array([4])) +# assert np.array_equal(parts[0], parts_cpu) +# +# # more partitions are requested; only the first one has valid information +# parts = get_parts(feature0, np.array([2, 3], dtype=np.uint8), data_is_numerical=False)[0].get() +# parts_cpu = get_parts_cpu(feature0, (2, 3), data_is_numerical=False) +# assert parts is not None +# assert len(parts) == 1 +# assert len(parts[0]) == 2 +# print("parts:") +# print(parts) +# print("parts_cpu:") +# print(parts_cpu) +# np.testing.assert_array_equal(np.unique(parts[0][0]), np.array([4])) +# np.testing.assert_array_equal(np.unique(parts[0][1]), np.array([-1])) +# assert (parts == parts_cpu).all() +# assert np.array_equal(parts[0][0], parts_cpu[0]) +# assert np.array_equal(parts[0][1], parts_cpu[1]) +# +# +# @clean_gpu_memory +# def test_get_parts_2d_simple(): +# np.random.seed(0) +# array = np.random.rand(5, 1000) +# print(f"array : \n{array}") +# parts = get_parts(array, np.array([3], dtype=np.uint8))[0].get() +# parts_cpu_row0 = get_parts_cpu(array[0], (3, )) +# parts_cpu_row1 = get_parts_cpu(array[1], (3, )) +# assert parts is not None +# 
assert (parts[0] == parts_cpu_row0).all() +# assert (parts[1] == parts_cpu_row1).all() +# print("parts:") +# print(parts) +# print("parts_cpu_row0:") +# print(parts_cpu_row0) +# print("parts_cpu_row1:") +# print(parts_cpu_row1) diff --git a/tests/gpu/excluded/test_get_percentiles.py b/tests/gpu/excluded/test_get_percentiles.py new file mode 100644 index 00000000..45f54938 --- /dev/null +++ b/tests/gpu/excluded/test_get_percentiles.py @@ -0,0 +1,27 @@ +import pytest + +import numpy as np +from numpy.testing import assert_array_equal + +from ccc.coef.impl_gpu import ( + get_perc_from_k, +) + +from ccc.coef import get_perc_from_k as get_perc_from_k_cpu + + +def test_get_perc_from_k_with_k_less_than_two(): + empty_array = np.empty(0) + assert_array_equal(get_perc_from_k(1), empty_array) + assert_array_equal(get_perc_from_k(0), empty_array) + assert_array_equal(get_perc_from_k(-1), empty_array) + + +@pytest.mark.parametrize("k", [ + 2, 3, 4, 5, 6, 7, 8, 9, 10 +]) +def test_get_perc_from_k(k): + np.set_printoptions(precision=17) + gpu_result = get_perc_from_k(k) + cpu_result = get_perc_from_k_cpu(k) + assert np.allclose(gpu_result, cpu_result) diff --git a/tests/gpu/excluded/test_impl_gpu.py b/tests/gpu/excluded/test_impl_gpu.py new file mode 100644 index 00000000..eed46219 --- /dev/null +++ b/tests/gpu/excluded/test_impl_gpu.py @@ -0,0 +1,236 @@ +import pytest +import time + +import numpy as np + +from ccc.coef.impl_gpu import ccc as ccc_gpu +from ccc.coef.impl import ccc +from utils import clean_gpu_memory +# This test needs to be improved + + +def test_ccc_gpu_1d_simple(): + np.random.seed(0) + feature1 = np.random.rand(10) + feature2 = np.random.rand(10) + c1 = ccc_gpu(feature1, feature2) + c2 = ccc(feature1, feature2) + print(f"GPU: {c1}, CPU: {c2}") + assert np.isclose(c1, c2, atol=1e-3), f"GPU: {c1}, CPU: {c2}" + + +@clean_gpu_memory +def run_ccc_test(size, seed, distribution, params): + np.random.seed(seed) + absolute_tolerance = 1e-3 # allow 0.001 as max 
coefficient difference + + # Generate random features based on the specified distribution + if distribution == "rand": + random_feature1 = np.random.rand(size) + random_feature2 = np.random.rand(size) + elif distribution == "randn": + random_feature1 = np.random.randn(size) + random_feature2 = np.random.randn(size) + elif distribution == "randint": + random_feature1 = np.random.randint(params["low"], params["high"], size) + random_feature2 = np.random.randint(params["low"], params["high"], size) + elif distribution == "exponential": + random_feature1 = np.random.exponential(params["scale"], size) + random_feature2 = np.random.exponential(params["scale"], size) + else: + raise ValueError(f"Unsupported distribution: {distribution}") + + c1 = ccc_gpu(random_feature1, random_feature2) + c2 = ccc(random_feature1, random_feature2) + + is_close = np.isclose(c1, c2, atol=absolute_tolerance) + return is_close, c1, c2 + + +@pytest.mark.parametrize( + "distribution, params", + [ + ("rand", {}), # Uniform distribution + ("randn", {}), # Normal distribution + ( + "randint", + {"low": 0, "high": 100}, + ), # Integer distribution, expect to have the largest difference due to partition errors + ("exponential", {"scale": 2.0}), # Exponential distribution + ], +) +def test_ccc_gpu_1d(distribution, params): + """ + This test allows for a small percentage (10%) of individual tests to fail for each distribution. 
+ """ + sizes = np.linspace(100, 100000, num=5, dtype=int) + seeds = np.linspace(0, 1000, num=5, dtype=int) + allowed_failure_rate = 0.10 # 10% allowed failure rate + + total_tests = len(sizes) * len(seeds) + max_allowed_failures = int(total_tests * allowed_failure_rate) + failures = 0 + + for size in sizes: + for seed in seeds: + is_close, c1, c2 = run_ccc_test(size, seed, distribution, params) + + if not np.all(is_close): + failures += 1 + print( + f"\nTest failed for size={size}, seed={seed}, distribution={distribution}" + ) + print(f"GPU result: {c1}") + print(f"CPU result: {c2}") + print(f"Differences: {np.abs(c1 - c2)}") + + print(f"\nDistribution: {distribution}") + print(f"Total tests: {total_tests}") + print(f"Failed tests: {failures}") + print(f"Maximum allowed failures: {max_allowed_failures}") + + assert ( + failures <= max_allowed_failures + ), f"Too many failures for {distribution} distribution: {failures} > {max_allowed_failures}" + + if failures > 0: + print( + f"Warning: {failures} tests failed, but within the allowed failure rate of {allowed_failure_rate * 100}%" + ) + else: + print("All tests passed successfully") + + +# Additional test for edge cases +@clean_gpu_memory +@pytest.mark.parametrize( + "case", + [ + "identical", + "opposite", + "constant", + "single_value", + ], +) +def test_ccc_gpu_1d_edge_cases(case): + if case == "identical": + feature = np.random.rand(1000) + ccc_gpu(feature, feature) + ccc(feature, feature) + elif case == "opposite": + feature = np.random.rand(1000) + ccc_gpu(feature, -feature) + ccc(feature, -feature) + elif case == "constant": + feature1 = np.full(1000, 5) + feature2 = np.full(1000, 3) + ccc_gpu(feature1, feature2) + ccc(feature1, feature2) + elif case == "single_value": + # Too few objects + feature = np.array([1]) + with pytest.raises(ValueError) as e: + ccc_gpu(feature, feature) + assert "Too few objects" in str(e.value) + with pytest.raises(ValueError) as e: + ccc(feature, feature) + assert "Too few objects" in 
str(e.value) + return + + +@clean_gpu_memory +def test_ccc_gpu_2d_simple(): + np.random.seed(0) + shape = (20, 200) # 20 features, 200 samples + print(f"Testing with {shape[0]} features and {shape[1]} samples") + df = np.random.rand(*shape) + + # Time GPU version + start_gpu = time.time() + c1 = ccc_gpu(df) + end_gpu = time.time() + gpu_time = end_gpu - start_gpu + + # Time CPU version + start_cpu = time.time() + c2 = ccc(df) + end_cpu = time.time() + cpu_time = end_cpu - start_cpu + + # Calculate speedup + speedup = cpu_time / gpu_time + + print(f"\nGPU time: {gpu_time:.4f} seconds") + print(f"CPU time: {cpu_time:.4f} seconds") + print(f"Speedup: {speedup:.2f}x") + + print(f"GPU coef:\n {c1}") + print(f"CPU coef:\n {c2}") + + assert np.allclose(c1, c2, rtol=1e-5, atol=1e-5) + + return gpu_time, cpu_time + + +# Test for very large arrays (may be slow and memory-intensive) +@clean_gpu_memory +@pytest.mark.slow +def test_ccc_gpu_2d_very_large(): + np.random.seed(0) + shape = (200, 1000) # 200 features, 1,000 samples + print(f"Testing with {shape[0]} features and {shape[1]} samples") + df = np.random.rand(*shape) + + # Time GPU version + start_gpu = time.time() + c1 = ccc_gpu(df) + end_gpu = time.time() + gpu_time = end_gpu - start_gpu + + # Time CPU version + start_cpu = time.time() + c2 = ccc(df) + end_cpu = time.time() + cpu_time = end_cpu - start_cpu + + # Calculate speedup + speedup = cpu_time / gpu_time + + print(f"Length of the array: {len(c1)}") + print(f"\nGPU time: {gpu_time:.4f} seconds") + print(f"CPU time: {cpu_time:.4f} seconds") + print(f"Speedup: {speedup:.2f}x") + + # Set tolerance parameters + rtol = 1e-5 + atol = 1e-2 + max_diff_count = int(len(c1) * 0.01) # Allow up to 1% of elements to be different + + # Compare results + is_close = np.isclose(c1, c2, rtol=rtol, atol=atol) + diff_count = np.sum(~is_close) + + print(f"Number of differing elements: {diff_count}") + print(f"Maximum allowed differences: {max_diff_count}") + + if diff_count > 0: + # Find 
indices of differing elements + diff_indices = np.where(~is_close) + + # Print details of the first 10 differences + print("\nFirst 10 differences:") + for i in range(min(10, diff_count)): + idx = tuple(index[i] for index in diff_indices) + print( + f"Index {idx}: GPU = {c1[idx]:.8f}, CPU = {c2[idx]:.8f}, Diff = {abs(c1[idx] - c2[idx]):.8f}" + ) + + # Calculate and print max absolute difference + max_abs_diff = np.max(np.abs(c1 - c2)) + print(f"\nMaximum absolute difference: {max_abs_diff:.8f}") + + assert ( + diff_count <= max_diff_count + ), f"Too many differing elements: {diff_count} > {max_diff_count}" + + return gpu_time, cpu_time, speedup diff --git a/tests/gpu/excluded/test_sklearn_metrics_gpu.py b/tests/gpu/excluded/test_sklearn_metrics_gpu.py new file mode 100644 index 00000000..1cc65462 --- /dev/null +++ b/tests/gpu/excluded/test_sklearn_metrics_gpu.py @@ -0,0 +1,129 @@ +# import numpy as np +# from sklearn.metrics import adjusted_rand_score as sklearn_ari +# +# from ccc.sklearn.metrics_gpu import ( +# adjusted_rand_index, +# get_contingency_matrix, +# get_pair_confusion_matrix, +# ) +# +# +# def test_get_contingency_matrix_k0_equal_k1(): +# part0 = np.array([0, 0, 1, 1, 2, 2]) +# part1 = np.array([0, 1, 0, 2, 1, 2]) +# +# expected_mat = np.array([[1, 1, 0], [1, 0, 1], [0, 1, 1]]) +# +# observed_mat = get_contingency_matrix(part0, part1) +# +# np.testing.assert_array_equal(observed_mat, expected_mat) +# +# +# def test_get_contingency_matrix_k0_greater_k1(): +# part0 = np.array([0, 0, 1, 1, 2, 2, 3, 3, 3]) +# part1 = np.array([0, 1, 0, 2, 1, 2, 2, 2, 2]) +# +# expected_mat = np.array([[1, 1, 0], [1, 0, 1], [0, 1, 1], [0, 0, 3]]) +# +# observed_mat = get_contingency_matrix(part0, part1) +# +# np.testing.assert_array_equal(observed_mat, expected_mat) +# +# +# def test_get_contingency_matrix_k0_lesser_k1(): +# part0 = np.array([0, 0, 1, 1, 2, 2, 3, 3, 3, 2, 2, 2, 1]) +# part1 = np.array([0, 1, 0, 2, 1, 2, 3, 3, 3, 4, 4, 5, 5]) +# +# expected_mat = np.array( 
+# [[1, 1, 0, 0, 0, 0], [1, 0, 1, 0, 0, 1], [0, 1, 1, 0, 2, 1], [0, 0, 0, 3, 0, 0]] +# ) +# +# observed_mat = get_contingency_matrix(part0, part1) +# +# np.testing.assert_array_equal(observed_mat, expected_mat) +# +# +# def test_get_pair_confusion_matrix_k0_equal_k1(): +# part0 = np.array([0, 0, 1, 1, 2, 2]) +# part1 = np.array([0, 1, 0, 2, 1, 2]) +# +# expected_mat = np.array([[18, 6], [6, 0]]) +# +# observed_mat = get_pair_confusion_matrix(part0, part1) +# +# np.testing.assert_array_equal(observed_mat, expected_mat) +# +# +# def test_get_pair_confusion_matrix_k0_greater_k1(): +# part0 = np.array([0, 0, 1, 1, 2, 2, 3, 3, 3]) +# part1 = np.array([0, 1, 0, 2, 1, 2, 2, 2, 2]) +# +# expected_mat = np.array([[42, 18], [6, 6]]) +# +# observed_mat = get_pair_confusion_matrix(part0, part1) +# +# np.testing.assert_array_equal(observed_mat, expected_mat) +# +# +# def test_adjusted_rand_index_manual_random_partitions_same_k(): +# part0 = np.array([0, 0, 1, 1, 2, 2]) +# part1 = np.array([0, 1, 0, 2, 1, 2]) +# +# expected_ari = -0.25 +# +# observed_ari = adjusted_rand_index(part0, part1) +# observed_ari_symm = adjusted_rand_index(part1, part0) +# +# assert observed_ari == observed_ari_symm +# assert expected_ari == observed_ari +# +# +# def test_adjusted_rand_index_manual_perfect_match(): +# part0 = np.array([0, 0, 1, 1, 2, 2]) +# part1 = np.array([2, 2, 3, 3, 4, 4]) +# +# expected_ari = 1.0 +# +# observed_ari = adjusted_rand_index(part0, part1) +# observed_ari_symm = adjusted_rand_index(part1, part0) +# +# assert observed_ari == observed_ari_symm +# assert expected_ari == observed_ari +# +# +# def test_adjusted_rand_index_random_partitions_same_k(): +# maxk0 = 2 +# maxk1 = maxk0 +# n = 100 +# +# part0 = np.random.randint(0, maxk0 + 1, n) +# part1 = np.random.randint(0, maxk1 + 1, n) +# +# # warning: the sklearn's ari implementation can overflow in older versions +# # when n is large +# expected_ari = sklearn_ari(part0, part1) +# +# observed_ari = adjusted_rand_index(part0, 
part1) +# observed_ari_symm = adjusted_rand_index(part1, part0) +# +# assert observed_ari == observed_ari_symm +# assert expected_ari == observed_ari +# +# +# def test_adjusted_rand_index_random_partitions_k0_greater_k1(): +# maxk0 = 5 +# maxk1 = 3 +# n = 100 +# +# part0 = np.random.randint(0, maxk0 + 1, n) +# part1 = np.random.randint(0, maxk1 + 1, n) +# +# # warning: the sklearn's ari implementation can overflow in older versions +# # when n is large +# expected_ari = sklearn_ari(part0, part1) +# +# observed_ari = adjusted_rand_index(part0, part1) +# observed_ari_symm = adjusted_rand_index(part1, part0) +# +# assert observed_ari == observed_ari_symm +# assert expected_ari == observed_ari diff --git a/tests/gpu/test_ari_gpu.py b/tests/gpu/test_ari_gpu.py new file mode 100644 index 00000000..9b847d12 --- /dev/null +++ b/tests/gpu/test_ari_gpu.py @@ -0,0 +1,93 @@ +import pytest +import numpy as np +import ccc_cuda_ext + +from ccc.sklearn.metrics import ( + get_contingency_matrix, + get_pair_confusion_matrix, + adjusted_rand_index, +) + + +# Test cases taken from sklearn.metrics.adjusted_rand_score +@pytest.mark.parametrize("parts, expected_ari", [ + ( + np.array([ + [[0, 0, 1, 2]], + [[0, 0, 1, 1]] + ], dtype=np.int32), + 0.57 + ), + ( + np.array([ + [[0, 0, 1, 1]], + [[0, 1, 0, 1]] + ], dtype=np.int32), + -0.5 + ), + ( + np.array([ + [[0, 0, 1, 1]], + [[0, 0, 1, 1]] + ], dtype=np.int32), + 1.0 + ), + ( + np.array([ + [[0, 0, 1, 1]], + [[1, 1, 0, 0]] + ], dtype=np.int32), + 1.0 + ), + ( + np.array([ + [[0, 0, 0, 0]], + [[0, 1, 2, 3]] + ], dtype=np.int32), + 0.0 + ) +]) +def test_simple_ari_results(parts, expected_ari): + n_features, n_parts, n_objs = parts.shape + res = ccc_cuda_ext.ari_int32(parts, n_features, n_parts, n_objs) + assert np.isclose(res[0], expected_ari, atol=1e-2) + + +def generate_pairwise_combinations(arr): + pairs = [] + num_slices = arr.shape[0] # Number of 2D arrays in the 3D array + + for i in range(num_slices): + for j in range(i + 1, 
num_slices): # Only consider pairs in different slices + for row_i in arr[i]: # Each row in slice i + for row_j in arr[j]: # Pairs with each row in slice j + pairs.append([row_i, row_j]) + + # Convert list of pairs to a NumPy array + return np.array(pairs) + + +# Test ari generation given a full 3D array of partitions +@pytest.mark.parametrize("n_features, n_parts, n_objs, k", [ + (2, 2, 100, 10), + (5, 10, 200, 10), + # (100, 20, 1000, 10), # wrong results + # (200, 20, 300, 10), # illegal mem access + # (1000, 10, 300, 10), # out of gpu mem +]) +def test_pairwise_ari(n_features, n_parts, n_objs, k): + np.random.seed(0) # fixed seed so a failing case can be reproduced + parts = np.random.randint(0, k, size=(n_features, n_parts, n_objs), dtype=np.int32) + # One ARI per pair of partitions across every pair of distinct features + n_feature_comp = n_features * (n_features - 1) // 2 + n_aris = n_feature_comp * n_parts * n_parts + ref_aris = np.zeros(n_aris, dtype=np.float32) + # Get partition pairs + pairs = generate_pairwise_combinations(parts) + # Compute the reference ARIs with adjusted_rand_index from ccc.sklearn.metrics + for i, (part0, part1) in enumerate(pairs): + ari = adjusted_rand_index(part0, part1) + ref_aris[i] = ari + # Compute ARIs using CUDA + res_aris = ccc_cuda_ext.ari_int32(parts, n_features, n_parts, n_objs) + assert np.allclose(res_aris, ref_aris) diff --git a/tests/gpu/utils.py b/tests/gpu/utils.py new file mode 100644 index 00000000..f7a88f0f --- /dev/null +++ b/tests/gpu/utils.py @@ -0,0 +1,13 @@ +import functools +import cupy as cp + + +def clean_gpu_memory(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + finally: + mempool = cp.get_default_memory_pool() + mempool.free_all_blocks() + return wrapper diff --git a/tests/test_coef.py b/tests/test_coef.py index 359c2728..47930f4c 100644 --- a/tests/test_coef.py +++ b/tests/test_coef.py @@ -894,7 +894,7 @@ def test_cm_return_parts_categorical_variable(): 
numerical_feature0_median = np.percentile(numerical_feature0, 50) # create a 
categorical variable perfectly correlated with the numerical one (this is actually an ordinal feature) - categorical_feature1 = np.full(numerical_feature0.shape[0], "", dtype=np.unicode_) + categorical_feature1 = np.full(numerical_feature0.shape[0], "", dtype=np.str_) categorical_feature1[numerical_feature0 < numerical_feature0_median] = "l" categorical_feature1[numerical_feature0 >= numerical_feature0_median] = "u" _unique_values = np.unique(categorical_feature1) @@ -1248,7 +1248,7 @@ def test_cm_numerical_and_categorical_features_perfect_relationship(): numerical_feature0_median = np.percentile(numerical_feature0, 50) # create a categorical variable perfectly correlated with the numerical one (this is actually an ordinal feature) - categorical_feature1 = np.full(numerical_feature0.shape[0], "", dtype=np.unicode_) + categorical_feature1 = np.full(numerical_feature0.shape[0], "", dtype=np.str_) categorical_feature1[numerical_feature0 < numerical_feature0_median] = "l" categorical_feature1[numerical_feature0 >= numerical_feature0_median] = "u" _unique_values = np.unique(categorical_feature1) @@ -1275,7 +1275,7 @@ def test_cm_numerical_and_categorical_features_strong_relationship(): numerical_feature0_perc = np.percentile(numerical_feature0, 25) # create a categorical variable perfectly correlated with the numerical one (this is actually an ordinal feature) - categorical_feature1 = np.full(numerical_feature0.shape[0], "", dtype=np.unicode_) + categorical_feature1 = np.full(numerical_feature0.shape[0], "", dtype=np.str_) categorical_feature1[numerical_feature0 < numerical_feature0_perc] = "l" categorical_feature1[numerical_feature0 >= numerical_feature0_perc] = "u" _unique_values = np.unique(categorical_feature1) @@ -1301,7 +1301,7 @@ def test_cm_numerical_and_categorical_features_no_relationship(): numerical_feature0 = np.random.rand(100) # create a categorical variable perfectly correlated with the numerical one (this is actually an ordinal feature) - 
categorical_feature1 = np.full(numerical_feature0.shape[0], "", dtype=np.unicode_) + categorical_feature1 = np.full(numerical_feature0.shape[0], "", dtype=np.str_) categorical_feature1[numerical_feature0 < 0.50] = "l" categorical_feature1[numerical_feature0 >= 0.50] = "u" np.random.shuffle(categorical_feature1) @@ -1377,7 +1377,7 @@ def test_cm_numerical_and_categorical_features_with_pandas_dataframe_two_feature numerical_feature0_median = np.percentile(numerical_feature0, 50) # create a categorical variable perfectly correlated with the numerical one (this is actually an ordinal feature) - categorical_feature1 = np.full(numerical_feature0.shape[0], "", dtype=np.unicode_) + categorical_feature1 = np.full(numerical_feature0.shape[0], "", dtype=np.str_) categorical_feature1[numerical_feature0 < numerical_feature0_median] = "l" categorical_feature1[numerical_feature0 >= numerical_feature0_median] = "u" _unique_values = np.unique(categorical_feature1)