AcademySoftwareFoundation
diff --git a/‎Makefile‎
Lines changed: 5 additions & 0 deletions b/‎Makefile‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎site/spi/Makefile-bits-arnold‎
Lines changed: 5 additions & 0 deletions b/‎site/spi/Makefile-bits-arnold‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/cmake/compiler.cmake‎
Lines changed: 2 additions & 0 deletions b/‎src/cmake/compiler.cmake‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/cmake/externalpackages.cmake‎
Lines changed: 24 additions & 0 deletions b/‎src/cmake/externalpackages.cmake‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎src/include/OpenImageIO/imagebuf.h‎
Lines changed: 3 additions & 0 deletions b/‎src/include/OpenImageIO/imagebuf.h‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎src/include/OpenImageIO/platform.h‎
Lines changed: 2 additions & 0 deletions b/‎src/include/OpenImageIO/platform.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/libOpenImageIO/CMakeLists.txt‎
Lines changed: 22 additions & 4 deletions b/‎src/libOpenImageIO/CMakeLists.txt‎
Lines changed: 22 additions & 4 deletions
diff --git a/‎src/libOpenImageIO/imagebuf.cpp‎
Lines changed: 44 additions & 7 deletions b/‎src/libOpenImageIO/imagebuf.cpp‎
Lines changed: 44 additions & 7 deletions
diff --git a/‎src/libOpenImageIO/imagebufalgo_addsub.cpp‎
Lines changed: 18 additions & 0 deletions b/‎src/libOpenImageIO/imagebufalgo_addsub.cpp‎
Lines changed: 18 additions & 0 deletions
@@ -239,6 +239,10 @@ ifneq (${BUILD_OIIOUTIL_ONLY},)
 MY_CMAKE_FLAGS += -DBUILD_OIIOUTIL_ONLY:BOOL=${BUILD_OIIOUTIL_ONLY}
 endif
 
+ifneq (${USE_CUDA},)
+MY_CMAKE_FLAGS += -DUSE_CUDA:BOOL=${USE_CUDA}
+endif
+
 ifdef DEBUG
 MY_CMAKE_FLAGS += -DCMAKE_BUILD_TYPE:STRING=Debug
 endif
@@ -510,6 +514,7 @@ help:
 	@echo "      USE_OPENCV=0             Skip anything that needs OpenCV"
 	@echo "      USE_PTEX=0               Skip anything that needs PTex"
 	@echo "      USE_FREETYPE=0           Skip anything that needs Freetype"
+	@echo "      USE_CUDA=1               Build NVIDIA CUDA support (if found)"
 	@echo "  OIIO build-time options:"
 	@echo "      INSTALL_PREFIX=path      Set installation prefix (default: ./${INSTALL_PREFIX_BRIEF})"
 	@echo "      NAMESPACE=name           Override namespace base name (default: OpenImageIO)"
 
@@ -120,6 +120,11 @@ ifeq (${SP_OS}, rhel7)
 	    -DOPENEXR_CUSTOM_LIB_DIR=/usr/lib64/OpenEXR2
     endif
 
+    # CUDA customizations
+    MY_CMAKE_FLAGS += \
+	-DCUDA_TOOLKIT_ROOT_DIR=/net/soft_scratch/apps/arnold/tools/nvidia/cuda9.1 \
+	-DCUDA_HOST_COMPILER=/shots/spi/home/lib/arnold/rhel7/llvm_4.0_final/bin/clang++
+
     MY_CMAKE_FLAGS += \
 	-DOCIO_PATH=${OCIO_PATH} \
 	-DFIELD3D_HOME=${FIELD3D_HOME} \
 
@@ -22,6 +22,8 @@ set (CLANG_TIDY_CHECKS "-*" CACHE STRING "clang-tidy checks to perform")
 set (CLANG_TIDY_ARGS "" CACHE STRING "clang-tidy args")
 option (CLANG_TIDY_FIX "Have clang-tidy fix source" OFF)
 set (GLIBCXX_USE_CXX11_ABI "" CACHE STRING "For gcc, use the new C++11 library ABI (0|1)")
+option (USE_CUDA "Use CUDA if found" OFF)
+set (CUDA_TARGET_ARCH "sm_30" CACHE STRING "CUDA GPU architecture (e.g. sm_35)")
 
 # Figure out which compiler we're using
 if (CMAKE_COMPILER_IS_GNUCC)
 
@@ -4,6 +4,7 @@
 # When not in VERBOSE mode, try to make things as quiet as possible
 if (NOT VERBOSE)
     set (Boost_FIND_QUIETLY true)
+    set (CUDA_FIND_QUIETLY true)
     set (DCMTK_FIND_QUIETLY true)
     set (FFmpeg_FIND_QUIETLY true)
     set (Field3D_FIND_QUIETLY true)
@@ -624,4 +625,27 @@ endmacro()
 
 
 ###########################################################################
+if (USE_CUDA)
+    if (NOT CUDA_TOOLKIT_ROOT_DIR AND NOT $ENV{CUDA_TOOLKIT_ROOT_DIR} STREQUAL "")
+        set (CUDA_TOOLKIT_ROOT_DIR $ENV{CUDA_TOOLKIT_ROOT_DIR})
+    endif ()
+    if (NOT CUDA_FIND_QUIETLY)
+        message (STATUS "CUDA_TOOLKIT_ROOT_DIR = ${CUDA_TOOLKIT_ROOT_DIR}")
+    endif ()
+    set (CUDA_PROPAGATE_HOST_FLAGS ON)
+    set (CUDA_VERBOSE_BUILD ${VERBOSE})
+    find_package (CUDA 7.0 REQUIRED)
+    list (APPEND CUDA_NVCC_FLAGS ${CSTD_FLAGS} -expt-relaxed-constexpr)
+    set (CUDA_INCLUDE_DIR ${CUDA_TOOLKIT_ROOT_DIR}/include)
+    message (STATUS "CUDA version = ${CUDA_VERSION}")
+    if (NOT CUDA_FIND_QUIETLY)
+        message (STATUS "CUDA includes  = ${CUDA_INCLUDE_DIR}")
+        message (STATUS "CUDA libraries = ${CUDA_LIBRARIES}")
+        message (STATUS "CUDA host compiler = ${CUDA_HOST_COMPILER}")
+        message (STATUS "CUDA nvcc flags = ${CUDA_NVCC_FLAGS}")
+    endif ()
+endif ()
+
+# end Cuda
+###########################################################################
 
@@ -183,6 +183,9 @@ class OIIO_API ImageBuf {
     /// Which type of storage is being used for the pixels?
     IBStorage storage () const;
 
+    /// Is the pixel memory of this ImageBuf visible to Cuda?
+    bool cuda_storage () const;
+
     /// Is this ImageBuf object initialized?
     bool initialized () const;
 
 
@@ -314,8 +314,10 @@
 // OIIO_HOSTDEVICE is used to supply the function decorators needed when
 // compiling for CUDA devices.
 #ifdef __CUDACC__
+#  define OIIO_HOST __host__
 #  define OIIO_HOSTDEVICE __host__ __device__
 #else
+#  define OIIO_HOST
 #  define OIIO_HOSTDEVICE
 #endif
 
 
@@ -4,6 +4,12 @@ endif ()
 file (TO_NATIVE_PATH "${PLUGIN_SEARCH_PATH}" PLUGIN_SEARCH_PATH_NATIVE)
 configure_file (imageio_pvt.h.in "${CMAKE_BINARY_DIR}/include/imageio_pvt.h" @ONLY)
 
+if (BUILDSTATIC)
+    set (OIIO_LIB_TYPE "STATIC")
+else ()
+    set (OIIO_LIB_TYPE "SHARED")
+endif ()
+
 file (GLOB libOpenImageIO_hdrs ../include/OpenImageIO/*.h)
 
 if (NOT USE_EXTERNAL_PUGIXML)
@@ -47,6 +53,7 @@ set (libOpenImageIO_srcs
                           imageoutput.cpp iptc.cpp xmp.cpp
                           color_ocio.cpp
                           maketexture.cpp
+                          imageio_cuda.cpp
                           ../libutil/argparse.cpp
                           ../libutil/benchmark.cpp
                           ../libutil/errorhandler.cpp 
@@ -73,6 +80,13 @@ set (libOpenImageIO_srcs
                           ${libOpenImageIO_hdrs}
                          )
 
+#set (cuda_using_srcs imagebufalgo_addsub.cpp)
+
+if (USE_CUDA)
+    file (GLOB gpu_source_files "*.cu")
+    message (STATUS "Extra cuda files: ${gpu_source_files}")
+    list (APPEND libOpenImageIO_srcs ${gpu_source_files})
+endif ()
 
 # If the 'EMBEDPLUGINS' option is set, we want to compile the source for
 # all the plugins into libOpenImageIO.
@@ -100,10 +114,12 @@ endif ()
 source_group ("libutil"    REGULAR_EXPRESSION ".+/libutil/.+")
 source_group ("libtexture" REGULAR_EXPRESSION ".+/libtexture/.+")
 
-if (BUILDSTATIC)
-    add_library (OpenImageIO STATIC ${libOpenImageIO_srcs})
+if (USE_CUDA)
+    add_definitions ("-DOIIO_USE_CUDA=1")
+    cuda_add_library (OpenImageIO ${OIIO_LIB_TYPE} ${libOpenImageIO_srcs}
+                      OPTIONS -arch ${CUDA_TARGET_ARCH})
 else ()
-    add_library (OpenImageIO SHARED ${libOpenImageIO_srcs})
+    add_library (OpenImageIO ${OIIO_LIB_TYPE} ${libOpenImageIO_srcs})
 endif ()
 
 # if (SANITIZE AND ${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
@@ -206,7 +222,9 @@ if (EXTRA_DSO_LINK_ARGS)
     set_target_properties (OpenImageIO PROPERTIES LINK_FLAGS ${EXTRA_DSO_LINK_ARGS})
 endif()
 
-oiio_install_targets (OpenImageIO)
+oiio_install_targets (OpenImageIO 
+#${OIIO_CUDA_LIB}
+)
 
 
 # Testing
 
@@ -144,6 +144,7 @@ class ImageBufImpl {
     void append_error (const std::string& message) const;
 
     ImageBuf::IBStorage storage () const { return m_storage; }
+    bool cuda_storage () const { return m_cuda_storage; }
 
     TypeDesc pixeltype () const {
         validate_spec ();
@@ -194,6 +195,7 @@ class ImageBufImpl {
                               m_current_subimage, m_current_miplevel);
     }
 
+    // Make sure the pixels are ready to read with an iterator.
     bool validate_pixels () const {
         if (m_pixels_valid)
             return true;
@@ -259,6 +261,7 @@ class ImageBufImpl {
     mutable spin_mutex m_valid_mutex;
     mutable bool m_spec_valid;   ///< Is the spec valid
     mutable bool m_pixels_valid; ///< Image is valid
+    mutable bool m_cuda_storage = false; ///< Is the pixel memory visible to Cuda?
     bool m_badfile;              ///< File not found
     float m_pixelaspect;         ///< Pixel aspect ratio of the image
     size_t m_pixel_bytes;
@@ -373,6 +376,7 @@ ImageBufImpl::ImageBufImpl (const ImageBufImpl &src)
         // Source had the image fully in memory (no cache)
         if (m_storage == ImageBuf::APPBUFFER) {
             // Source just wrapped the client app's pixels, we do the same
+            m_allocated_size = src.m_localpixels ? src.spec().image_bytes() : 0;
             m_localpixels = src.m_localpixels;
         } else {
             // We own our pixels -- copy from source
@@ -498,7 +502,28 @@ ImageBufImpl::new_pixels (size_t size, const void *data)
     if (m_allocated_size)
         free_pixels();
     m_allocated_size = size;
-    m_pixels.reset (size ? new char [size] : nullptr);
+    m_cuda_storage = false;
+    if (size) {
+#ifdef OIIO_USE_CUDA
+        if (OIIO::get_int_attribute("cuda") && m_spec.format == TypeFloat) {
+            char *cudaptr = (char *)OIIO::pvt::cuda_malloc (m_allocated_size);
+            if (cudaptr) {
+                OIIO::debug ("IB Cuda allocated %p\n", (void*)cudaptr);
+                m_pixels.reset (cudaptr);
+                m_cuda_storage = true;
+            }
+            else {
+                OIIO::debug ("Requested cudaMallocManaged of %s FAILED\n",
+                             m_allocated_size);
+            }
+        }
+#endif
+        if (! m_pixels) {
+            m_pixels.reset (new char [size]);
+        }
+    } else {
+        m_pixels.reset ();
+    }
     IB_local_mem_current += m_allocated_size;
     if (data && size)
         memcpy (m_pixels.get(), data, size);
@@ -515,12 +540,21 @@ void
 ImageBufImpl::free_pixels ()
 {
     IB_local_mem_current -= m_allocated_size;
+#if OIIO_USE_CUDA
+    if (m_cuda_storage) {
+        OIIO::debug ("IB Cuda free %p\n", (void*)m_pixels.get());
+        OIIO::pvt::cuda_free (m_pixels.release());
+        m_cuda_storage = false;
+    }
+#endif
     m_pixels.reset ();
     if (m_allocated_size && pvt::oiio_print_debug > 1)
         OIIO::debug ("IB freed %d MB, global IB memory now %d MB\n",
                      m_allocated_size>>20, IB_local_mem_current>>20);
     m_allocated_size = 0;
     m_storage = ImageBuf::UNINITIALIZED;
+    m_localpixels = nullptr;
+    m_pixels_valid = false;
 }
 
 
@@ -577,6 +611,14 @@ ImageBuf::storage () const
 
 
 
+bool
+ImageBuf::cuda_storage () const
+{
+    return impl()->cuda_storage ();
+}
+
+
+
 void
 ImageBufImpl::clear ()
 {
@@ -588,10 +630,8 @@ ImageBufImpl::clear ()
     m_current_miplevel = -1;
     m_spec = ImageSpec ();
     m_nativespec = ImageSpec ();
-    m_pixels.reset ();
-    m_localpixels = NULL;
+    release_pixels ();
     m_spec_valid = false;
-    m_pixels_valid = false;
     m_badfile = false;
     m_pixelaspect = 1;
     m_pixel_bytes = 0;
@@ -700,8 +740,6 @@ ImageBufImpl::realloc ()
     m_channel_bytes = m_spec.format.size();
     m_blackpixel.resize (round_to_multiple (m_pixel_bytes, OIIO_SIMD_MAX_SIZE_BYTES), 0);
     // NB make it big enough for SSE
-    if (m_allocated_size)
-        m_pixels_valid = true;
     if (m_spec.deep) {
         m_deepdata.init (m_spec);
         m_storage = ImageBuf::LOCALBUFFER;
@@ -2440,5 +2478,4 @@ ImageBuf::retile (int x, int y, int z, ImageCache::Tile* &tile,
 }
 
 
-
 OIIO_NAMESPACE_END
@@ -44,6 +44,7 @@
 #include <OpenImageIO/deepdata.h>
 #include <OpenImageIO/dassert.h>
 #include "imageio_pvt.h"
+#include "imagebufalgo_cuda.h"
 
 
 
@@ -121,6 +122,15 @@ ImageBufAlgo::add (ImageBuf &dst, Image_or_Const A_, Image_or_Const B_,
             return false;
         ROI origroi = roi;
         roi.chend = std::min (roi.chend, std::min (A.nchannels(), B.nchannels()));
+
+#ifdef OIIO_USE_CUDA
+        if (dst.cuda_storage() && A.cuda_storage() && B.cuda_storage() &&
+                dst.roi() == roi && A.roi() == roi && B.roi() == roi) {
+            return pvt::add_impl_cuda (dst, A, B, dst.roi());
+        }
+// make >/dev/null && OPENIMAGEIO_LOG_TIMES=2 oiiotool -cuda -frames 1-1 -pattern fill:topleft=0,0,0:topright=0.5,0,0:bottomleft=0,0.5,0:bottomright=0.5,0.5,0.5 1920x1080 3 -pattern fill:topleft=0.5,0,0:topright=0,0.5,0:bottomleft=0.5,0.5,0.5:bottomright=0,.5,.5 1920x1080 3 -add -o out.exr
+#endif
+
         bool ok;
         OIIO_DISPATCH_COMMON_TYPES3 (ok, "add", add_impl, dst.spec().format,
                                      A.spec().format, B.spec().format,
@@ -209,6 +219,14 @@ ImageBufAlgo::sub (ImageBuf &dst, Image_or_Const A_, Image_or_Const B_,
             return false;
         ROI origroi = roi;
         roi.chend = std::min (roi.chend, std::min (A.nchannels(), B.nchannels()));
+
+#ifdef OIIO_USE_CUDA
+        if (dst.cuda_storage() && A.cuda_storage() && B.cuda_storage() &&
+                dst.roi() == roi && A.roi() == roi && B.roi() == roi) {
+            return pvt::sub_impl_cuda (dst, A, B, dst.roi());
+        }
+#endif
+
         bool ok;
         OIIO_DISPATCH_COMMON_TYPES3 (ok, "sub", sub_impl, dst.spec().format,
                                      A.spec().format, B.spec().format,