diff --git a/examples/gc.jl b/examples/gc.jl new file mode 100644 index 00000000..4c53ecaf --- /dev/null +++ b/examples/gc.jl @@ -0,0 +1,57 @@ +using CUDAdrv, CUDAnative +using Test + +mutable struct TempStruct + data::Float32 +end + +@noinline function escape(val) + Base.pointer_from_objref(val) +end + +function upload!(destination, source) + Mem.copy!(destination, pointer(source), sizeof(source)) +end + +function download(::Type{T}, source, dims) where T + result = Array{T}(undef, dims) + Mem.copy!(pointer(result), source, sizeof(result)) + result +end + +# Define a kernel that copies values using a temporary struct. +function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + for j in 1:2 + # Allocate a mutable struct and make sure it ends up on the GC heap. + temp = TempStruct(unsafe_load(a, i)) + escape(temp) + + # Allocate a large garbage buffer to force collections. + gc_malloc(Csize_t(256 * 1024)) + + # Use the mutable struct. If its memory has been reclaimed (by accident) + # then we expect the test at the end of this file to fail. + unsafe_store!(b, temp.data, i) + end + + return +end + +thread_count = 256 + +# Allocate two arrays. +source_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float32) * thread_count) +destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float32) * thread_count) +source_pointer = Base.unsafe_convert(CuPtr{Float32}, source_array) +destination_pointer = Base.unsafe_convert(CuPtr{Float32}, destination_array) + +# Fill the source and destination arrays. +upload!(source_array, fill(42.f0, thread_count)) +upload!(destination_array, zeros(Float32, thread_count)) + +# Run the kernel. +@cuda gc=true threads=thread_count kernel(source_pointer, destination_pointer) + +@test download(Float32, destination_array, thread_count) == fill(42.f0, thread_count) diff --git a/examples/interrupt-memory.jl b/examples/interrupt-memory.jl new file mode 100644 index 00000000..631bb6ce --- /dev/null +++ b/examples/interrupt-memory.jl @@ -0,0 +1,54 @@ +using CUDAdrv, CUDAnative +using Test + +function upload!(destination, source) + Mem.copy!(destination, pointer(source), sizeof(source)) +end + +function download(::Type{T}, source, dims) where T + result = Array{T}(undef, dims) + Mem.copy!(pointer(result), source, sizeof(result)) + result +end + +# Define a kernel that copies some data from one array to another. +# The host is invoked to populate the source array. +function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + interrupt_or_wait() + threadfence_system() + Base.unsafe_store!(b, Base.unsafe_load(a, i), i) + return +end + +thread_count = 64 + +# Allocate two arrays. +source_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float32) * thread_count) +destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float32) * thread_count) +source_pointer = Base.unsafe_convert(CuPtr{Float32}, source_array) +destination_pointer = Base.unsafe_convert(CuPtr{Float32}, destination_array) + +# Zero-fill the source and destination arrays. +upload!(source_array, zeros(Float32, thread_count)) +upload!(destination_array, zeros(Float32, thread_count)) + +# Define one stream for kernel execution and another for +# data transfer. +data_stream = CuStream() +exec_stream = CuStream() + +# Define a magic value. +magic = 42.f0 + +# Configure the interrupt to fill the input array with the magic value. 
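# The handler below passes a stream and `async = true` through `upload!`,
# which the two-argument helper defined above does not forward. A minimal
# sketch of the assumed overload (the exact `Mem.copy!` keyword names depend
# on the CUDAdrv version in use):
#
#     function upload!(destination, source, stream; async::Bool = false)
#         Mem.copy!(destination, pointer(source), sizeof(source);
#                   stream = stream, async = async)
#     end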
+function handle_interrupt() + upload!(source_array, fill(magic, thread_count), data_stream; async = true) + synchronize(data_stream) +end + +# Run the kernel. +@cuda_interruptible handle_interrupt threads=thread_count stream=exec_stream kernel(source_pointer, destination_pointer) + +# Check that the destination buffer is as expected. +@test download(Float32, destination_array, thread_count) == fill(magic, thread_count) diff --git a/examples/interrupt.jl b/examples/interrupt.jl new file mode 100644 index 00000000..a1c8f81e --- /dev/null +++ b/examples/interrupt.jl @@ -0,0 +1,24 @@ +using CUDAdrv, CUDAnative +using Test + +# Define a kernel that makes the host count. +function kernel() + interrupt() + return +end + +thread_count = 64 + +# Configure the interrupt to increment a counter. +global counter = 0 +function handle_interrupt() + global counter + counter += 1 +end + +# Run the kernel. +@cuda_interruptible handle_interrupt threads=thread_count kernel() + +# Check that the counter's final value equals the number +# of threads. +@test counter == thread_count diff --git a/examples/linked-list.jl b/examples/linked-list.jl new file mode 100644 index 00000000..ecb802ac --- /dev/null +++ b/examples/linked-list.jl @@ -0,0 +1,88 @@ +using CUDAnative, CUDAdrv +using Test +import Base: foldl, reduce, sum + +# This test constructs a linked list in a GPU kernel. + +use_gc = true + +abstract type List{T} +end + +mutable struct Nil{T} <: List{T} +end + +mutable struct Cons{T} <: List{T} + value::T + next::List{T} +end + +Cons{T}(value::T) where T = Cons{T}(value, Nil{T}()) + +function List{T}(pointer, count::Integer) where T + result = Nil{T}() + for i in count:-1:1 + result = Cons{T}(unsafe_load(pointer, i), result) + end + result +end + +function foldl(op, list::List{T}; init) where T + node = list + accumulator = init + while isa(node, Cons{T}) + accumulator = op(accumulator, node.value) + node = node.next + end + accumulator +end + +function reduce(op, list::List{T}; init) where T + foldl(op, list; init=init) +end + +function sum(list::List{T}) where T + reduce(+, list; init=zero(T)) +end + +const element_count = 2000 +const thread_count = 32 + +function upload!(destination, source) + Mem.copy!(destination, pointer(source), sizeof(source)) +end + +function download(::Type{T}, source, dims) where T + result = Array{T}(undef, dims) + Mem.copy!(pointer(result), source, sizeof(result)) + result +end + +function kernel(elements::CUDAnative.DevicePtr{Int64}, results::CUDAnative.DevicePtr{Int64}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + l = List{Int64}(elements, element_count) + unsafe_store!(results, sum(l), i) + return +end + +# Allocate two arrays. +source_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int64) * element_count) +destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int64) * thread_count) +source_pointer = Base.unsafe_convert(CuPtr{Int64}, source_array) +destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + +# Fill the source and destination arrays. +upload!(source_array, Array(1:element_count)) +upload!(destination_array, zeros(Int64, thread_count)) + +# Run the kernel. 
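# The GC launch below uses a deliberately small heap configuration: the
# `GCConfiguration` sizes are in bytes, so the global arena starts at just
# 1 KiB with a 1 KiB starvation threshold, which should force the collector
# to run and to grow the heap while each thread builds its list. The second
# launch returns a statistics object, which is what gets printed; the
# non-GC branch instead prints the time measured by `CUDAdrv.@elapsed`.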
+if use_gc + @cuda gc=true threads=thread_count gc_config=GCConfiguration(; global_arena_initial_size=1024, global_arena_starvation_threshold=1024) kernel(source_pointer, destination_pointer) + stats = @cuda gc=true threads=thread_count kernel(source_pointer, destination_pointer) +else + @cuda threads=thread_count kernel(source_pointer, destination_pointer) + stats = CUDAdrv.@elapsed @cuda threads=thread_count kernel(source_pointer, destination_pointer) +end +println(stats) + +@test download(Int64, destination_array, thread_count) == repeat([sum(1:element_count)], thread_count) diff --git a/examples/lock.jl b/examples/lock.jl new file mode 100644 index 00000000..8f59d100 --- /dev/null +++ b/examples/lock.jl @@ -0,0 +1,46 @@ +using CUDAdrv, CUDAnative +using Test + +const thread_count = Int32(128) +const total_count = Int32(1024) + +# Define a kernel that atomically increments a counter using a lock. +function increment_counter(counter::CUDAnative.DevicePtr{Int32}, lock_state::CUDAnative.DevicePtr{CUDAnative.MutexState}) + lock = Mutex(lock_state) + done = false + while !done && try_lock(lock) + new_count = unsafe_load(counter) + 1 + unsafe_store!(counter, new_count) + if new_count == total_count + done = true + end + CUDAnative.unlock(lock) + end + return +end + +function upload!(destination, source) + Mem.copy!(destination, pointer(source), sizeof(source)) +end + +function download(::Type{T}, source, dims) where T + result = Array{T}(undef, dims) + Mem.copy!(pointer(result), source, sizeof(result)) + result +end + +# Allocate memory for the counter and the lock. +counter_buf = Mem.alloc(Mem.DeviceBuffer, sizeof(Int32)) +upload!(counter_buf, [Int32(0)]) +counter_pointer = Base.unsafe_convert(CuPtr{Int32}, counter_buf) + +lock_buf = Mem.alloc(Mem.DeviceBuffer, sizeof(CUDAnative.MutexState)) +upload!(lock_buf, [CUDAnative.MutexState(0)]) +lock_pointer = Base.unsafe_convert(CuPtr{CUDAnative.MutexState}, lock_buf) + +# Run the kernel. +@cuda threads=thread_count increment_counter(counter_pointer, lock_pointer) + +# Check that the counter's final value equals the number +# of threads. +@test download(Int32, counter_buf) == [Int32(total_count)] diff --git a/examples/matrix.jl b/examples/matrix.jl new file mode 100644 index 00000000..8a607103 --- /dev/null +++ b/examples/matrix.jl @@ -0,0 +1,133 @@ +# This example has kernels allocate dense symmetric matrices, fill them with Fibonacci numbers +# and compute their squares. The example is designed to stress the garbage allocator, specifically +# testing its ability to deal with many large objects. Furthermore, the example requires multiple +# collections to run to completion, so it also tests the performance of those collections. + +using StaticArrays, CUDAnative, CUDAdrv +import Base: getindex, setindex!, pointer, unsafe_convert, zeros + +const use_gc = true + +"""A fixed-size, heap-allocated array type for CUDAnative kernels.""" +struct FixedArray{T} + # The number of elements in the array. + size::Int + + # A pointer to the first element in the array. + # + # TODO: maybe protect this pointer from the GC somehow? + # At the moment, this pointer is protected automatically + # because the GC is conservative rather than precise. + ptr::Ptr{T} +end + +"""Allocates a heap-allocated array type and fills it with zeros.""" +function zeros(::Type{FixedArray{T}}, size::Int) where T + # Note: GC memory is always zero-initialized, so we don't + # actually have to fill the array with zeros. + bytesize = Csize_t(sizeof(T) * size) + buf = use_gc ? 
gc_malloc(bytesize) : CUDAnative.malloc(bytesize) + FixedArray{T}(size, unsafe_convert(Ptr{T}, buf)) +end + +"""Gets a pointer to the first element of a fixed-size array.""" +function pointer(array::FixedArray{T})::Ptr{T} where T + array.ptr +end + +function getindex(array::FixedArray{T}, i::Integer)::T where T + # TODO: bounds checking. + unsafe_load(pointer(array), i) +end + +function setindex!(array::FixedArray{T}, value::T, i::Integer) where T + # TODO: bounds checking. + unsafe_store!(pointer(array), value, i) +end + +"""A heap-allocated matrix type, suitable for CUDAnative kernels.""" +struct Matrix{Width, Height, T} + data::FixedArray{T} +end + +Matrix{Width, Height, T}() where {Width, Height, T} = + Matrix{Width, Height, T}(zeros(FixedArray{T}, Width * Height)) + +function pointer(matrix::Matrix{Width, Height, T})::Ptr{T} where {Width, Height, T} + pointer(matrix.data) +end + +function getindex(matrix::Matrix{Width, Height, T}, row::Int, column::Int) where {Width, Height, T} + getindex(matrix.data, (row - 1) * Width + column) +end + +function setindex!(matrix::Matrix{Width, Height, T}, value::T, row::Int, column::Int) where {Width, Height, T} + setindex!(matrix.data, value, (row - 1) * Width + column) +end + +const matrix_dim = 50 +const iterations = 20 +const thread_count = 256 + +function kernel(result::CUDAnative.DevicePtr{Int64}) + thread_id = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + accumulator = 0 + + for _ in 1:iterations + # Allocate a matrix. + matrix = Matrix{matrix_dim, matrix_dim, Int64}() + + # Fill it with Fibonacci numbers. + penultimate = 0 + ultimate = 1 + for i in 1:matrix_dim + for j in 1:matrix_dim + matrix[i, j] = ultimate + tmp = ultimate + ultimate = ultimate + penultimate + penultimate = tmp + end + end + + # Create a new element that contains the square of + # every element in `matrix`. + square = Matrix{matrix_dim, matrix_dim, Int64}() + for i in 1:matrix_dim + for j in 1:matrix_dim + square[i, j] = matrix[i, j] ^ 2 + end + end + + # Compute the sum of the squares. + square_sum = 0 + for i in 1:matrix_dim + for j in 1:matrix_dim + square_sum += square[i, j] + end + end + + # Add that sum to an accumulator. + accumulator += square_sum + end + + # Write the accumulator to the result array. + unsafe_store!(result, accumulator, thread_id) + + return +end + +destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int64) * thread_count) +destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + +if use_gc + time = @cuda gc=true threads=thread_count kernel(destination_pointer) + println(time) + time = @cuda gc=true threads=thread_count kernel(destination_pointer) + println(time) +else + time = CUDAdrv.@elapsed @cuda threads=thread_count kernel(destination_pointer) + println(time) + time = CUDAdrv.@elapsed @cuda threads=thread_count kernel(destination_pointer) + println(time) +end diff --git a/examples/stdlib-array.jl b/examples/stdlib-array.jl new file mode 100644 index 00000000..157a468f --- /dev/null +++ b/examples/stdlib-array.jl @@ -0,0 +1,20 @@ +using CUDAdrv, CUDAnative, StaticArrays + +# This example allocates an array in a GPU kernel. 
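# The array literal and the comprehension in `kernel` lower to the standard
# Julia array intrinsics (such as `jl_alloc_array_1d`), which the GC-enabled
# runtime services on the device. The `escape` helper takes the object's
# address behind a `@noinline` barrier so that the allocations are not
# optimized away.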
+ +const thread_count = 64 + +@noinline function escape(value) + Base.pointer_from_objref(value) + value +end + +function kernel() + array = [1, 2, 3, 4, 5, 6, 7] + escape(array) + comp = [i * i for i in array] + escape(comp) + return +end + +@cuda gc=true threads=thread_count kernel() diff --git a/gc-benchmarks/array-expansion.jl b/gc-benchmarks/array-expansion.jl new file mode 100644 index 00000000..f7b43075 --- /dev/null +++ b/gc-benchmarks/array-expansion.jl @@ -0,0 +1,46 @@ +module ArrayExpansion + +using CUDAdrv, CUDAnative + +# This benchmark has every thread create arrays and repeatedly +# append elements to those arrays. + +const thread_count = 256 +const array_length = 200 +const runs = 5 + +function iterative_sum(elements::Array{T})::T where T + result = zero(T) + for i in elements + result += i + end + return result +end + +function kernel(destination) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + result = 0 + for j in 1:runs + array = Int[] + for k in 1:array_length + push!(array, k) + end + result += iterative_sum(array) + end + unsafe_store!(destination, result, i) + return +end + +end + +function array_expansion_benchmark() + destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int) * ArrayExpansion.thread_count) + destination_pointer = Base.unsafe_convert(CuPtr{Int}, destination_array) + + # Run the kernel. + @cuda_sync threads=ArrayExpansion.thread_count ArrayExpansion.kernel(destination_pointer) + + @test download(Int, destination_array, ArrayExpansion.thread_count) == fill(ArrayExpansion.runs * sum(1:ArrayExpansion.array_length), ArrayExpansion.thread_count) +end + +@cuda_benchmark "array expansion" array_expansion_benchmark() diff --git a/gc-benchmarks/array-features.jl b/gc-benchmarks/array-features.jl new file mode 100644 index 00000000..045d52bc --- /dev/null +++ b/gc-benchmarks/array-features.jl @@ -0,0 +1,112 @@ +module ArrayFeatures + +using CUDAdrv, CUDAnative + +# This benchmark has every thread exercise the core low-level +# array API. + +const thread_count = 256 + +# Creates an array of Fibonacci numbers. +function fib_array(count::Integer) + # Calls `jl_alloc_array_1d`. + result = [1, 1] + # Calls `jl_array_sizehint`. + sizehint!(result, count + 2) + for i in 1:count + # Calls `jl_array_grow_end`. + push!(result, result[i] + result[i + 1]) + end + return result +end + +function intersperse_with!(vec::Vector{T}, value::T) where T + for i in 1:length(vec) + # Calls `jl_array_grow_at`. + insert!(vec, i * 2, value) + end + return vec +end + +function iterative_sum(array) + result = 0 + for i in array + result += i + end + return result +end + +function manipulate_array() + # Initialize the array as a Fibonacci sequence. + arr = fib_array(20) + + # Intersperse the array with constants. + intersperse_with!(arr, 2) + + # Prepend a constant to the array (calls `jl_array_grow_beg`). + pushfirst!(arr, 2) + + # Intersperse again. + intersperse_with!(arr, 4) + + # Delete the first element (calls `jl_array_del_beg`). + popfirst!(arr) + + # Delete the last element (calls `jl_array_del_end`). + pop!(arr) + + # Delete some other element (calls `jl_array_del_at`). + deleteat!(arr, 8) + + # Create a two-dimensional array (calls `jl_alloc_array_2d`). + arr_2d = fill(2, (2, 2)) + + # Create a three-dimensional array (calls `jl_alloc_array_3d`). + arr_3d = fill(2, (2, 2, 2)) + + # Create a four-dimensional array (calls `jl_new_array`). 
+ arr_4d = fill(2, (2, 2, 2, 2)) + + # Create an alias for the Fibonacci array (this is dangerous, but we + # know what we're doing here; calls `jl_ptr_to_array_1d`). + alias = unsafe_wrap(Array, pointer(arr), length(arr)) + + # Create an alias for `arr_2d` (calls `jl_ptr_to_array`). + alias_2d = unsafe_wrap(Array, pointer(arr_2d), size(arr_2d)) + + # Create an array that is similar to `arr_3d` and fill it with constants. + # This does not call any new low-level functions, but it does illustrate + # that high-level functions such as `similar` and `fill!` fully functional. + arr_3d_sim = similar(arr_3d) + fill!(arr_3d_sim, 10) + + return iterative_sum(arr) + + iterative_sum(arr_2d) + + iterative_sum(arr_3d) + + iterative_sum(arr_4d) + + iterative_sum(alias) + + iterative_sum(alias_2d) + + iterative_sum(arr_3d_sim) +end + +function kernel(destination) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + for j in 1:2 + unsafe_store!(destination, manipulate_array(), i) + end + return +end + +end + +function array_features_benchmark() + destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int) * ArrayFeatures.thread_count) + destination_pointer = Base.unsafe_convert(CuPtr{Int}, destination_array) + + # Run the kernel. + @cuda_sync threads=ArrayFeatures.thread_count ArrayFeatures.kernel(destination_pointer) + + @test download(Int, destination_array, ArrayFeatures.thread_count) == fill(ArrayFeatures.manipulate_array(), ArrayFeatures.thread_count) +end + +@cuda_benchmark "array features" array_features_benchmark() diff --git a/gc-benchmarks/array-reduction.jl b/gc-benchmarks/array-reduction.jl new file mode 100644 index 00000000..b4747de3 --- /dev/null +++ b/gc-benchmarks/array-reduction.jl @@ -0,0 +1,43 @@ +module ArrayReduction + +using CUDAdrv, CUDAnative + +# This benchmark approximates pi by naively constructing an array comprehension +# for the Madhava–Leibniz series and computing its sum. It does this a few times +# to achieve a respectable run time. + +const thread_count = 256 +const series_length = 200 +const runs = 20 + +function iterative_sum(elements::Array{T})::T where T + result = zero(T) + for i in elements + result += i + end + return result +end + +function kernel(destination) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + unsafe_store!(destination, 0.0, i) + for _ in 1:runs + series = [CUDAnative.pow(-1 / 3.0, Float64(k)) / (2.0 * k + 1.0) for k in 0:series_length] + unsafe_store!(destination, unsafe_load(destination, i) + CUDAnative.sqrt(12.0) * iterative_sum(series), i) + end + return +end + +end + +function array_reduction_benchmark() + destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float64) * ArrayReduction.thread_count) + destination_pointer = Base.unsafe_convert(CuPtr{Float64}, destination_array) + + # Run the kernel. + @cuda_sync threads=ArrayReduction.thread_count ArrayReduction.kernel(destination_pointer) + + @test download(Float64, destination_array, ArrayReduction.thread_count) ≈ ArrayReduction.runs .* fill(Float64(pi), ArrayReduction.thread_count) +end + +@cuda_benchmark "array reduction" array_reduction_benchmark() diff --git a/gc-benchmarks/arrays.jl b/gc-benchmarks/arrays.jl new file mode 100644 index 00000000..1f247f6c --- /dev/null +++ b/gc-benchmarks/arrays.jl @@ -0,0 +1,48 @@ +module Arrays + +using CUDAdrv, CUDAnative +import ..CUDArandom: LinearCongruentialGenerator, next + +# This benchmark allocates a hierarchy of fairly modest Julia arrays. +# Some arrays remain alive, others become unreachable. 
This benchmark +# seeks to ascertain the performance of the allocator and garbage collector. + +const thread_count = 64 +const insertion_count = 80 + +function insert(target::Array{Any, 1}, generator::LinearCongruentialGenerator) + while true + index = next(generator, 1, length(target)) + elem = target[index] + if isa(elem, Array{Any, 1}) + if length(elem) > 0 + if next(generator, 0, 2) == 0 + target = elem + continue + end + end + end + + target[index] = Any[Any[] for _ in 1:5] + return + end +end + +function kernel() + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + generator = LinearCongruentialGenerator(i) + toplevel = Any[Any[] for _ in 1:10] + for i in 1:insertion_count + insert(toplevel, generator) + end + return +end + +end + +function arrays_benchmark() + # Run the kernel. + @cuda_sync threads=Arrays.thread_count Arrays.kernel() +end + +@cuda_benchmark "arrays" arrays_benchmark() diff --git a/gc-benchmarks/binary-tree.jl b/gc-benchmarks/binary-tree.jl new file mode 100644 index 00000000..8341bb45 --- /dev/null +++ b/gc-benchmarks/binary-tree.jl @@ -0,0 +1,168 @@ +using Random, Test + +module BinaryTree + +using CUDAdrv, CUDAnative +import Base: haskey, insert! + +# This benchmark defines a kernel that constructs a binary search +# tree for a set of numbers and then proceeds to test membership +# in that tree for a sequence of other numbers. +# +# The benchmark is designed to stress the allocator's ability to +# allocate many small objects and garbage-collect the ones that +# become dead after a while. + +"""A binary search tree node.""" +abstract type BinarySearchTreeNode{T} end + +"""An internal node of a binary search tree.""" +mutable struct InternalNode{T} <: BinarySearchTreeNode{T} + value::T + left::BinarySearchTreeNode{T} + right::BinarySearchTreeNode{T} +end + +InternalNode{T}(value::T) where T = InternalNode{T}(value, LeafNode{T}(), LeafNode{T}()) + +"""A leaf node of a binary search tree.""" +mutable struct LeafNode{T} <: BinarySearchTreeNode{T} end + +"""A binary search tree data structure.""" +mutable struct BinarySearchTree{T} + root::BinarySearchTreeNode{T} +end + +"""Creates an empty binary search tree.""" +BinarySearchTree{T}() where T = BinarySearchTree{T}(LeafNode{T}()) + +"""Tells if a binary search tree contains a particular element.""" +function haskey(tree::BinarySearchTree{T}, value::T)::Bool where T + walk = tree.root + while isa(walk, InternalNode{T}) + if walk.value == value + return true + elseif walk.value > value + walk = walk.right + else + walk = walk.left + end + end + return false +end + +"""Inserts an element into a binary search tree.""" +function insert!(tree::BinarySearchTree{T}, value::T) where T + if !isa(tree.root, InternalNode{T}) + tree.root = InternalNode{T}(value) + return + end + + walk = tree.root::InternalNode{T} + while true + if walk.value == value + return + elseif walk.value > value + right = walk.right + if isa(right, InternalNode{T}) + walk = right + else + walk.right = InternalNode{T}(value) + return + end + else + left = walk.left + if isa(left, InternalNode{T}) + walk = left + else + walk.left = InternalNode{T}(value) + return + end + end + end +end + +""" +Creates a binary search tree that contains elements copied from a device array. +""" +function BinarySearchTree{T}(elements::CUDAnative.DevicePtr{T}, size::Integer) where T + tree = BinarySearchTree{T}() + for i in 1:size + insert!(tree, unsafe_load(elements, i)) + end + tree +end + +""" +Creates a binary search tree that contains elements copied from an array. 
+""" +function BinarySearchTree{T}(elements::Array{T}) where T + tree = BinarySearchTree{T}() + for i in 1:length(elements) + insert!(tree, elements[i]) + end + tree +end + +# Gets a sequence of Fibonacci numbers. +function fibonacci(::Type{T}, count::Integer)::Array{T} where T + if count == 0 + return [] + elseif count == 1 + return [one(T)] + end + + results = [one(T), one(T)] + for i in 1:(count - 2) + push!(results, results[length(results) - 1] + results[length(results)]) + end + return results +end + +const number_count = 200 +const thread_count = 64 +const tests_per_thread = 2000 + +# Define a kernel that copies values using a temporary buffer. +function kernel(a::CUDAnative.DevicePtr{Int64}, b::CUDAnative.DevicePtr{Int64}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + tree = BinarySearchTree{Int64}(a, number_count) + + for j in 1:tests_per_thread + offset = (i - 1) * tests_per_thread + index = offset + j + unsafe_store!(b, haskey(tree, unsafe_load(b, index)), index) + end + + return +end + +end + +function bintree_benchmark() + # Generate a sequence of 64-bit truncated Fibonacci numbers. + number_set = BinaryTree.fibonacci(Int64, BinaryTree.number_count) + # Randomize the sequence's order. + shuffle!(number_set) + + # Generate numbers for which we will test membership in the sequence. + test_sequence = Array(1:(BinaryTree.thread_count * BinaryTree.tests_per_thread)) + + # Allocate two arrays. + source_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int64) * length(number_set)) + destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int64) * length(test_sequence)) + source_pointer = Base.unsafe_convert(CuPtr{Int64}, source_array) + destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + + # Fill the source and destination arrays. + upload!(source_array, number_set) + upload!(destination_array, test_sequence) + + # Run the kernel. + @cuda_sync threads=BinaryTree.thread_count BinaryTree.kernel(source_pointer, destination_pointer) + + @test download(Int64, destination_array, length(test_sequence)) == ([Int64(x in number_set) for x in test_sequence]) +end + +@cuda_benchmark "binary tree" bintree_benchmark() diff --git a/gc-benchmarks/bitvector.jl b/gc-benchmarks/bitvector.jl new file mode 100644 index 00000000..59892e92 --- /dev/null +++ b/gc-benchmarks/bitvector.jl @@ -0,0 +1,101 @@ +module Bitvector + +import Base: +, *, << +using CUDAnative + +# This benchmark performs naive arithmetic on bitvectors. +# The goal of the benchmark is to gauge how GPU-unaware +# standard library code that depends on arrays behaves when +# used in a GPU kernel. 
+ +const thread_count = 256 + +@noinline function escape(value) + Base.pointer_from_objref(value) + value +end + +mutable struct BitInteger{N} + bits::BitVector +end + +function zero(::Type{BitInteger{N}})::BitInteger{N} where N + BitInteger{N}(falses(N)) +end + +function one(::Type{BitInteger{N}})::BitInteger{N} where N + result = falses(N) + result[1] = true + return BitInteger{N}(result) +end + +function +(a::BitInteger{N}, b::BitInteger{N})::BitInteger{N} where N + carry = false + c = falses(N) + for i in 1:N + s = Int(a.bits[i]) + Int(b.bits[i]) + Int(carry) + if s == 1 + carry = false + c[i] = true + elseif s == 2 + carry = true + elseif s == 3 + carry = true + c[i] = true + end + end + return BitInteger{N}(c) +end + +function <<(a::BitInteger{N}, amount::Integer)::BitInteger{N} where N + c = falses(N) + for i in 1:(N - amount) + c[i + amount] = a.bits[i] + end + return BitInteger{N}(c) +end + +function *(a::BitInteger{N}, b::BitInteger{N})::BitInteger{N} where N + c = zero(BitInteger{N}) + for i in 1:N + if a.bits[i] + c += (b << (i - 1)) + end + end + return c +end + +function factorial(::Type{BitInteger{N}}, value::Integer)::BitInteger{N} where N + accumulator = one(BitInteger{N}) + iv = one(BitInteger{N}) + for i in 1:value + accumulator *= iv + iv += one(BitInteger{N}) + end + return accumulator +end + +function to_int(value::BitInteger{N})::Int where N + result = 0 + for i in 1:N + if value.bits[i] + result += (1 << (i - 1)) + end + end + return result +end + +function kernel() + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + factorial(BitInteger{128}, 10) + return +end + +end + +function bitvector_benchmark() + # Run the kernel. + @cuda_sync threads=Bitvector.thread_count Bitvector.kernel() +end + +@cuda_benchmark "bitvector" bitvector_benchmark() diff --git a/gc-benchmarks/genetic-algorithm.jl b/gc-benchmarks/genetic-algorithm.jl new file mode 100644 index 00000000..06a83f74 --- /dev/null +++ b/gc-benchmarks/genetic-algorithm.jl @@ -0,0 +1,158 @@ +module GeneticAlgorithm + +# This benchmark runs a genetic algorithm on the GPU. +# The population is stored in linked lists and characters +# are stored in heap memory. + +using CUDAnative, CUDAdrv +import ..LinkedList: List, Nil, Cons, foldl, map, max +import ..CUDArandom: LinearCongruentialGenerator, next + +# A character in our genetic algorithm, based loosely on Fallout's SPECIAL system. +mutable struct Character + strength::Int + perception::Int + endurance::Int + charisma::Int + intelligence::Int + agility::Int + luck::Int +end + +# Computes the mean of two integers. 
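# Integer division truncates, so for the non-negative stat values used here
# the result is the floor of the true mean.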
+function mean(a::Int, b::Int)::Int + div(a + b, 2) +end + +function crossover(parent_one::Character, parent_two::Character)::Character + Character( + mean(parent_one.strength, parent_two.strength), + mean(parent_one.perception, parent_two.perception), + mean(parent_one.endurance, parent_two.endurance), + mean(parent_one.charisma, parent_two.charisma), + mean(parent_one.intelligence, parent_two.intelligence), + mean(parent_one.agility, parent_two.agility), + mean(parent_one.luck, parent_two.luck)) +end + +function mutate_stat(value::Int, generator::LinearCongruentialGenerator)::Int + new_stat = value + next(generator, -2, 3) + if new_stat > 10 + return 10 + elseif new_stat < 0 + return 0 + else + return new_stat + end +end + +function mutate(original::Character, generator::LinearCongruentialGenerator)::Character + Character( + mutate_stat(original.strength, generator), + mutate_stat(original.perception, generator), + mutate_stat(original.endurance, generator), + mutate_stat(original.charisma, generator), + mutate_stat(original.intelligence, generator), + mutate_stat(original.agility, generator), + mutate_stat(original.luck, generator)) +end + +function random_character(generator::LinearCongruentialGenerator)::Character + Character( + next(generator, 0, 11), + next(generator, 0, 11), + next(generator, 0, 11), + next(generator, 0, 11), + next(generator, 0, 11), + next(generator, 0, 11), + next(generator, 0, 11)) +end + +# Computes the fitness of a character. +function fitness(individual::Character)::Float64 + # Compute the character's cost, i.e., the sum of their stats. + cost = Float64(individual.strength + + individual.perception + + individual.endurance + + individual.charisma + + individual.intelligence + + individual.agility + + individual.luck) + + # Compute the character's true fitness, i.e., how well we expect + # the character to perform. + true_fitness = 0.0 + + function stat_fitness(stat::Int)::Float64 + if stat >= 5 + # Linear returns for stats greater than five. + return Float64(stat) + else + # Very low stats make for a poor character build. + return Float64(stat * stat) / 25.0 + end + end + + # Evaluate stats. + true_fitness += stat_fitness(individual.strength) + true_fitness += stat_fitness(individual.perception) + true_fitness += stat_fitness(individual.endurance) + true_fitness += stat_fitness(individual.charisma) + true_fitness += stat_fitness(individual.intelligence) + true_fitness += stat_fitness(individual.agility) + true_fitness += stat_fitness(individual.luck) + + # We like charisma, intelligence and luck. + true_fitness += Float64(individual.charisma) + true_fitness += Float64(individual.intelligence) + true_fitness += Float64(individual.luck) + + true_fitness - cost + 100.0 +end + +function fittest(population::List{Character})::Character + max(fitness, population, Character(0, 0, 0, 0, 0, 0, 0)) +end + +function step(population::List{Character}, generator::LinearCongruentialGenerator)::List{Character} + # Find the fittest individual in the population. + best = fittest(population) + # Do a bunch of crossovers and mutate the resulting population. + map(x -> mutate(crossover(best, x), generator), population) +end + +function genetic_algo(seed::Int)::Character + generator = LinearCongruentialGenerator(seed) + + # Generate some random characters. + individuals = Nil{Character}() + for j in 1:10 + individuals = Cons{Character}(random_character(generator), individuals) + end + + # Run the genetic algorithm for a few iterations. 
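# Each step allocates a fresh `Character` for every individual and a new
# list of `Cons` cells, so even two iterations produce a fair amount of
# garbage per thread.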
+ for j in 1:2 + individuals = step(individuals, generator) + end + + # Find the best individual in the population. + fittest(individuals) +end + +const thread_count = 256 + +function kernel(results::CUDAnative.DevicePtr{Float64}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + fittest_individual = genetic_algo(i) + unsafe_store!(results, fitness(fittest_individual), i) +end + +end + +function genetic_benchmark() + destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float64) * GeneticAlgorithm.thread_count) + destination_pointer = Base.unsafe_convert(CuPtr{Float64}, destination_array) + @cuda_sync threads=GeneticAlgorithm.thread_count GeneticAlgorithm.kernel(destination_pointer) +end + +@cuda_benchmark "genetic algo" genetic_benchmark() diff --git a/gc-benchmarks/linked-list.jl b/gc-benchmarks/linked-list.jl new file mode 100644 index 00000000..5bc9b8ec --- /dev/null +++ b/gc-benchmarks/linked-list.jl @@ -0,0 +1,119 @@ +module LinkedList + +using CUDAnative, CUDAdrv +import Base: foldl, reduce, sum, max, map, reverse, filter + +# This benchmark constructs a linked list in a GPU kernel. +# In doing so, it stresses the allocator's ability to quickly +# allocate many small objects, as is common in idiomatic +# object-oriented programs. +# Thread divergence should be minimal in this benchmark. + +abstract type List{T} +end + +mutable struct Nil{T} <: List{T} +end + +mutable struct Cons{T} <: List{T} + value::T + next::List{T} +end + +Cons{T}(value::T) where T = Cons{T}(value, Nil{T}()) + +function List{T}(pointer, count::Integer) where T + result = Nil{T}() + for i in count:-1:1 + result = Cons{T}(unsafe_load(pointer, i), result) + end + result +end + +function foldl(op, list::List{T}; init) where T + node = list + accumulator = init + while isa(node, Cons{T}) + accumulator = op(accumulator, node.value) + node = node.next + end + accumulator +end + +function reduce(op, list::List{T}; init) where T + foldl(op, list; init=init) +end + +function sum(list::List{T}) where T + reduce(+, list; init=zero(T)) +end + +function map_reverse(f::Function, list::List{T})::List{T} where T + foldl(list; init=Nil{T}()) do accumulator, value + Cons{T}(f(value), accumulator) + end +end + +function reverse(list::List{T})::List{T} where T + map_reverse(x -> x, list) +end + +function map(f::Function, list::List{T})::List{T} where T + reverse(map_reverse(f, list)) +end + +function max(evaluate::Function, list::List{T}, default_value::T)::T where T + foldl(list; init=default_value) do max_elem, elem + if evaluate(max_elem) < evaluate(elem) + elem + else + max_elem + end + end +end + +function filter_reverse(f::Function, list::List{T})::List{T} where T + foldl(list; init=Nil{T}()) do accumulator, value + if f(value) + Cons{T}(value, accumulator) + else + accumulator + end + end +end + +function filter(f::Function, list::List{T})::List{T} where T + reverse(filter_reverse(f, list)) +end + +const element_count = 1000 +const thread_count = 32 + +function kernel(elements::CUDAnative.DevicePtr{Int64}, results::CUDAnative.DevicePtr{Int64}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + l = List{Int64}(elements, element_count) + unsafe_store!(results, sum(l), i) + return +end + +end + +function linkedlist_benchmark() + # Allocate two arrays. 
+ source_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int64) * LinkedList.element_count) + destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int64) * LinkedList.thread_count) + source_pointer = Base.unsafe_convert(CuPtr{Int64}, source_array) + destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + + # Fill the source and destination arrays. + upload!(source_array, Array(1:LinkedList.element_count)) + upload!(destination_array, zeros(Int64, LinkedList.thread_count)) + + # Run the kernel. + @cuda_sync threads=LinkedList.thread_count LinkedList.kernel(source_pointer, destination_pointer) + + # Verify the kernel's output. + @test download(Int64, destination_array, LinkedList.thread_count) == repeat([sum(1:LinkedList.element_count)], LinkedList.thread_count) +end + +@cuda_benchmark "linked list" linkedlist_benchmark() diff --git a/gc-benchmarks/matrix.jl b/gc-benchmarks/matrix.jl new file mode 100644 index 00000000..5cb1cb57 --- /dev/null +++ b/gc-benchmarks/matrix.jl @@ -0,0 +1,45 @@ +module Matrix + +using StaticArrays, CUDAnative, CUDAdrv + +# This benchmark makes every thread allocate a large matrix. +# It stresses the allocator's ability to quickly allocate +# very large objects. + +const matrix_dim = 40 +const thread_count = 256 + +@noinline function escape(value) + Base.pointer_from_objref(value) + value +end + +function fill() + m = zeros(MMatrix{matrix_dim, matrix_dim, Int64}) + + for i in 1:matrix_dim + for j in 1:matrix_dim + m[i, j] = i * j + end + end + + return escape(m) +end + +function kernel(result::CUDAnative.DevicePtr{Int64}) + thread_id = (blockIdx().x - 1) * blockDim().x + threadIdx().x + for i in 1:6 + unsafe_store!(result, fill()[20, 30], thread_id) + end + return +end + +end + +function matrix_benchmark() + destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int64) * Matrix.thread_count) + destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + @cuda_sync threads=Matrix.thread_count Matrix.kernel(destination_pointer) +end + +@cuda_benchmark "matrix" matrix_benchmark() diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl new file mode 100644 index 00000000..359d80bc --- /dev/null +++ b/gc-benchmarks/run-all.jl @@ -0,0 +1,113 @@ +using CUDAdrv, CUDAnative, Test, Statistics + +include("utils.jl") + +include("array-expansion.jl") +include("array-features.jl") +include("array-reduction.jl") +include("arrays.jl") +include("binary-tree.jl") +include("bitvector.jl") +include("linked-list.jl") +include("matrix.jl") +include("ssa-opt.jl") +include("static-arrays.jl") +include("stream-queries.jl") +include("genetic-algorithm.jl") + +results = run_benchmarks() +# Print the results to the terminal. +println(results) + +gc_tags = [t for t in benchmark_tags if startswith(t, "gc")] + +# Also write them to a CSV for further analysis. 
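# Each row of `strategies.csv` holds the per-strategy times in milliseconds
# (BenchmarkTools reports nanoseconds, hence the division by 1e6) followed
# by the same times normalized to the `nogc` configuration; the final
# `mean` row averages every column across all benchmarks.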
+open("strategies.csv", "w") do file + write(file, "benchmark,nogc,gc,gc-shared,bump,bump-pinned,nogc-ratio,gc-ratio,gc-shared-ratio,bump-ratio,bump-pinned-ratio\n") + all_results = [] + function write_line(key, results) + if length(all_results) == 0 + all_results = [Float64[] for _ in results] + end + write(file, "$key,$(join(results, ','))\n") + for (l, val) in zip(all_results, results) + push!(l, val) + end + end + + for key in sort(collect(keys(results))) + runs = results[key] + gc_time = runs["gc"] / 1e6 + gc_shared_time = runs["gc-shared"] / 1e6 + nogc_time = runs["nogc"] / 1e6 + bump_time = runs["bump"] / 1e6 + bump_pinned_time = runs["bump-pinned"] / 1e6 + gc_ratio = gc_time / nogc_time + gc_shared_ratio = gc_shared_time / nogc_time + bump_ratio = bump_time / nogc_time + bump_pinned_ratio = bump_pinned_time / nogc_time + write_line(key, [nogc_time, gc_time, gc_shared_time, bump_time, bump_pinned_time, 1.0, gc_ratio, gc_shared_ratio, bump_ratio, bump_pinned_ratio]) + end + write_line("mean", mean.(all_results)) +end + +open("gc-heap-sizes.csv", "w") do file + ratio_tags = [t * "-ratio" for t in gc_tags] + write(file, "benchmark,$(join(gc_tags, ',')),$(join(ratio_tags, ','))\n") + all_times = [[] for t in gc_tags] + all_normalized_times = [[] for t in gc_tags] + for key in sort(collect(keys(results))) + runs = results[key] + times = [runs[t] / 1e6 for t in gc_tags] + for (l, val) in zip(all_times, times) + push!(l, val) + end + normalized_times = [runs[t] / runs["gc"] for t in gc_tags] + for (l, val) in zip(all_normalized_times, normalized_times) + push!(l, val) + end + write(file, "$key,$(join(times, ',')),$(join(normalized_times, ','))\n") + end + write(file, "mean,$(join(map(mean, all_times), ',')),$(join(map(mean, all_normalized_times), ','))\n") +end + +open("gc-heap-sizes-summary.csv", "w") do file + write(file, "heap,mean-opt,mean-shared\n") + shared = Dict() + sizes = Dict() + for tag in gc_tags + shared[tag] = false + sizes[tag] = 60.0 + for part in split(tag, "-") + if endswith(part, "mb") + sizes[tag] = parse(Float64, part[1:end - 2]) + elseif part == "shared" + shared[tag] = true + end + end + end + + all_normalized_times = [[] for t in gc_tags] + for key in sort(collect(keys(results))) + runs = results[key] + normalized_times = [runs[t] / runs["gc"] for t in gc_tags] + for (l, val) in zip(all_normalized_times, normalized_times) + push!(l, val) + end + end + + unique_sizes = sort(unique(values(sizes))) + data = zeros(Float64, (2, length(unique_sizes))) + for (tag, vals) in zip(gc_tags, all_normalized_times) + if shared[tag] + shared_index = 2 + else + shared_index = 1 + end + size_index = indexin(sizes[tag], unique_sizes)[1] + data[shared_index, size_index] = mean(vals) + end + for i in 1:length(unique_sizes) + write(file, "$(unique_sizes[i]),$(data[1, i]),$(data[2, i])\n") + end +end diff --git a/gc-benchmarks/run-breakdown.jl b/gc-benchmarks/run-breakdown.jl new file mode 100644 index 00000000..1d1bd5b9 --- /dev/null +++ b/gc-benchmarks/run-breakdown.jl @@ -0,0 +1,108 @@ +using CUDAdrv, CUDAnative, Test, Statistics, JSON + +include("utils-common.jl") + +const benchmarks = Dict() +global benchmark_results = Dict() +global current_benchmark = nothing + +macro cuda_sync(args...) 
+ esc(quote + local heap_size = 10 * MiB + local local_arena_initial_size = div(heap_size, 10) + local global_arena_initial_size = heap_size - 8 * local_arena_initial_size + local gc_config = GCConfiguration( + local_arena_count=8, + local_arena_initial_size=local_arena_initial_size, + global_arena_initial_size=global_arena_initial_size) + local result = CUDAnative.@cuda gc=true gc_config=gc_config $(args...) + push!(benchmark_results[current_benchmark], result) + end) +end + +macro cuda_benchmark(name, ex) + esc(quote + benchmarks[$name] = (() -> $(ex)) + end) +end + +include("array-expansion.jl") +include("array-features.jl") +include("array-reduction.jl") +include("arrays.jl") +include("binary-tree.jl") +include("bitvector.jl") +include("linked-list.jl") +include("matrix.jl") +include("ssa-opt.jl") +include("static-arrays.jl") +include("stream-queries.jl") +include("genetic-algorithm.jl") + +function run_benchmarks() + cache_dir = mkpath("gc-benchmarks/breakdown-cache") + global benchmark_results = Dict() + results = Dict() + for (k, v) in pairs(benchmarks) + println(k) + cache_path = "$cache_dir/$(replace(k, " " => "-")).json" + if isfile(cache_path) + results[k] = open(cache_path, "r") do file + JSON.parse(file) + end + else + # Perform a dry run to ensure that compilations are cached. + global current_benchmark = k + benchmark_results[k] = [] + v() + + # Run the benchmarks for real. + benchmark_results[k] = [] + v() + while sum(map(x -> x.elapsed_time, benchmark_results[k])) < 90 + v() + end + + results[k] = [ + Dict( + "elapsed-time" => r.elapsed_time, + "collection-count" => r.collection_count, + "collection-poll-time" => r.collection_poll_time, + "collection-time" => r.collection_time) + for (k, r) in pairs(benchmark_results[k])] + + open(cache_path, "w") do file + JSON.print(file, results[k]) + end + end + end + return results +end + +results = run_benchmarks() +# Write results to a CSV file for further analysis. +open("breakdown.csv", "w") do file + write(file, "benchmark,collection-poll-ratio,collection-ratio,other-ratio\n") + all_results = [] + function write_line(key, results) + if length(all_results) == 0 + all_results = [Float64[] for _ in results] + end + write(file, "$key,$(join(results, ','))\n") + for (l, val) in zip(all_results, results) + push!(l, val) + end + end + + for key in sort(collect(keys(results))) + runs = results[key] + total_time = mean(getindex.(runs, "elapsed-time")) + poll_time = mean(getindex.(runs, "collection-poll-time")) + collection_time = mean(getindex.(runs, "collection-time")) + poll_ratio = poll_time / total_time + collection_ratio = collection_time / total_time + other_ratio = 1.0 - poll_ratio - collection_ratio + write_line(key, [poll_time, collection_ratio, other_ratio]) + end + write_line("mean", mean.(all_results)) +end diff --git a/gc-benchmarks/ssa-opt.jl b/gc-benchmarks/ssa-opt.jl new file mode 100644 index 00000000..a9a83acd --- /dev/null +++ b/gc-benchmarks/ssa-opt.jl @@ -0,0 +1,100 @@ +# This benchmark defines a simple SSA IR, creates a basic +# block on the GPU and applies the constant folding optimization +# to it. + +module SSAOpt + +# A base type for SSA instructions. +abstract type Instruction end + +# A base type for values or flow in an SSA basic block. +abstract type ValueOrFlow end + +# A value in an SSA control-flow graph. +mutable struct Value <: ValueOrFlow + # The instruction that computes the value. + instruction::Instruction + + # The next value or control-flow instruction. 
+ next::ValueOrFlow +end + +# A base type for control-flow instructions in an SSA basic block. +abstract type Flow <: ValueOrFlow end + +# A control-flow instruction that returns a value. +mutable struct ReturnFlow <: Flow + # The value to return. + result::Value +end + +# A control-flow instruction that represents undefined control flow. +mutable struct UndefinedFlow <: Flow end + +# A basic block in an SSA control-flow graph. +mutable struct BasicBlock + # The first value or flow instruction in the basic block. + head::ValueOrFlow +end + +# An integer constant instruction. +mutable struct IConst <: Instruction + value::Int +end + +# An integer addition instruction. +mutable struct IAdd <: Instruction + # The left value. + left::Value + # The right value. + right::Value +end + +# Folds constants in a basic block. +function fold_constants(block::BasicBlock) + value = block.head + while isa(value, Value) + insn = value.instruction + if isa(insn, IAdd) + left = insn.left.instruction + right = insn.right.instruction + if isa(left, IConst) + if isa(right, IConst) + value.instruction = IConst(left.value + right.value) + end + end + end + value = value.next + end + block +end + +# Creates a block that naively computes `sum(1:range_max)`. +function create_range_sum_block(range_max) + head = accumulator = Value(IConst(0), UndefinedFlow()) + for i in 1:range_max + constant = Value(IConst(i), UndefinedFlow()) + accumulator.next = constant + accumulator = Value(IAdd(accumulator, constant), UndefinedFlow()) + constant.next = accumulator + end + ret_flow = ReturnFlow(accumulator) + accumulator.next = ret_flow + BasicBlock(head) +end + +const thread_count = 256 + +function kernel() + block = create_range_sum_block(25) + fold_constants(block) + return +end + +end + +function ssaopt_benchmark() + @cuda_sync threads=SSAOpt.thread_count SSAOpt.kernel() +end + +@cuda_benchmark "ssa opt" ssaopt_benchmark() diff --git a/gc-benchmarks/static-arrays.jl b/gc-benchmarks/static-arrays.jl new file mode 100644 index 00000000..88fcfa43 --- /dev/null +++ b/gc-benchmarks/static-arrays.jl @@ -0,0 +1,53 @@ +module StaticArraysBench + +using CUDAdrv, CUDAnative, StaticArrays + +# This benchmark allocates a variety of differently-sized static arrays. +# The point of this benchmark is to ascertain how well the GC handles +# many differently-sized objects. + +const thread_count = 64 + +@noinline function escape(value) + Base.pointer_from_objref(value) + value +end + +macro new_array(T, size) + quote + escape(zeros(MArray{Tuple{$size}, $T})) + end +end + +function kernel() + for i in 1:2 + for j in 1:2 + for k in 1:2 + for l in 1:2 + @new_array(Int64, 4) + @new_array(Int64, 8) + @new_array(Int64, 16) + end + @new_array(Int64, 32) + @new_array(Int64, 64) + @new_array(Int64, 128) + end + @new_array(Int64, 256) + @new_array(Int64, 512) + @new_array(Int64, 1024) + end + @new_array(Int64, 2048) + @new_array(Int64, 4096) + @new_array(Int64, 8192) + end + return +end + +end + +function static_arrays_benchmark() + # Run the kernel. 
+ @cuda_sync threads=StaticArraysBench.thread_count StaticArraysBench.kernel() +end + +@cuda_benchmark "static arrays" static_arrays_benchmark() diff --git a/gc-benchmarks/stream-queries.jl b/gc-benchmarks/stream-queries.jl new file mode 100644 index 00000000..e7d60953 --- /dev/null +++ b/gc-benchmarks/stream-queries.jl @@ -0,0 +1,31 @@ +module StreamQueries + +using CUDAnative, CUDAdrv +import ..LinkedList: List, Nil, Cons, foldl, map, max, filter + +# This benchmark applies stream operators (map, max,filter) to purely +# functional lists. + +const thread_count = 256 +const input_size = 100 + +function kernel(input::CUDAnative.DevicePtr{Float64}, output::CUDAnative.DevicePtr{Float64}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + values = List{Float64}(input, input_size) + values = map(x -> x * x, values) + values = filter(x -> x < 10.0 && x >= 0.0, values) + unsafe_store!(output, max(x -> x, values, 0.0), i) +end + +end + +function stream_benchmark() + source_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float64) * StreamQueries.input_size) + upload!(source_array, rand(Float64, StreamQueries.input_size)) + destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float64) * StreamQueries.thread_count) + source_pointer = Base.unsafe_convert(CuPtr{Float64}, source_array) + destination_pointer = Base.unsafe_convert(CuPtr{Float64}, destination_array) + @cuda_sync threads=StreamQueries.thread_count StreamQueries.kernel(source_pointer, destination_pointer) +end + +@cuda_benchmark "stream queries" stream_benchmark() diff --git a/gc-benchmarks/utils-common.jl b/gc-benchmarks/utils-common.jl new file mode 100644 index 00000000..334ae3c3 --- /dev/null +++ b/gc-benchmarks/utils-common.jl @@ -0,0 +1,66 @@ +module CUDArandom + +# A linear congruential pseudo-random number generator. +mutable struct LinearCongruentialGenerator + modulus::Int + a::Int + c::Int + state::Int +end + +LinearCongruentialGenerator(seed::Int) = LinearCongruentialGenerator(1 << 32, 1664525, 1013904223, seed) + +# Requests a pseudo-random number. +function next(generator::LinearCongruentialGenerator)::Int + generator.state = (generator.a * generator.state + generator.c) % generator.modulus + generator.state +end + +# Requests a pseudo-random number that is at least as great as `lower` +# and less than `upper`. +function next(generator::LinearCongruentialGenerator, lower::Int, upper::Int)::Int + lower + next(generator) % (upper - lower) +end + +end + +function upload!(destination, source) + Mem.copy!(destination, pointer(source), sizeof(source)) +end + +function download(::Type{T}, source, dims) where T + result = Array{T}(undef, dims) + Mem.copy!(pointer(result), source, sizeof(result)) + result +end + +const MiB = 1 << 20 +const CU_LIMIT_MALLOC_HEAP_SIZE = 0x02 +const BENCHMARK_HEAP_SIZE = 64 * MiB + +function set_malloc_heap_size(size::Integer) + CUDAdrv.@apicall( + :cuCtxSetLimit, + (Cint, Csize_t), + CU_LIMIT_MALLOC_HEAP_SIZE, + Csize_t(size)) +end + +""" + @sync ex +Run expression `ex` and synchronize the GPU afterwards. This is a CPU-friendly +synchronization, i.e. it performs a blocking synchronization without increasing CPU load. As +such, this operation is preferred over implicit synchronization (e.g. when performing a +memory copy) for high-performance applications. +It is also useful for timing code that executes asynchronously. 
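For example, `@sync @cuda threads=256 kernel(args...)` (a hypothetical launch)
returns only after the kernel has finished, so it can be wrapped in an
ordinary timing macro such as `Base.@elapsed`.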
+""" +macro sync(ex) + # Copied from https://github.com/JuliaGPU/CuArrays.jl/blob/8e45a27f2b12796f47683340845f98f017865676/src/utils.jl#L68-L86 + quote + local e = CuEvent(CUDAdrv.EVENT_BLOCKING_SYNC | CUDAdrv.EVENT_DISABLE_TIMING) + local ret = $(esc(ex)) + CUDAdrv.record(e) + CUDAdrv.synchronize(e) + ret + end +end diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl new file mode 100644 index 00000000..4fe2b540 --- /dev/null +++ b/gc-benchmarks/utils.jl @@ -0,0 +1,121 @@ +import BenchmarkTools, JSON + +include("utils-common.jl") + +function get_gc_mode() + try + return gc_mode + catch ex + return "gc" + end +end + +macro cuda_sync(args...) + esc(quote + local mode = get_gc_mode() + if mode == "gc" + CUDAnative.@cuda gc=true gc_config=gc_config $(args...) + elseif startswith(mode, "bump") + local capacity = 60 * MiB + if mode == "bump" + local buf = Mem.alloc(Mem.DeviceBuffer, capacity) + else + local buf = Mem.alloc(Mem.HostBuffer, capacity) + end + local start_address = pointer(buf) + local function init(kernel) + CUDAnative.Runtime.bump_alloc_init!(kernel, start_address, capacity) + end + @sync CUDAnative.@cuda init=init malloc="ptx_bump_alloc" $(args...) + Mem.free(buf) + else + @sync CUDAnative.@cuda $(args...) + end + end) +end + +suites = Dict() + +function register_cuda_benchmark(f, name, config) + suites[name][config] = BenchmarkTools.@benchmarkable $f() setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $f()) teardown=(device_reset!()) evals=1 seconds=90 +end + +benchmark_tags = [ + "gc", "gc-shared", + "gc-45mb", "gc-shared-45mb", + "gc-30mb", "gc-shared-30mb", + "gc-15mb", "gc-shared-15mb", + "gc-10mb", "gc-shared-10mb", + "nogc", "bump", "bump-pinned" +] + +macro cuda_benchmark(name, ex) + esc(quote + local suite = BenchmarkTools.BenchmarkGroup() + local function register_gc_shared(config, heap_size) + register_cuda_benchmark($name, config) do + global gc_mode = "gc" + global gc_config = GCConfiguration(local_arena_count=0, global_arena_initial_size=heap_size) + $(ex) + end + end + local function register_gc(config, heap_size) + register_cuda_benchmark($name, config) do + global gc_mode = "gc" + local local_arena_initial_size = div(heap_size, 10) + local global_arena_initial_size = heap_size - 8 * local_arena_initial_size + global gc_config = GCConfiguration( + local_arena_count=8, + local_arena_initial_size=local_arena_initial_size, + global_arena_initial_size=global_arena_initial_size) + $(ex) + end + end + + suites[$name] = BenchmarkTools.BenchmarkGroup(benchmark_tags) + register_gc("gc", 60 * MiB) + register_gc_shared("gc-shared", 60 * MiB) + register_gc("gc-45mb", 45 * MiB) + register_gc_shared("gc-shared-45mb", 45 * MiB) + register_gc("gc-30mb", 30 * MiB) + register_gc_shared("gc-shared-30mb", 30 * MiB) + register_gc("gc-15mb", 15 * MiB) + register_gc_shared("gc-shared-15mb", 15 * MiB) + register_gc("gc-10mb", 10 * MiB) + register_gc_shared("gc-shared-10mb", 10 * MiB) + register_cuda_benchmark($name, "nogc") do + global gc_mode = "nogc" + $(ex) + end + register_cuda_benchmark($name, "bump") do + global gc_mode = "bump" + $(ex) + end + register_cuda_benchmark($name, "bump-pinned") do + global gc_mode = "bump-pinned" + $(ex) + end + end) +end + +function run_benchmarks() + cache_dir = mkpath("gc-benchmarks/results-cache") + results = Dict() + for (name, group) in pairs(suites) + cache_path = "$cache_dir/$(replace(name, " " => "-")).json" + if isfile(cache_path) + group_results = open(cache_path, "r") do file + JSON.parse(file) + end + else + runs = 
BenchmarkTools.run(group) + median_times = BenchmarkTools.median(runs) + group_results = Dict(k => r.time for (k, r) in pairs(median_times)) + open(cache_path, "w") do file + JSON.print(file, group_results) + end + end + results[name] = group_results + end + return results +end diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl index 8f97957b..653a4c17 100644 --- a/src/CUDAnative.jl +++ b/src/CUDAnative.jl @@ -29,12 +29,18 @@ include(joinpath("device", "pointer.jl")) include(joinpath("device", "array.jl")) include(joinpath("device", "cuda.jl")) include(joinpath("device", "llvm.jl")) +include(joinpath("device", "threading.jl")) + +# The interrupts and GC files need to be loaded _before_ the +# runtime intrinsics file, because some runtime intrinsics +# depend on the GC and the GC depends on interrupts. +include("interrupts.jl") +include("gc.jl") include(joinpath("device", "runtime.jl")) include("compiler.jl") include("execution.jl") include("reflection.jl") - include("deprecated.jl") include("init.jl") diff --git a/src/compiler/common.jl b/src/compiler/common.jl index 04c9f0a5..5604d617 100644 --- a/src/compiler/common.jl +++ b/src/compiler/common.jl @@ -12,12 +12,23 @@ struct CompilerJob maxthreads::Union{Nothing,CuDim} blocks_per_sm::Union{Nothing,Integer} maxregs::Union{Nothing,Integer} + # The name of the memory allocation function to use when allocating + # managed memory. A transform will rewrite all managed memory allocations + # to use this function instead. The 'malloc' signature must be + # 'void* malloc(size_t)' or compatible. + malloc::String + # Indicates whether the GPU GC or the "malloc never free" + # GC intrinsic lowering strategy is to be used. The former + # is used when this field is `true`; the latter when it is + # `false`. + gc::Bool name::Union{Nothing,String} CompilerJob(f, tt, cap, kernel; name=nothing, minthreads=nothing, maxthreads=nothing, - blocks_per_sm=nothing, maxregs=nothing) = - new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs, name) + blocks_per_sm=nothing, maxregs=nothing, + malloc="malloc",gc=false) = + new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs, malloc, gc, name) end # global job reference diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index ce3d7382..035238af 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -51,7 +51,7 @@ end function codegen(target::Symbol, job::CompilerJob; libraries::Bool=true, dynamic_parallelism::Bool=true, optimize::Bool=true, - strip::Bool=false,strict::Bool=true) + strip::Bool=false, strict::Bool=true, internalize::Bool=true) ## Julia IR @timeit to[] "validation" check_method(job) @@ -91,12 +91,12 @@ function codegen(target::Symbol, job::CompilerJob; # always preload the runtime, and do so early; it cannot be part of any timing block # because it recurses into the compiler if libraries - runtime = load_runtime(job.cap) + runtime = load_runtime(job.cap, job.malloc) runtime_fns = LLVM.name.(defs(runtime)) end @timeit to[] "LLVM middle-end" begin - ir, kernel = @timeit to[] "IR generation" irgen(job, method_instance, world) + ir, kernel = @timeit to[] "IR generation" irgen(job, method_instance, world; internalize=internalize) if libraries undefined_fns = LLVM.name.(decls(ir)) @@ -154,7 +154,7 @@ function codegen(target::Symbol, job::CompilerJob; # cached compilation dyn_kernel_fn = get!(cache, dyn_job) do dyn_ir, dyn_kernel = codegen(:llvm, dyn_job; - optimize=optimize, strip=strip, + optimize=optimize, strip=strip, internalize=internalize, 
dynamic_parallelism=false, strict=false) dyn_kernel_fn = LLVM.name(dyn_kernel) link!(ir, dyn_ir) diff --git a/src/compiler/irgen.jl b/src/compiler/irgen.jl index 2e3bd510..4c4699d3 100644 --- a/src/compiler/irgen.jl +++ b/src/compiler/irgen.jl @@ -137,7 +137,7 @@ function compile_method_instance(job::CompilerJob, method_instance::Core.MethodI return llvmf, dependencies end -function irgen(job::CompilerJob, method_instance::Core.MethodInstance, world) +function irgen(job::CompilerJob, method_instance::Core.MethodInstance, world; internalize::Bool=true) entry, dependencies = @timeit to[] "emission" compile_method_instance(job, method_instance, world) mod = LLVM.parent(entry) @@ -236,7 +236,26 @@ function irgen(job::CompilerJob, method_instance::Core.MethodInstance, world) current_job = job linkage!(entry, LLVM.API.LLVMExternalLinkage) - internalize!(pm, [LLVM.name(entry)]) + if internalize + # We want to internalize functions so we can optimize + # them, but we don't really want to internalize globals + # because doing so may cause multiple copies of the same + # globals to appear after linking together modules. + # + # For example, the runtime library includes GC-related globals. + # It is imperative that these globals are shared by all modules, + # but if they are internalized before they are linked then + # they will actually not be internalized. + # + # Also, don't internalize the entry point, for obvious reasons. + non_internalizable_names = [LLVM.name(entry)] + for val in globals(mod) + if isa(val, LLVM.GlobalVariable) + push!(non_internalizable_names, LLVM.name(val)) + end + end + internalize!(pm, non_internalizable_names) + end add!(pm, ModulePass("LowerThrow", lower_throw!)) add!(pm, FunctionPass("HideUnreachable", hide_unreachable!)) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 1e76f146..976b700a 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -19,14 +19,14 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) # # NOTE: we need to use multiple distinct pass managers to force pass ordering; # intrinsics should never get lowered before Julia has optimized them. - if VERSION < v"1.2.0-DEV.375" + if VERSION < v"1.3.0-DEV.390" # with older versions of Julia, intrinsics are lowered unconditionally so we need to # replace them with GPU-compatible counterparts before anything else. 
that breaks # certain optimizations though: https://github.com/JuliaGPU/CUDAnative.jl/issues/340 ModulePassManager() do pm initialize!(pm) - add!(pm, FunctionPass("LowerGCFrame", lower_gc_frame!)) + add!(pm, FunctionPass("LowerGCFrame", eager_lower_gc_frame!)) aggressive_dce!(pm) # remove dead uses of ptls add!(pm, ModulePass("LowerPTLS", lower_ptls!)) run!(pm, mod) @@ -45,24 +45,27 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) ccall(:jl_add_optimization_passes, Cvoid, (LLVM.API.LLVMPassManagerRef, Cint, Cint), LLVM.ref(pm), Base.JLOptions().opt_level, #=lower_intrinsics=# 0) + ccall(:LLVMExtraAddLateLowerGCFramePass, Cvoid, (LLVM.API.LLVMPassManagerRef,), LLVM.ref(pm)) run!(pm, mod) end ModulePassManager() do pm initialize!(pm) + if job.gc + add!(pm, FunctionPass("InsertSafepointsGPUGC", fun -> insert_safepoints_gpugc!(fun, entry))) + add!(pm, ModulePass("FinalLowerGPUGC", lower_final_gc_intrinsics_gpugc!)) + add!(pm, FunctionPass("LowerArraysGPUGC", lower_array_calls_gc!)) + else + add!(pm, ModulePass("FinalLowerNoGC", lower_final_gc_intrinsics_nogc!)) + add!(pm, FunctionPass("LowerArraysNoGC", lower_array_calls_nogc!)) + end - # lower intrinsics - add!(pm, FunctionPass("LowerGCFrame", lower_gc_frame!)) aggressive_dce!(pm) # remove dead uses of ptls add!(pm, ModulePass("LowerPTLS", lower_ptls!)) - # the Julia GC lowering pass also has some clean-up that is required - if VERSION >= v"1.2.0-DEV.531" - late_lower_gc_frame!(pm) - end - run!(pm, mod) end + replace_malloc!(mod, job.malloc) end # PTX-specific optimizations @@ -296,6 +299,29 @@ function fixup_metadata!(f::LLVM.Function) end end +# Visits all calls to a particular intrinsic in a given LLVM module. +function visit_calls_to(visit_call::Function, name::AbstractString, mod::LLVM.Module) + if haskey(functions(mod), name) + func = functions(mod)[name] + + for use in uses(func) + call = user(use)::LLVM.CallInst + visit_call(call, func) + end + end +end + +# Deletes all calls to a particular intrinsic in a given LLVM module. +# Returns a Boolean that tells if any calls were actually deleted. +function delete_calls_to!(name::AbstractString, mod::LLVM.Module)::Bool + changed = false + visit_calls_to(name, mod) do call, _ + unsafe_delete!(LLVM.parent(call), call) + changed = true + end + return changed +end + # lower object allocations to to PTX malloc # # this is a PoC implementation that is very simple: allocate, and never free. it also runs @@ -304,7 +330,7 @@ end # is currently very architecture/CPU specific: hard-coded pool sizes, TLS references, etc. # such IR is hard to clean-up, so we probably will need to have the GC lowering pass emit # lower-level intrinsics which then can be lowered to architecture-specific code. -function lower_gc_frame!(fun::LLVM.Function) +function eager_lower_gc_frame!(fun::LLVM.Function) job = current_job::CompilerJob mod = LLVM.parent(fun) changed = false @@ -351,10 +377,729 @@ function lower_gc_frame!(fun::LLVM.Function) @compiler_assert isempty(uses(barrier)) job end +end + +# Visits all calls to a particular intrinsic in a given LLVM module +# and redirects those calls to a different function. +# Returns a Boolean that tells if any calls were actually redirected. 
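+#
+# For example, the GPU GC lowering further down in this file uses this helper to
+# forward the 'julia.new_gc_frame' intrinsic to the runtime's own implementation:
+#
+#     changed |= redirect_calls_to!("julia.new_gc_frame", Runtime.get(:new_gc_frame), mod)
+#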
+function redirect_calls_to!(from::AbstractString, to, mod::LLVM.Module)::Bool + changed = false + visit_calls_to(from, mod) do call, _ + args = collect(operands(call))[1:end - 1] + let builder = Builder(JuliaContext()) + position!(builder, call) + new_call = call!(builder, to, args) + replace_uses!(call, new_call) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + changed = true + end + return changed +end + +# Lowers the GC intrinsics produced by the LateLowerGCFrame pass to +# use the "malloc, never free" strategy. These intrinsics are the +# last point at which we can intervene in the pipeline before the +# passes that deal with them become CPU-specific. +function lower_final_gc_intrinsics_nogc!(mod::LLVM.Module) + changed = false + + # We'll start off with 'julia.gc_alloc_bytes'. This intrinsic allocates + # store for an object, including headroom, but does not set the object's + # tag. + visit_calls_to("julia.gc_alloc_bytes", mod) do call, gc_alloc_bytes + gc_alloc_bytes_ft = eltype(llvmtype(gc_alloc_bytes))::LLVM.FunctionType + T_ret = return_type(gc_alloc_bytes_ft)::LLVM.PointerType + T_bitcast = LLVM.PointerType(T_ret, LLVM.addrspace(T_ret)) + + # Decode the call. + ops = collect(operands(call)) + size = ops[2] + + # We need to reserve a single pointer of headroom for the tag. + # (LateLowerGCFrame depends on us doing that.) + headroom = Runtime.tag_size + + # Call the allocation function and bump the resulting pointer + # so the headroom sits just in front of the returned pointer. + let builder = Builder(JuliaContext()) + position!(builder, call) + total_size = add!(builder, size, ConstantInt(Int32(headroom), JuliaContext())) + ptr = call!(builder, Runtime.get(:gc_pool_alloc), [total_size]) + cast_ptr = bitcast!(builder, ptr, T_bitcast) + bumped_ptr = gep!(builder, cast_ptr, [ConstantInt(Int32(1), JuliaContext())]) + result_ptr = bitcast!(builder, bumped_ptr, T_ret) + replace_uses!(call, result_ptr) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + + changed = true + end + + # Next up: 'julia.new_gc_frame'. This intrinsic allocates a new GC frame. + # We'll lower it as an alloca and hope SSA construction and DCE passes + # get rid of the alloca. This is a reasonable thing to hope for because + # all intrinsics that may cause the GC frame to escape will be replaced by + # nops. + visit_calls_to("julia.new_gc_frame", mod) do call, new_gc_frame + new_gc_frame_ft = eltype(llvmtype(new_gc_frame))::LLVM.FunctionType + T_ret = return_type(new_gc_frame_ft)::LLVM.PointerType + T_alloca = eltype(T_ret) + + # Decode the call. + ops = collect(operands(call)) + size = ops[1] + + let builder = Builder(JuliaContext()) + position!(builder, call) + ptr = array_alloca!(builder, T_alloca, size) + replace_uses!(call, ptr) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + + changed = true + end + + # The 'julia.get_gc_frame_slot' is closely related to the previous + # intrinisc. Specifically, 'julia.get_gc_frame_slot' gets the address of + # a slot in the GC frame. We can simply turn this intrinsic into a GEP. + visit_calls_to("julia.get_gc_frame_slot", mod) do call, _ + # Decode the call. + ops = collect(operands(call)) + frame = ops[1] + offset = ops[2] + + let builder = Builder(JuliaContext()) + position!(builder, call) + ptr = gep!(builder, frame, [offset]) + replace_uses!(call, ptr) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + + changed = true + end + + # The 'julia.push_gc_frame' registers a GC frame with the GC. 
We + # don't have a GC, so we can just delete calls to this intrinsic! + changed |= delete_calls_to!("julia.push_gc_frame", mod) + + # The 'julia.pop_gc_frame' unregisters a GC frame with the GC, so + # we can just delete calls to this intrinsic, too. + changed |= delete_calls_to!("julia.pop_gc_frame", mod) + + # Ditto for 'julia.queue_gc_root'. + changed |= delete_calls_to!("julia.queue_gc_root", mod) + + return changed +end + +# Emits instructions that allocate a particular number of bytes +# of GC-managed memory. No headroom is included. No tags are set. +function new_bytes!(builder::LLVM.Builder, malloc, size) + call!(builder, malloc, [size]) +end + +# Emits instructions that allocate bytes for an object, including +# headroom for the object's tag. Also fills in the object's tag if +# one is provided. +function new_object!(builder::LLVM.Builder, malloc, size, tag::Union{Type, Nothing} = nothing) + # We need to reserve a single pointer of headroom for the tag. + # (LateLowerGCFrame depends on us doing that.) + headroom = Runtime.tag_size + + # Call the allocation function and bump the resulting pointer + # so the headroom sits just in front of the returned pointer. + total_size = add!(builder, size, ConstantInt(Int32(headroom), JuliaContext())) + obj_ptr = new_bytes!(builder, malloc, total_size) + + jl_value_t = llvmtype(obj_ptr) + T_bitcast = LLVM.PointerType(jl_value_t, LLVM.addrspace(jl_value_t)) + + ptr = bitcast!(builder, obj_ptr, T_bitcast) + if tag != nothing + # Fill in the tag if we have one. + store!( + builder, + inttoptr!( + builder, + ConstantInt( + convert(LLVMType, Int64), + Int64(pointer_from_objref(tag))), + jl_value_t), + ptr) + end + bumped_ptr = gep!(builder, ptr, [ConstantInt(Int32(1), JuliaContext())]) + return bitcast!(builder, bumped_ptr, jl_value_t) +end + +""" +lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) + +An LLVM pass that lowers the GC intrinsics produced by the +LateLowerGCFrame pass to use the GPU GC. These intrinsics are the +last point at which we can intervene in the pipeline before the +passes that deal with them become CPU-specific. +""" +function lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) + changed = false + + # We'll start off with 'julia.gc_alloc_bytes'. This intrinsic allocates + # store for an object, including headroom, but does not set the object's + # tag. + visit_calls_to("julia.gc_alloc_bytes", mod) do call, gc_alloc_bytes + # Decode the call. + ops = collect(operands(call)) + size = ops[2] + + # We need to reserve a single pointer of headroom for the tag. + # (LateLowerGCFrame depends on us doing that.) + headroom = Runtime.tag_size + + # Call the allocation function and bump the resulting pointer + # so the headroom sits just in front of the returned pointer. + let builder = Builder(JuliaContext()) + position!(builder, call) + result_ptr = new_object!(builder, Runtime.get(:gc_malloc_object), size) + replace_uses!(call, result_ptr) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + + changed = true + end + + # Next up: 'julia.new_gc_frame'. This intrinsic allocates a new GC frame. + # We actually have a call that implements this intrinsic. Let's use that. + changed |= redirect_calls_to!("julia.new_gc_frame", Runtime.get(:new_gc_frame), mod) + + # The 'julia.get_gc_frame_slot' is closely related to the previous + # intrinisc. Specifically, 'julia.get_gc_frame_slot' gets the address of + # a slot in the GC frame. We can simply turn this intrinsic into a GEP. 
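+    # Schematically (illustrative IR only; the exact pointer types are whatever
+    # the LateLowerGCFrame pass produced), a call such as
+    #
+    #     %slot = call %T** @julia.get_gc_frame_slot(%T** %frame, i32 %offset)
+    #
+    # becomes
+    #
+    #     %slot = getelementptr %T*, %T** %frame, i32 %offset
+    #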
+ visit_calls_to("julia.get_gc_frame_slot", mod) do call, _ + # Decode the call. + ops = collect(operands(call)) + frame = ops[1] + offset = ops[2] + + let builder = Builder(JuliaContext()) + position!(builder, call) + ptr = gep!(builder, frame, [offset]) + replace_uses!(call, ptr) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + + changed = true + end + + # The 'julia.push_gc_frame' registers a GC frame with the GC. We will + # call a function that does just this. + changed |= redirect_calls_to!("julia.push_gc_frame", Runtime.get(:push_gc_frame), mod) + + # The 'julia.pop_gc_frame' unregisters a GC frame with the GC. We again + # have a function in the runtime library. + changed |= redirect_calls_to!("julia.pop_gc_frame", Runtime.get(:pop_gc_frame), mod) + + # Delete calls to 'julia.queue_gc_root'. + changed |= delete_calls_to!("julia.queue_gc_root", mod) return changed end +# Tells if a function manages a GC frame. +function has_gc_frame(fun::LLVM.Function) + for insn in instructions(entry(fun)) + if isa(insn, LLVM.CallInst) + callee = called_value(insn) + if isa(callee, LLVM.Function) && LLVM.name(callee) == "julia.new_gc_frame" + return true + end + end + end + return false +end + +# Tells if an instruction is a call to a non-intrinsic callee. +function is_non_intrinsic_call(instruction::LLVM.Instruction) + if isa(instruction, LLVM.CallInst) + callee = called_value(instruction) + if isa(callee, LLVM.Function) + callee_name = LLVM.name(callee) + return !startswith(callee_name, "julia.") && !startswith(callee_name, "llvm.") + else + return true + end + else + return false + end +end + +""" + insert_safepoints_gpugc!(fun::LLVM.Function, entry::LLVM.Function) + +An LLVM pass that inserts GC safepoints in such a way that threads +reach a safepoint after a reasonable amount of time. + +Moreover, this pass also inserts perma-safepoints after entry point returns. +Perma-safepoints inform the GC that it doesn't need to wait for a warp to +reach a safepoint; inserting them stops the GC from deadlocking. +""" +function insert_safepoints_gpugc!(fun::LLVM.Function, entry::LLVM.Function) + # Insert a safepoint before every function call, but only for + # functions that manage a GC frame. + # + # TODO: also insert safepoints on loop back-edges? This is what people + # usually do, but it requires nontrivial IR analyses that the LLVM C + # API doesn't expose. + + if has_gc_frame(fun) + safepoint_function = Runtime.get(:gc_safepoint) + let builder = Builder(JuliaContext()) + for block in blocks(fun) + for instruction in instructions(block) + if is_non_intrinsic_call(instruction) + if called_value(instruction) == safepoint_function + continue + end + + # Insert a safepoint just before the call. + position!(builder, instruction) + debuglocation!(builder, instruction) + call!(builder, safepoint_function, LLVM.Value[]) + end + end + end + dispose(builder) + end + end + + # Insert perma-safepoints if necessary. + if fun == entry + # Looks like we're going to have to insert perma-safepoints. + # We need to keep in mind that perma-safepoints are per-warp, + # so we absolutely cannot allow warps to be in a divergent + # state when a perma-safepoint is set---all bets are off if + # that happens anyway. + # + # To make sure that we don't end up in that situation, + # we will create a dedicated return block and replace all 'ret' + # instructions by jumps to that return block. + + # Create the dedicated return block. 
+ return_block = BasicBlock(fun, "kernel_exit") + let builder = Builder(JuliaContext()) + position!(builder, return_block) + call!(builder, Runtime.get(:gc_perma_safepoint), LLVM.Value[]) + ret!(builder) + dispose(builder) + end + + # Rewrite return instructions as branches to the return bloc. + for block in blocks(fun) + if block == return_block + # We need to be careful not to trick ourselves into + # turning the return block's 'ret' into an infinite loop. + continue + end + term = terminator(block) + if isa(term, LLVM.RetInst) + unsafe_delete!(block, term) + let builder = Builder(JuliaContext()) + position!(builder, block) + br!(builder, return_block) + dispose(builder) + end + end + end + end + return true +end + +# Tries to evaluate an LLVM IR constant as a literal pointer. +function to_literal_pointer(value)::Tuple{Bool, Ptr{Cvoid}} + if !isa(value, LLVM.ConstantExpr) + return (false, C_NULL) + end + + if !occursin("inttoptr", string(value)) + return (false, C_NULL) + end + + # Peel off addrspacecast and inttoptr. + ptr_arg = value + while occursin("addrspacecast", string(ptr_arg)) || occursin("inttoptr", string(ptr_arg)) + ptr_arg = first(operands(ptr_arg)) + end + ptr_val = convert(Int, ptr_arg) + (true, Ptr{Cvoid}(ptr_val)) +end + +# Visits all calls to literal pointers in a function. +function visit_literal_pointer_calls(visit_call::Function, fun::LLVM.Function) + for block in blocks(fun) + for call in instructions(block) + if !isa(call, LLVM.CallInst) + continue + end + + callee = called_value(call) + if !isa(callee, LLVM.ConstantExpr) + continue + end + + # detect calls to literal pointers + # FIXME: can we detect these properly? + # FIXME: jl_apply_generic and jl_invoke also have such arguments + is_ptr, ptr = to_literal_pointer(callee) + if is_ptr + # look it up in the Julia JIT cache + frames = ccall(:jl_lookup_code_address, Any, (Ptr{Cvoid}, Cint,), ptr, 0) + if length(frames) >= 1 + # @compiler_assert length(frames) == 1 job frames=frames + fn, file, line, linfo, fromC, inlined, ip = last(frames) + visit_call(call, fn) + end + end + end + end +end + +# Emits instructions that create a new array. The array's element type +# must be statically known. Its dimensions are represented as a tuple +# of LLVM IR values. A pointer to the new array is returned. +function new_array!(builder::LLVM.Builder, malloc, array_type::Type, dims::Tuple; data_ptr::Union{Nothing,LLVM.Value} = nothing) + # Since time immemorial, the structure of an array is (quoting from the + # Julia source code here): + # + # typedef struct { + # /* + # how - allocation style + # 0 = data is inlined, or a foreign pointer we don't manage + # 1 = julia-allocated buffer that needs to be marked + # 2 = malloc-allocated pointer this array object manages + # 3 = has a pointer to the object that owns the data + # */ + # uint16_t how:2; + # uint16_t ndims:10; + # uint16_t pooled:1; + # uint16_t ptrarray:1; // representation is pointer array + # uint16_t isshared:1; // data is shared by multiple Arrays + # uint16_t isaligned:1; // data allocated with memalign + # } jl_array_flags_t; + # + # JL_EXTENSION typedef struct { + # JL_DATA_TYPE + # void *data; + # #ifdef STORE_ARRAY_LEN + # size_t length; + # #endif + # jl_array_flags_t flags; + # uint16_t elsize; + # uint32_t offset; // for 1-d only. does not need to get big. 
+ # size_t nrows; + # union { + # // 1d + # size_t maxsize; + # // Nd + # size_t ncols; + # }; + # // other dim sizes go here for ndims > 2 + # + # // followed by alignment padding and inline data, or owner pointer + # } jl_array_t; + # + # where `STORE_ARRAY_LEN` is a preprocessor directive that is technically a + # "configuration option." AFAICT, `STORE_ARRAY_LEN` is just always defined in + # practice. + # + # The Julia compiler is more than happy to eagerly generate code that accesses + # fields of this data structure directly, so we can't invent our own array data + # structure. Consequently, we will emit code here that carefully constructs + # an instance of `jl_array_t`. + # + # To keep things tidy, we'll construct an array (ironic, I know) that contains the + # values we'll assign to each field of the array. After that, we will generate + # code that fills in every field in one fell swoop. + + fields = [] + + # Compute the size of the element type. + element_type = eltype(array_type) + llvm_element_type = convert(LLVMType, element_type, true) + mod = LLVM.parent(LLVM.parent(position(builder))) + layout = datalayout(mod) + element_size = Csize_t(sizeof(layout, llvm_element_type)) + + # Compute the number of elements in the array. + element_count = LLVM.ConstantInt(convert(LLVMType, Csize_t), 1) + for i in dims + element_count = mul!(builder, element_count, intcast!(builder, i, convert(LLVMType, Csize_t))) + end + + # Compute the size of the array's elements in bytes. + data_bytesize = mul!( + builder, + LLVM.ConstantInt(convert(LLVMType, Csize_t), element_size), + element_count) + + if element_size == Csize_t(1) && length(dims) == 1 + # If we're allocating an array of bytes, we will throw in an extra + # byte at the end for compatibility with Julia's ABI. + data_bytesize = add!(builder, data_bytesize, LLVM.ConstantInt(convert(LLVMType, Csize_t), 1)) + end + + # Actually allocate the array's contents. We will just always + # use a separate buffer. Inline data storage is wasteful and + # harder to implement. + if data_ptr == nothing + data_ptr = new_bytes!(builder, malloc, data_bytesize) + end + + # The pointer to the array's data is the first field of the struct. + push!(fields, data_ptr) + + # The array's length (i.e., the product of its dimensions) is the + # second field of the `jl_array_t` struct. + push!(fields, element_count) + + # Synthesize a constant that represents the array's flags. + flags = Int16(0) + # Set the 'how' field to one. + flags |= Int16(1) + # Set the 'nDims' field. + flags <<= 10 + flags |= Int16(length(dims)) + # Set the 'pooled' field to `false`. + flags <<= 1 + flags |= Int16(false) + # Set the 'ptrarray' field. + flags <<= 1 + flags |= Int16(isa(llvm_element_type, LLVM.PointerType)) + # Set the 'isshared' field to `false`. + flags <<= 1 + flags |= Int16(false) + # Set the 'isaligned' field to `true`. + flags <<= 1 + flags |= Int16(true) + # Add the flags to the `jl_array_t` struct. + push!(fields, LLVM.ConstantInt(convert(LLVMType, Int16), flags)) + + # Set the 'elsize' field. + push!(fields, LLVM.ConstantInt(convert(LLVMType, Int16), Int16(element_size))) + + # Set the 'offset' field to zero (the array is not a slice). + push!(fields, LLVM.ConstantInt(convert(LLVMType, Int16), Int16(0))) + + if length(dims) == 1 + # Set the 'nrows' field to the number of elements. + push!(fields, element_count) + # Ditto for the 'maxsize' field. 
+ push!(fields, element_count) + else + # If we're creating a multi-dimensional array, then the + # process is slightly different. + for i in dims + push!(fields, intcast!(builder, i, convert(LLVMType, Csize_t))) + end + end + + # Synthesize a struct type that neatly represents the data we want + # to store. + struct_type = LLVM.StructType([llvmtype(f) for f in fields]) + + # We now know exactly what data we want to store in each field of the + # array's control structure. + # All that's left is to actually allocate the array and write that data + # to the control structure. + obj_ptr = new_object!( + builder, + malloc, + ConstantInt(convert(LLVMType, Csize_t), sizeof(layout, struct_type)), + array_type) + struct_ptr = bitcast!( + builder, + addrspacecast!( + builder, + obj_ptr, + LLVM.PointerType(eltype(llvmtype(obj_ptr)))), + LLVM.PointerType(struct_type)) + + for i in 1:length(fields) + val = fields[i] + gep = struct_gep!(builder, struct_ptr, i - 1) + store!(builder, val, gep) + end + + return obj_ptr +end + +# Generates code that extracts array dimensions from a tuple argument. +function extract_array_dims!(builder, ::Type{Array{T, N}}, dims_tuple) where {T, N} + # First cast the tuple value to a size_t pointer in address space zero. + tuple_as_size_t = bitcast!( + builder, + addrspacecast!( + builder, + dims_tuple, + LLVM.PointerType(eltype(llvmtype(dims_tuple)))), + LLVM.PointerType(convert(LLVMType, Csize_t))) + + is_literal, ptr = to_literal_pointer(tuple_as_size_t) + + results = [] + if is_literal + # If the tuple is implemented as a literal pointer, then we want to load its elements + # ahead of time; the device won't be able to access host-allocated constants. + for i in 1:N + value = Base.unsafe_load(Base.unsafe_convert(Ptr{Csize_t}, ptr), i) + push!(results, LLVM.ConstantInt(convert(LLVMType, Csize_t), value)) + end + else + # Otherwise, generate code that loads fields from the tuple. + for i in 1:N + address = gep!( + builder, + tuple_as_size_t, + [LLVM.ConstantInt(convert(LLVMType, Int32), i - 1)]) + + push!(results, load!(builder, address)) + end + end + return Tuple(results) +end + +# Lowers function calls that pertain to array operations. +function lower_array_calls!(fun::LLVM.Function, malloc) + changed_any = false + alloc_methods = [ + :jl_alloc_array_1d, + :jl_alloc_array_2d, + :jl_alloc_array_3d, + :jl_new_array + ] + wrap_methods = [ + :jl_ptr_to_array, + :jl_ptr_to_array_1d + ] + runtime_methods = [ + :jl_array_grow_at, + :jl_array_grow_beg, + :jl_array_grow_end, + :jl_array_del_at, + :jl_array_del_beg, + :jl_array_del_end, + :jl_array_sizehint + ] + visit_literal_pointer_calls(fun) do call, name + args = collect(operands(call))[1:end - 1] + if name in alloc_methods + is_ptr, array_type_ptr = to_literal_pointer(args[1]) + if is_ptr + # We can lower array creation calls if we know the type + # of the array to create in advance. + array_type = unsafe_pointer_to_objref(array_type_ptr) + let builder = Builder(JuliaContext()) + position!(builder, call) + if name == :jl_new_array + # jl_new_array requires special treatment. All the other ones are + # pretty simple to handle. 
+ dim_args = extract_array_dims!(builder, array_type, args[2]) + else + dim_args = Tuple(args[2:end]) + end + new_array = new_array!(builder, malloc, array_type, dim_args) + replace_uses!(call, new_array) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + changed_any = true + end + elseif name in wrap_methods + is_ptr, array_type_ptr = to_literal_pointer(args[1]) + if is_ptr + # We can lower array wrapping calls if we know the type + # of the array to create in advance. + array_type = unsafe_pointer_to_objref(array_type_ptr) + let builder = Builder(JuliaContext()) + position!(builder, call) + if name == :jl_ptr_to_array + dim_args = extract_array_dims!(builder, array_type, args[3]) + else + dim_args = (args[3],) + end + new_array = new_array!(builder, malloc, array_type, dim_args; data_ptr=args[2]) + replace_uses!(call, new_array) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + changed_any = true + end + elseif name in runtime_methods + let builder = Builder(JuliaContext()) + position!(builder, call) + new_call = call!(builder, Runtime.get(name), args) + replace_uses!(call, new_call) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + changed_any = true + end + end + return changed_any +end + +function lower_array_calls_gc!(fun::LLVM.Function) + lower_array_calls!(fun, Runtime.get(:gc_malloc_object)) +end + +function lower_array_calls_nogc!(fun::LLVM.Function) + lower_array_calls!(fun, Runtime.get(:gc_pool_alloc)) +end + +# Replaces all uses of a function in a particular module with +# a compatible function. +function replace_function!(mod::LLVM.Module, old_name::String, new_name::String) + if new_name == old_name + # There's nothing to replace if the new function is the same as + # the old function. + return false + end + + # Otherwise, we'll try and find the old function. + if !haskey(functions(mod), old_name) + # If the old function doesn't even appear in the module, then it's not in + # use and we can stop right here. + return false + end + + old_function = functions(mod)[old_name] + + if haskey(functions(mod), new_name) + new_function = functions(mod)[new_name] + else + # Create a new function. + new_function = LLVM.Function( + mod, + new_name, + eltype(llvmtype(old_function)::LLVM.PointerType)::LLVM.FunctionType) + end + + # Replace all uses of the old function with the new function. + replace_uses!(old_function, new_function) + + return true +end + +# Replaces all uses of the managed memory allocation function in a +# particular module with a compatible function with the specified name. +function replace_malloc!(mod::LLVM.Module, malloc_name::String) + return replace_function!(mod, "julia.managed_malloc", malloc_name) +end + # lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible. 
# # this assumes and checks that the TLS is unused, which should be the case for most GPU code diff --git a/src/compiler/rtlib.jl b/src/compiler/rtlib.jl index 3d5f33ac..bfff7454 100644 --- a/src/compiler/rtlib.jl +++ b/src/compiler/rtlib.jl @@ -122,26 +122,30 @@ end ## functionality to build the runtime library -function emit_function!(mod, cap, f, types, name) +function emit_function!(mod, cap, f, types, name, malloc) tt = Base.to_tuple_type(types) - new_mod, entry = codegen(:llvm, CompilerJob(f, tt, cap, #=kernel=# false); - libraries=false, strict=false) + # Optimize the module that defines the function, but don't + # internalize symbols in that function yet: internalizing + # globals may de-alias references to globals in the runtime + # library from equivalent references in the kernel. + new_mod, entry = codegen(:llvm, CompilerJob(f, tt, cap, #=kernel=# false; malloc=malloc); + libraries=false, strict=false, internalize=false) LLVM.name!(entry, name) link!(mod, new_mod) end -function build_runtime(cap) +function build_runtime(cap, malloc) mod = LLVM.Module("CUDAnative run-time library", JuliaContext()) for method in values(Runtime.methods) - emit_function!(mod, cap, method.def, method.types, method.llvm_name) + emit_function!(mod, cap, method.def, method.types, method.llvm_name, malloc) end mod end -function load_runtime(cap) - name = "cudanative.$(cap.major)$(cap.minor).bc" +function load_runtime(cap, malloc) + name = "cudanative.$(malloc).$(cap.major)$(cap.minor).bc" path = joinpath(@__DIR__, "..", "..", "deps", "runtime", name) mkpath(dirname(path)) @@ -151,8 +155,8 @@ function load_runtime(cap) parse(LLVM.Module, read(io), JuliaContext()) end else - @info "Building the CUDAnative run-time library for your sm_$(cap.major)$(cap.minor) device, this might take a while..." - lib = build_runtime(cap) + @info "Building the CUDAnative run-time library for your sm_$(cap.major)$(cap.minor) device (allocating with '$malloc'), this might take a while..." + lib = build_runtime(cap, malloc) open(path, "w") do io write(io, lib) end diff --git a/src/compiler/validation.jl b/src/compiler/validation.jl index af629ac7..60bf4a7a 100644 --- a/src/compiler/validation.jl +++ b/src/compiler/validation.jl @@ -231,7 +231,7 @@ function check_ir!(job, errors::Vector{IRError}, inst::LLVM.CallInst) end # detect calls to undefined functions - if isdeclaration(dest) && intrinsic_id(dest) == 0 && !(fn in special_fns) + if isdeclaration(dest) && intrinsic_id(dest) == 0 && !(fn in special_fns) && fn != job.malloc # figure out if the function lives in the Julia runtime library if libjulia[] == C_NULL paths = filter(Libdl.dllist()) do path diff --git a/src/device/runtime.jl b/src/device/runtime.jl index b3addaf0..a54c629c 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -12,8 +12,9 @@ module Runtime using ..CUDAnative using LLVM using LLVM.Interop +using CUDAdrv - +import ..CUDAnative: GCFrame ## representation of a runtime method instance struct RuntimeMethodInstance @@ -127,8 +128,35 @@ function T_prjlvalue() LLVM.PointerType(eltype(T_pjlvalue), Tracked) end +# A function that gets replaced by the proper 'malloc' implementation +# for the context it executes in. When the GC is used, calls to this +# function are replaced with 'gc_malloc'; otherwise, this function gets +# rewritten as a call to the allocator, probably 'malloc'. 
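+#
+# For example, the GC-enabled '@cuda' path in execution.jl compiles kernels with
+# malloc="ptx_gc_malloc", so once 'replace_malloc!' has run, every call to this
+# function in such a kernel module ends up calling that allocator instead.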
+@generated function managed_malloc(sz::Csize_t) + T_pint8 = LLVM.PointerType(LLVM.Int8Type(JuliaContext())) + T_size = convert(LLVMType, Csize_t) + T_ptr = convert(LLVMType, Ptr{UInt8}) + + # create function + llvm_f, _ = create_function(T_ptr, [T_size]) + mod = LLVM.parent(llvm_f) + + intr = LLVM.Function(mod, "julia.managed_malloc", LLVM.FunctionType(T_pint8, [T_size])) + + # generate IR + Builder(JuliaContext()) do builder + entry = BasicBlock(llvm_f, "entry", JuliaContext()) + position!(builder, entry) + ptr = call!(builder, intr, [parameters(llvm_f)[1]]) + jlptr = ptrtoint!(builder, ptr, T_ptr) + ret!(builder, jlptr) + end + + call_function(llvm_f, Ptr{UInt8}, Tuple{Csize_t}, :((sz,))) +end + function gc_pool_alloc(sz::Csize_t) - ptr = malloc(sz) + ptr = managed_malloc(sz) if ptr == C_NULL @cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz) throw(OutOfMemoryError()) @@ -138,7 +166,6 @@ end compile(gc_pool_alloc, Any, (Csize_t,), T_prjlvalue) - ## boxing and unboxing const tag_type = UInt @@ -226,5 +253,357 @@ for (T, t) in [Int8 => :int8, Int16 => :int16, Int32 => :int32, Int64 => end end +## Garbage collection + +# LLVM type of a pointer to a tracked pointer +function T_pprjlvalue() + T_pjlvalue = convert(LLVMType, Any, true) + LLVM.PointerType( + LLVM.PointerType(eltype(T_pjlvalue), Tracked)) +end + +# Include GC memory allocation functions into the runtime. +compile(CUDAnative.gc_malloc, Ptr{UInt8}, (Csize_t,)) +compile(CUDAnative.gc_malloc_object, Any, (Csize_t,), T_prjlvalue) + +# Include GC frame management functions into the runtime. +compile(CUDAnative.new_gc_frame, Any, (Cuint,), T_pprjlvalue) + +compile( + CUDAnative.push_gc_frame, + Nothing, + (GCFrame, Cuint), + () -> convert(LLVMType, Cvoid), + () -> [T_pprjlvalue(), convert(LLVMType, UInt32)]) + +compile( + CUDAnative.pop_gc_frame, + Nothing, + (GCFrame,), + () -> convert(LLVMType, Cvoid), + () -> [T_pprjlvalue()]) + +# Also import the safepoint and perma-safepoint functions. +compile(CUDAnative.gc_safepoint, Cvoid, ()) +compile(CUDAnative.gc_perma_safepoint, Cvoid, ()) + +## Bump allocator. + +# Allocates `bytesize` bytes of storage by bumping the global bump +# allocator pointer. +function bump_alloc(bytesize::Csize_t)::Ptr{UInt8} + ptr = CUDAnative.@cuda_global_ptr("bump_alloc_ptr", Csize_t) + chunk_address = CUDAnative.atomic_add!(ptr, bytesize) + end_ptr = unsafe_load(CUDAnative.@cuda_global_ptr("bump_alloc_end", Csize_t)) + if chunk_address < end_ptr + return Ptr{UInt8}(chunk_address) + else + return C_NULL + end +end + +compile(bump_alloc, Ptr{UInt8}, (Csize_t,)) + +function maybe_set_global(kernel, name, value::T) where T + try + global_handle = CuGlobal{T}(kernel.mod, name) + set(global_handle, value) + catch exception + # The interrupt pointer may not have been declared (because it is unused). + # In that case, we should do nothing. + if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end +end + +function bump_alloc_init!(kernel, buffer_start, buffer_size) + maybe_set_global(kernel, "bump_alloc_ptr", buffer_start) + maybe_set_global(kernel, "bump_alloc_end", buffer_start + buffer_size) +end + +## Arrays + +# A data structure that carefully mirrors an in-memory array control +# structure for Julia arrays, as laid out by the compiler. +mutable struct Array1D + # This is the data layout for Julia arrays, which we adhere to here. 
+    #
+    # JL_EXTENSION typedef struct {
+    #     JL_DATA_TYPE
+    #     void *data;
+    # #ifdef STORE_ARRAY_LEN
+    #     size_t length;
+    # #endif
+    #     jl_array_flags_t flags;
+    #     uint16_t elsize;
+    #     uint32_t offset;  // for 1-d only. does not need to get big.
+    #     size_t nrows;
+    #     union {
+    #         // 1d
+    #         size_t maxsize;
+    #         // Nd
+    #         size_t ncols;
+    #     };
+    #     // other dim sizes go here for ndims > 2
+    #
+    #     // followed by alignment padding and inline data, or owner pointer
+    # } jl_array_t;
+
+    data::Ptr{UInt8}
+    length::Csize_t
+    flags::UInt16
+    elsize::UInt16
+    offset::UInt32
+    nrows::Csize_t
+    maxsize::Csize_t
+end
+
+# Sets the `count` bytes starting at `ptr` to zero.
+function zero_fill!(ptr::Ptr{UInt8}, count::Integer)
+    for i in 1:count
+        unsafe_store!(ptr, UInt8(0), i)
+    end
+    return
+end
+
+# Copies `sz` bytes from `src` to `dst`, coping with overlapping regions
+# in the same way as C's `memmove`.
+function memmove!(dst::Ptr{UInt8}, src::Ptr{UInt8}, sz::Integer)
+    if dst < src
+        for i in 1:sz
+            unsafe_store!(dst, unsafe_load(src, i), i)
+        end
+    else
+        for i in sz:-1:1
+            unsafe_store!(dst, unsafe_load(src, i), i)
+        end
+    end
+    return
+end
+
+# Resize the buffer to a max size of `newlen` elements.
+# The buffer can either be newly allocated or realloc'd; the return
+# value is true if a new buffer is allocated and false if it is realloc'd.
+# The caller needs to take care of moving the data from the old buffer
+# to the new one if necessary.
+# When this function returns, the `.data` pointer always points to
+# the **beginning** of the new buffer.
+function array_resize_buffer(a::Array1D, newlen::Csize_t)::Bool
+    elsz = Csize_t(a.elsize)
+    nbytes = newlen * elsz
+    oldnbytes = a.maxsize * elsz
+
+    if elsz == 1
+        nbytes += 1
+        oldnbytes += 1
+    end
+
+    # Allocate a new buffer. 'managed_malloc' will get replaced with
+    # the "right" allocation function for the environment in which this
+    # function is compiled. So if the GC is enabled, then 'managed_malloc'
+    # will actually call 'gc_malloc'; otherwise, it's probably going to
+    # be 'malloc'.
+    a.data = managed_malloc(nbytes)
+    zero_fill!(a.data + oldnbytes, nbytes - oldnbytes)
+    a.maxsize = newlen
+    return true
+end
+
+"""
+    jl_array_grow_at_impl(a, idx, inc, n)
+
+Grows one-dimensional array `a` containing `n` elements by `inc` elements at
+zero-based index `idx`.
+"""
+function jl_array_grow_at_impl(a::Array1D, idx::Csize_t, inc::Csize_t, n::Csize_t)
+    data = a.data
+    elsz = Csize_t(a.elsize)
+    reqmaxsize = a.offset + n + inc
+    has_gap = n > idx
+    nb1 = idx * elsz
+    nbinc = inc * elsz
+    if reqmaxsize > a.maxsize
+        if reqmaxsize < 4
+            newmaxsize = Csize_t(4)
+        elseif reqmaxsize >= a.maxsize * 2
+            newmaxsize = reqmaxsize
+        else
+            newmaxsize = a.maxsize * 2
+        end
+
+        newbuf = array_resize_buffer(a, newmaxsize)
+        newdata = a.data + a.offset * elsz
+        if newbuf
+            memmove!(newdata, data, nb1)
+            if has_gap
+                memmove!(newdata + nb1 + nbinc, data + nb1, n * elsz - nb1)
+            end
+        elseif has_gap
+            memmove!(newdata + nb1 + nbinc, newdata + nb1, n * elsz - nb1)
+        end
+        a.data = data = newdata
+    elseif has_gap
+        memmove!(data + nb1 + nbinc, data + nb1, n * elsz - nb1)
+    end
+
+    newnrows = n + inc
+    a.length = newnrows
+    a.nrows = newnrows
+    zero_fill!(data + nb1, nbinc)
+    return
+end
+
+"""
+    jl_array_grow_at(a, idx, inc)
+
+Grows one-dimensional array `a` by `inc` elements at zero-based index `idx`.
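+
+For example, growing a three-element array by two elements at index 1 moves the
+last two elements towards the end of the array and zero-fills the two freshly
+inserted slots, yielding a five-element array.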
+""" +function jl_array_grow_at(a::Array1D, idx::Cssize_t, inc::Csize_t) + jl_array_grow_at_impl(a, Csize_t(idx), inc, a.nrows) + return +end + +compile( + jl_array_grow_at, + Cvoid, + (Array1D, Cssize_t, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Cssize_t), convert(LLVMType, Csize_t)]) + +""" + jl_array_grow_end(a, inc) + +Grows one-dimensional array `a` by `inc` elements at the end. +""" +function jl_array_grow_end(a::Array1D, inc::Csize_t) + n = a.nrows + jl_array_grow_at_impl(a, n, inc, n) + return +end + +compile( + jl_array_grow_end, + Cvoid, + (Array1D, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + +""" + jl_array_grow_beg(a, inc) + +Grows one-dimensional array `a` by `inc` elements at the beginning of the array. +""" +function jl_array_grow_beg(a::Array1D, inc::Csize_t) + jl_array_grow_at_impl(a, Csize_t(0), inc, a.nrows) + return +end + +compile( + jl_array_grow_beg, + Cvoid, + (Array1D, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + +""" + jl_array_sizehint(a, sz) + +Suggest that one-dimensional array `a` reserve capacity for at least `sz` elements. +""" +function jl_array_sizehint(a::Array1D, sz::Csize_t) + n = a.length + data = a.data + elsz = Csize_t(a.elsize) + reqmaxsize = a.offset + sz + if reqmaxsize > a.maxsize + newbuf = array_resize_buffer(a, reqmaxsize) + newdata = a.data + a.offset * elsz + if newbuf + memmove!(newdata, data, n * elsz) + end + a.data = data = newdata + end + return +end + +compile( + jl_array_sizehint, + Cvoid, + (Array1D, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + +""" + jl_array_del_at_impl(a, idx, dec, n) + +Removes `dec` elements from one-dimensional array `a`, starting at zero-based index `idx`. +`n` is the number of elements in `a`. +""" +function jl_array_del_at_impl(a::Array1D, idx::Csize_t, dec::Csize_t, n::Csize_t) + data = a.data + elsz = a.elsize + last = idx + dec + if n > last + memmove!(data + idx * elsz, data + last * elsz, (n - last) * elsz) + end + n -= dec + if elsz == 1 + Base.unsafe_store!(data, n + 1, UInt8(0)) + end + a.nrows = n + a.length = n + return +end + +""" + jl_array_del_beg(a, dec) + +Removes `dec` elements from the beginning of one-dimensional array `a`. +""" +function jl_array_del_beg(a::Array1D, dec::Csize_t) + jl_array_del_at_impl(a, Csize_t(0), dec, a.nrows) + return +end + +compile( + jl_array_del_beg, + Cvoid, + (Array1D, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + +""" + jl_array_del_end(a, dec) + +Removes `dec` elements from the end of one-dimensional array `a`. +""" +function jl_array_del_end(a::Array1D, dec::Csize_t) + n = a.nrows + jl_array_del_at_impl(a, n, dec, n) + return +end + +compile( + jl_array_del_end, + Cvoid, + (Array1D, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + + +""" + jl_array_del_at(a, idx, dec) + +Removes `dec` elements from one-dimensional array `a`, starting at zero-based index `idx`. 
+""" +function jl_array_del_at(a::Array1D, idx::Cssize_t, dec::Csize_t) + jl_array_del_at_impl(a, Csize_t(idx), dec, a.nrows) + return +end + +compile( + jl_array_del_at, + Cvoid, + (Array1D, Cssize_t, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Cssize_t), convert(LLVMType, Csize_t)]) end diff --git a/src/device/threading.jl b/src/device/threading.jl new file mode 100644 index 00000000..96e58f72 --- /dev/null +++ b/src/device/threading.jl @@ -0,0 +1,276 @@ +# This file implements threading primitives that work for CUDAnative kernels. + +export ReaderWriterLock, reader_locked, writer_locked, Mutex, try_lock, unlock + +# Gets a pointer to a global with a particular name. If the global +# does not exist yet, then it is declared in the global memory address +# space. +@generated function atomic_compare_exchange!(ptr::Ptr{T}, cmp::T, new::T)::T where T + ptr_type = convert(LLVMType, Ptr{T}) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + %result = cmpxchg volatile $lt* %ptr, $lt %1, $lt %2 acq_rel acquire + %rv = extractvalue { $lt, i1 } %result, 0 + ret $lt %rv + """ + :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T, $T}, ptr, cmp, new)) +end + +@generated function atomic_rmw!(::Val{op}, lhs::Ptr{T}, rhs::T)::T where {op, T} + ptr_type = convert(LLVMType, Ptr{T}) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + %rv = atomicrmw volatile $(String(op)) $lt* %ptr, $lt %1 acq_rel + ret $lt %rv + """ + :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T}, lhs, rhs)) +end + +# Atomically adds a value to a variable pointed to by a pointer. +# Returns the previous value stored in that variable. +function atomic_add!(lhs::Ptr{T}, rhs::T)::T where T + atomic_rmw!(Val(:add), lhs, rhs) +end + +# Atomically subtracts a value from a variable pointed to by a pointer. +# Returns the previous value stored in that variable. +function atomic_subtract!(lhs::Ptr{T}, rhs::T)::T where T + atomic_rmw!(Val(:sub), lhs, rhs) +end + +# Atomically computes the logical or of a value and a variable pointed +# to by a pointer. Returns the previous value stored in that variable. +function atomic_or!(lhs::Ptr{T}, rhs::T)::T where T + atomic_rmw!(Val(:or), lhs, rhs) +end + +# Atomically assigns a new value to a variable pointed to by a pointer. +# Returns the previous value stored in that variable. +function atomic_exchange!(lhs::Ptr{T}, rhs::T)::T where T + atomic_rmw!(Val(:xchg), lhs, rhs) +end + +# Loads a value from a pointer. +@generated function volatile_load(ptr::Ptr{T})::T where T + ptr_type = string(convert(LLVMType, Ptr{T})) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + %rv = load volatile $lt, $lt* %ptr + ret $lt %rv + """ + :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T})}, ptr)) +end + +# Stores a value at a particular address. +@generated function volatile_store!(ptr::Ptr{T}, value::T) where T + ptr_type = string(convert(LLVMType, Ptr{T})) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + store volatile $lt %1, $lt* %ptr + ret void + """ + :(Core.Intrinsics.llvmcall($ir, Cvoid, Tuple{$(Ptr{T}), $T}, ptr, value)) +end + +function unwrap_device_ptr(ptr::DevicePtr{T, A})::Ptr{T} where {T, A} + convert(Ptr{T}, convert(Csize_t, ptr)) +end + +const ReaderWriterLockState = Int64 + +""" +A reader-writer lock: a lock that supports concurrent access for +read operations and exclusive access for write operations. 
+""" +struct ReaderWriterLock + # A pointer to the reader-writer lock's state. The state + # is a counter that can be in one of the following states: + # + # * > 0: the lock is acquired by one or more readers. + # The state counter describes the number of readers + # that have acquired the lock. + # + # * = 0: the lock is idle. + # + # * < 0: the lock is acquired by a single writer. + # + state_ptr::Ptr{ReaderWriterLockState} +end + +ReaderWriterLock(state_ptr::DevicePtr{ReaderWriterLockState}) = + ReaderWriterLock(unwrap_device_ptr(state_ptr)) + +const max_rw_lock_readers = (1 << (sizeof(ReaderWriterLockState) * 8 - 1)) + +# Serializes execution of a function within a warp, to combat thread +# divergence-related deadlocks. +function warp_serialized(func::Function) + # Get the current thread's ID. + thread_id = threadIdx().x - 1 + + # Get the size of a warp. + size = warpsize() + + local result + i = 0 + while i < size + if thread_id % size == i + result = func() + end + i += 1 + end + return result +end + +""" + reader_locked(func::Function, lock::ReaderWriterLock; acquire_lock=true) + +Acquires a reader-writer lock in reader mode, runs `func` while the lock is +acquired and releases the lock again. +""" +function reader_locked(func::Function, lock::ReaderWriterLock; acquire_lock=true) + if !acquire_lock + return func() + end + + while true + # Increment the reader count. If the lock is in write-acquired mode, + # then the lock will stay in that mode (unless the reader count is + # exceeded, but that is virtually impossible). Otherwise, the lock + # will end up in read-acquired mode. + previous_state = atomic_add!(lock.state_ptr, 1) + + # If the lock was in the idle or read-acquired state, then + # it is now in read-acquired mode. + if previous_state >= 0 + # Run the function. + result = func() + # Decrement the reader count to release the reader lock. + atomic_add!(lock.state_ptr, -1) + # We're done here. + return result + end + + # Decrement the reader count and try again. + atomic_add!(lock.state_ptr, -1) + end +end + +""" + writer_locked(func::Function, lock::ReaderWriterLock; acquire_lock=true) + +Acquires a reader-writer lock in writer mode, runs `func` while the lock is +acquired and releases the lock again. +""" +function writer_locked(func::Function, lock::ReaderWriterLock; acquire_lock=true) + if !acquire_lock + return func() + end + + warp_serialized() do + # Try to move the lock from 'idle' to 'write-acquired'. + while atomic_compare_exchange!(lock.state_ptr, 0, -max_rw_lock_readers) != 0 + end + + # We acquired the lock. Run the function. + result = func() + + # Release the lock by atomically adding `max_rw_lock_readers` to the + # lock's state. It's important that we use an atomic add instead of a + # simple store because a store might cause a race condition with `read_locked` + # that'll put us in a deadlock state. + atomic_add!(lock.state_ptr, max_rw_lock_readers) + + # We're done here. + return result + end +end + +# Gets the thread ID of the current thread. +@inline function get_thread_id() + return (blockIdx().x - 1) * blockDim().x + threadIdx().x +end + +# Gets the warp ID of the current thread. +@inline function get_warp_id() + return div(get_thread_id() - 1, warpsize()) + 1 +end + +const MutexState = UInt32 + +""" +A mutex: a lock that guarantees mutual exclusion. +""" +struct Mutex + # This GPU mutex implementation is based on + # Lock-based Synchronization for GPU Architectures + # by Yunlong Xu et al. 
+ state_ptr::Ptr{MutexState} +end + +Mutex(state_ptr::DevicePtr{MutexState}) = + Mutex(unwrap_device_ptr(state_ptr)) + +""" + unlock(mutex::Mutex) + +Unlocks a mutex. +""" +function unlock(mutex::Mutex) + threadfence() + tid = get_thread_id() + atomic_compare_exchange!(mutex.state_ptr, UInt32((tid << 1) + 1), UInt32(0)) + return +end + +""" + try_lock(mutex::Mutex)::Bool + +Tries to acquire a lock on a mutex. Returns `true` +if a lock was acquired successfully; otherwise, `false`. +""" +function try_lock(mutex::Mutex)::Bool + tid = UInt32(get_thread_id()) + wsize = warpsize() + threadbit = UInt32(1) << (tid % wsize) + + mask = vote_ballot(true) + + bitset = @cuStaticSharedMem(UInt32, 128) + bitset_ptr = unwrap_device_ptr(pointer(bitset)) + sizeof(UInt32) * div(threadIdx().x - 1, wsize) + unsafe_store!(bitset_ptr, UInt32(0)) + + lock = atomic_or!(mutex.state_ptr, UInt32(1)) + if lock & UInt32(1) == UInt32(0) + # The lock is free. + atomic_exchange!(mutex.state_ptr, UInt32((tid << 1) + 1)) + else + pre_owner = lock >> 1 + if pre_owner != tid + if div(lock, wsize << 1) == div(tid, wsize) && pre_owner > tid && (((mask >> (pre_owner % wsize)) & UInt32(1)) == UInt32(1)) + atomic_or!(bitset_ptr, UInt32(1 << (pre_owner % wsize))) + atomic_exchange!(mutex.state_ptr, UInt32((tid << 1) + 1)) + if (atomic_or!(mutex.state_ptr, UInt32(0)) >> 1) != tid + # Stealing failed. + atomic_or!(bitset_ptr, threadbit) + end + else + # Cannot steal. + atomic_or!(bitset_ptr, threadbit) + end + end + end + + if (unsafe_load(bitset_ptr) & threadbit) == UInt32(0) + threadfence() + return true + else + atomic_compare_exchange!(mutex.state_ptr, (tid << 1) + UInt32(1), UInt32(0)) + threadfence() + return false + end +end diff --git a/src/execution.jl b/src/execution.jl index 1783669a..34fc449e 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -8,8 +8,8 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nearest_warpsize # split keyword arguments to `@cuda` into ones affecting the macro itself, the compiler and # the code it generates, or the execution function split_kwargs(kwargs) - macro_kws = [:dynamic] - compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name] + macro_kws = [:dynamic, :init, :gc_config] + compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name, :malloc, :gc] call_kws = [:cooperative, :blocks, :threads, :config, :shmem, :stream] macro_kwargs = [] compiler_kwargs = [] @@ -90,6 +90,9 @@ performed, scheduling a kernel launch on the current CUDA context. Several keyword arguments are supported that influence the behavior of `@cuda`. - `dynamic`: use dynamic parallelism to launch device-side kernels +- `gc`: set up a GC and use it to allocate memory; cannot be combined with `dynamic` +- `gc_config`: the GC configuration to use if `gc=true`; see [`GCConfiguration`](@ref) +- `malloc`: the name of the allocation function to use, if `gc` is not in use - arguments that influence kernel compilation: see [`cufunction`](@ref) and [`dynamic_cufunction`](@ref) - arguments that influence kernel launch: see [`CUDAnative.HostKernel`](@ref) and @@ -104,6 +107,7 @@ kernel to determine the launch configuration. A host-side kernel launch is done kernel_args = cudaconvert.(args) kernel_tt = Tuple{Core.Typeof.(kernel_args)...} kernel = cufunction(f, kernel_tt; compilation_kwargs) + prepare_kernel(kernel; environment_kwargs) kernel(kernel_args...; launch_kwargs) end @@ -132,20 +136,15 @@ macro cuda(ex...) 
args = call.args[2:end] code = quote end - macro_kwargs, compiler_kwargs, call_kwargs = split_kwargs(kwargs) + env_kwargs, compiler_kwargs, call_kwargs = split_kwargs(kwargs) vars, var_exprs = assign_args!(code, args) # handle keyword arguments that influence the macro's behavior - dynamic = false - for kwarg in macro_kwargs - key,val = kwarg.args - if key == :dynamic - isa(val, Bool) || throw(ArgumentError("`dynamic` keyword argument to @cuda should be a constant value")) - dynamic = val::Bool - else - throw(ArgumentError("Unsupported keyword argument '$key'")) - end - end + dynamic = get_kwarg_or_default(env_kwargs, :dynamic, false) + isa(dynamic, Bool) || throw(ArgumentError("`dynamic` keyword argument to @cuda should be a constant Boolean")) + + gc = get_kwarg_or_default(compiler_kwargs, :gc, false) + isa(gc, Bool) || throw(ArgumentError("`gc` keyword argument to @cuda should be a constant Boolean")) if dynamic # FIXME: we could probably somehow support kwargs with constant values by either @@ -153,14 +152,98 @@ macro cuda(ex...) # IR when processing the dynamic parallelism marker isempty(compiler_kwargs) || error("@cuda dynamic parallelism does not support compiler keyword arguments") + # FIXME: update the GC to support dynamic parallelism somehow. + !gc || error("@cuda does not support both `gc=true` and `dynamic=true`") + # dynamic, device-side kernel launch push!(code.args, quote # we're in kernel land already, so no need to cudaconvert arguments local kernel_tt = Tuple{$((:(Core.Typeof($var)) for var in var_exprs)...)} local kernel = dynamic_cufunction($(esc(f)), kernel_tt) + prepare_kernel(kernel; $(map(esc, env_kwargs)...)) kernel($(var_exprs...); $(map(esc, call_kwargs)...)) end) + elseif gc + # Find the stream on which the kernel is to be scheduled. + stream = get_kwarg_or_default(call_kwargs, :stream, CuDefaultStream()) + + # Get the total number of threads. + thread_count = get_kwarg_or_default(call_kwargs, :threads, 1) + + # Get the GC configuration. + config = get_kwarg_or_default(env_kwargs, :gc_config, GCConfiguration()) + + # GC-enabled host-side launch. + push!(code.args, + quote + GC.@preserve $(vars...) begin + # Define a trivial buffer that contains the interrupt state. + local interrupt_buffer = CUDAdrv.Mem.alloc(CUDAdrv.Mem.HostBuffer, sizeof(ready), CUDAdrv.Mem.HOSTALLOC_DEVICEMAP) + local interrupt_pointer = Base.unsafe_convert(Ptr{UInt32}, interrupt_buffer) + unsafe_store!(interrupt_pointer, ready) + local device_interrupt_pointer = Base.unsafe_convert(CuPtr{UInt32}, interrupt_buffer) + + # Evaluate the GC configuration. + local gc_config = $(esc(config)) + + # Allocate a shared buffer for GC memory. + local gc_memory_size = initial_heap_size(gc_config, prod($(esc(thread_count)))) + local gc_heap = GCHeapDescription() + expand!(gc_heap, gc_memory_size) + local master_record = gc_init!(gc_heap, gc_config, prod($(esc(thread_count)))) + + # Define a kernel initialization function. + local function kernel_init(kernel) + # Set the interrupt state pointer. + try + global_handle = CuGlobal{CuPtr{UInt32}}(kernel.mod, "interrupt_pointer") + set(global_handle, device_interrupt_pointer) + catch exception + # The interrupt pointer may not have been declared (because it is unused). + # In that case, we should do nothing. + if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end + + # Set the GC master record. 
+ try + global_handle = CuGlobal{GCMasterRecord}(kernel.mod, "gc_master_record") + set(global_handle, master_record) + catch exception + # The GC info pointer may not have been declared (because it is unused). + # In that case, we should do nothing. + if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end + end + + local gc_report = GCReport() + local function handle_interrupt() + gc_collect_impl(master_record, gc_heap, gc_config, gc_report) + end + + try + # Standard kernel setup logic. + local kernel_args = CUDAnative.cudaconvert.(($(var_exprs...),)) + local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} + local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; malloc="ptx_gc_malloc", $(map(esc, compiler_kwargs)...)) + CUDAnative.prepare_kernel(kernel; init=kernel_init, $(map(esc, env_kwargs)...)) + gc_report.elapsed_time = Base.@elapsed begin + kernel(kernel_args...; $(map(esc, call_kwargs)...)) + + # Handle interrupts. + handle_interrupts(handle_interrupt, interrupt_pointer, $(esc(stream))) + end + finally + CUDAdrv.Mem.free(interrupt_buffer) + free!(gc_heap) + end + gc_report + end + end) else # regular, host-side kernel launch # @@ -173,6 +256,7 @@ macro cuda(ex...) local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} local kernel = cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) + prepare_kernel(kernel; $(map(esc, env_kwargs)...)) kernel(kernel_args...; $(map(esc, call_kwargs)...)) end end) @@ -447,9 +531,25 @@ end return ex end +""" + prepare_kernel(kernel::Kernel{F,TT}; init::Function=nop_init_kernel) + +Prepares a kernel for execution by setting up an environment for that kernel. +This function should be invoked just prior to running the kernel. Its +functionality is included in [`@cuda`](@ref). + +The 'init' keyword argument is a function that takes a kernel as argument and +sets up an environment for the kernel. +""" +function prepare_kernel(kernel::AbstractKernel{F,TT}; init::Function=nop_init_kernel, kw...) where {F,TT} + # Just call the 'init' function for now. + init(kernel) +end ## device-side API +# There doesn't seem to be a way to access the documentation for the call-syntax, +# so attach it to the type """ dynamic_cufunction(f, tt=Tuple{}) @@ -503,3 +603,8 @@ function nearest_warpsize(dev::CuDevice, threads::Integer) ws = CUDAdrv.warpsize(dev) return threads + (ws - threads % ws) % ws end + +function nop_init_kernel(kernel::AbstractKernel{F,TT}) where {F,TT} + # Do nothing. + return +end \ No newline at end of file diff --git a/src/gc.jl b/src/gc.jl new file mode 100644 index 00000000..0564097b --- /dev/null +++ b/src/gc.jl @@ -0,0 +1,1192 @@ +# This file contains a GC implementation for CUDAnative kernels. +# The sections below contain some basic info on how the garbage +# collector works. +# +# MEMORY ALLOCATION +# +# The GC's allocator uses free lists, i.e., the allocator maintains +# a list of all blocks that have not been allocated. Additionally, +# the allocator also maintains a list of all allocated blocks, so +# the collector knows which blocks it can free. +# +# GARBAGE COLLECTION +# +# The garbage collector itself is a semi-conservative, non-moving, +# mark-and-sweep, stop-the-world GC that runs on the host. +# The device may trigger the GC via an interrupt. +# +# The GC is semi-conservative in the sense that its set of roots +# is precise but objects are scanned in an imprecise way. 
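+# Roughly speaking, the roots recorded in the per-thread root buffers are exact,
+# whereas the contents of reachable objects are scanned conservatively: any word
+# that happens to point into GC-managed memory is treated as a reference.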
+#
+# After every garbage collection, the GC will compact free lists:
+# adjacent free list blocks will be merged and the free list will
+# be sorted based on block sizes to combat memory fragmentation.
+#
+# If a free list is deemed to be "starving" after a collection, i.e.,
+# its total amount of free bytes has dropped below some threshold,
+# then a fresh chunk of GC-managed memory is allocated and added to
+# the free list.
+#
+# SAFEPOINTS
+#
+# Every warp gets a flag that tells if that warp is in a safepoint.
+# When a collection is triggered, the collector waits for every warp
+# to reach a safepoint. The warps indicate that they have reached a
+# safepoint by setting the flag.
+#
+# MISCELLANEOUS
+#
+# Some miscellaneous GPU-related GC implementation details:
+#
+# * GC memory is shared by the host and device.
+# * Every thread gets a fixed region of memory for storing GC roots in.
+# * When the device runs out of GC memory, it requests an interrupt
+#   to mark and sweep.
+
+export gc_malloc, gc_malloc_object, gc_collect, gc_safepoint, GCConfiguration
+
+import Base: length, show
+import Printf: @sprintf
+
+# A data structure that precedes every chunk of memory that has been
+# allocated or put into the free list.
+struct FreeListRecord
+    # The size of the memory region this allocation record precedes.
+    # This size does not include the allocation record itself.
+    size::Csize_t
+
+    # A pointer to the next allocation record in the list. If this
+    # allocation record is part of the free list, then this pointer
+    # points to the next free list entry; otherwise, it points to the
+    # next entry in the list of allocated blocks.
+    next::Ptr{FreeListRecord}
+end
+
+@generated function get_field_pointer_impl(base_pointer::Ptr{TBase}, ::Val{field_name}) where {TBase, field_name}
+    index = Base.fieldindex(TBase, field_name)
+    offset = Base.fieldoffset(TBase, index)
+    type = Core.fieldtype(TBase, index)
+    :(Base.unsafe_convert(Ptr{$type}, base_pointer + $(offset)))
+end
+
+# Gets a pointer to a particular field.
+macro get_field_pointer(base_pointer, field_name)
+    :(get_field_pointer_impl($(esc(base_pointer)), Val($field_name)))
+end
+
+# Gets a pointer to the first byte of data managed by an allocation record.
+function data_pointer(record::Ptr{FreeListRecord})::Ptr{UInt8}
+    Base.unsafe_convert(Ptr{UInt8}, record) + sizeof(FreeListRecord)
+end
+
+# Takes a pointer to the first byte of data managed by an allocation record
+# and produces a pointer to the record itself.
+function record_pointer(data::Ptr{UInt8})::Ptr{FreeListRecord}
+    Base.unsafe_convert(Ptr{FreeListRecord}, data) - sizeof(FreeListRecord)
+end
+
+# Gets a pointer to the first byte past the data managed by an allocation record.
+function data_end_pointer(record::Ptr{FreeListRecord})::Ptr{UInt8}
+    data_pointer(record) + unsafe_load(@get_field_pointer(record, :size))
+end
+
+# A data structure that describes a single GC "arena", i.e.,
+# a section of the heap that is managed by the GC. Every arena
+# has its own free list and allocation list.
+struct FreeListArena
+    # The allocation lock for the arena.
+    lock_state::ReaderWriterLockState
+
+    # The head of the free list.
+    free_list_head::Ptr{FreeListRecord}
+
+    # The head of the allocation list.
+    allocation_list_head::Ptr{FreeListRecord}
+end
+
+# Gets a free list arena's lock.
+get_lock(arena::Ptr{FreeListArena}) = ReaderWriterLock(@get_field_pointer(arena, :lock_state))
+
+const gc_align = Csize_t(16)
+
+# Aligns a pointer to an alignment boundary.
+function align_downward(address::Ptr{T}, alignment::Csize_t = gc_align)::Ptr{T} where T + address_int = Base.convert(Csize_t, address) + remainder = address_int % alignment + if remainder == Csize_t(0) + return address + else + return address + alignment - remainder + end +end + +# Aligns a pointer to an alignment boundary. +function align_upward(address::Ptr{T}, alignment::Csize_t = gc_align)::Ptr{T} where T + result = align_downward(address, alignment) + if result < address + result += alignment + end + result +end + +# Aligns a pointer to an alignment boundary. +function align_upward(offset::T, alignment::Csize_t = gc_align)::T where T <: Integer + convert(T, Csize_t(align_upward(convert(Ptr{UInt8}, Csize_t(offset)), alignment))) +end + +# Gets the size of an aligned header, including padding to satisfy +# alignment requirements. +@generated function header_size(::Type{T}, ::Val{alignment} = Val(gc_align))::UInt32 where {T, alignment} + result = align_upward(UInt32(sizeof(T)), alignment) + :($result) +end + +# A reference to a Julia object. +const ObjectRef = Ptr{Nothing} + +# A GC frame is just a pointer to an array of Julia objects. +const GCFrame = Ptr{ObjectRef} + +# The states a safepoint flag can have. +@enum SafepointState::UInt32 begin + # Indicates that a warp is not in a safepoint. + not_in_safepoint = 0 + # Indicates that a warp is in a safepoint. This + # flag will be reset to `not_in_safepoint` by the + # collector on the next collecotr. + in_safepoint = 1 + # Indicates that a warp is in a perma-safepoint: + # the collector will not try to set this type + # of safepoint back to `not_in_safepoint`. + in_perma_safepoint = 2 +end + +const LocalArena = FreeListArena +const GlobalArena = FreeListArena + +# A data structure that contains global GC info. This data +# structure is designed to be immutable: it should not be changed +# once the host has set it up. +struct GCMasterRecord + # The number of warps. + warp_count::UInt32 + + # The number of threads. + thread_count::UInt32 + + # The maximum size of a GC root buffer, i.e., the maximum number + # of roots per thread. + root_buffer_capacity::UInt32 + + # The number of local arenas. + local_arena_count::UInt32 + + # A pointer to a list of local GC arena pointers. + local_arenas::Ptr{Ptr{LocalArena}} + + # A pointer to the global GC arena. + global_arena::Ptr{GlobalArena} + + # A pointer to a list of safepoint flags. Every warp has its + # own flag. + safepoint_flags::Ptr{SafepointState} + + # A pointer to a list of root buffer pointers that point to the + # end of the root buffer for every thread. + root_buffer_fingers::Ptr{Ptr{ObjectRef}} + + # A pointer to a list of buffers that can be used to store GC roots in. + # These root buffers are partitioned into GC frames later on. + root_buffers::Ptr{ObjectRef} +end + +# Iterates through all arena pointers stored in a GC master record. +@inline function iterate_arenas(fun::Function, master_record::GCMasterRecord) + for i in 1:master_record.local_arena_count + fun(unsafe_load(master_record.local_arenas, i)) + end + fun(master_record.global_arena) +end + +# Gets the global GC interrupt lock. +@inline function get_interrupt_lock()::ReaderWriterLock + return ReaderWriterLock(@cuda_global_ptr("gc_interrupt_lock", ReaderWriterLockState)) +end + +# Runs a function in such a way that no collection phases will +# run as long as the function is executing. Use with care: this +# macro acquires the GC interrupt lock in reader mode, so careless +# use may cause deadlocks. 
+macro nocollect(func) + quote + local @inline function lock_callback() + $(esc(func)) + end + + reader_locked(lock_callback, get_interrupt_lock()) + end +end + +# Gets the GC master record. +@inline function get_gc_master_record()::GCMasterRecord + return unsafe_load(@cuda_global_ptr("gc_master_record", GCMasterRecord)) +end + +# Gets a pointer to the local arena for this thread. This +# pointer may be null if there are no local arenas. +@inline function get_local_arena()::Ptr{LocalArena} + master_record = get_gc_master_record() + if master_record.local_arena_count == UInt32(0) + return Base.unsafe_convert(Ptr{LocalArena}, C_NULL) + else + return unsafe_load( + master_record.local_arenas, + ((get_warp_id() - 1) % master_record.local_arena_count) + 1) + end +end + +""" + new_gc_frame(size::UInt32)::GCFrame + +Allocates a new GC frame. +""" +@inline function new_gc_frame(size::UInt32)::GCFrame + master_record = get_gc_master_record() + # Return the root buffer tip: that's where the new GC frame starts. + return unsafe_load(master_record.root_buffer_fingers, get_thread_id()) +end + +""" + push_gc_frame(gc_frame::GCFrame, size::UInt32) + +Registers a GC frame with the garbage collector. +""" +@inline function push_gc_frame(gc_frame::GCFrame, size::UInt32) + master_record = get_gc_master_record() + + threadid = get_thread_id() + next_rootbuf_start = master_record.root_buffers + threadid * master_record.root_buffer_capacity * sizeof(Ptr{ObjectRef}) + new_rootbuf_finger = gc_frame + size * sizeof(ObjectRef) + + # Check that we have enough room to push the GC frame. + if new_rootbuf_finger >= next_rootbuf_start + @cuprintf("Root buffer overflow in thread %ld.\n", threadid) + return + end + + # Update the root buffer tip. + unsafe_store!( + master_record.root_buffer_fingers, + new_rootbuf_finger, + threadid) + return +end + +""" + pop_gc_frame(gc_frame::GCFrame) + +Deregisters a GC frame. +""" +@inline function pop_gc_frame(gc_frame::GCFrame) + master_record = get_gc_master_record() + + # Update the root buffer tip. + unsafe_store!( + master_record.root_buffer_fingers, + gc_frame, + get_thread_id()) + return +end + +""" + gc_safepoint() + +Signals that this warp has reached a GC safepoint. +""" +function gc_safepoint() + wait_for_interrupt() do + gc_set_safepoint_flag(in_safepoint; overwrite = false) + end + return +end + +""" + gc_perma_safepoint() + +Signals that this warp has reached a GC perma-safepoint: +the GC doesn't need to wait for this warp to reach a safepoint +before starting collections. Instead, the GC may assume that +the warp is already in a safepoint. + +Be careful with this function: all bets are off when this +function is used improperly. For a more controlled (but still +super dangerous) way to use perma-safepoints, see the +`@perma_safepoint` macro. +""" +function gc_perma_safepoint() + gc_set_safepoint_flag(in_perma_safepoint) + return +end + +# Sets this warp's safepoint flag to a particular state. +function gc_set_safepoint_flag(value::SafepointState; overwrite::Bool = true) + master_record = get_gc_master_record() + warp_id = get_warp_id() + safepoint_flag_ptr = master_record.safepoint_flags + sizeof(SafepointState) * (warp_id - 1) + if overwrite + volatile_store!(safepoint_flag_ptr, value) + else + atomic_compare_exchange!(safepoint_flag_ptr, not_in_safepoint, value) + end + return +end + +# Marks a region as a perma-safepoint: the entire region +# is a safepoint. Note that perma-safepoints are not allowed +# to include non-perma-safepoints. 
+macro perma_safepoint(expr) + quote + gc_perma_safepoint() + local result = $(esc(expr)) + gc_set_safepoint_flag(not_in_safepoint) + result + end +end + +# Tries to use a free-list entry to allocate a chunk of data of size `bytesize`, +# producing an appropriately-sized free list entry that prefixes the data. This +# entry is removed from the free list but not yet added to the allocation list. +function gc_take_list_entry( + entry_ptr::Ptr{Ptr{FreeListRecord}}, + entry::Ptr{FreeListRecord}, + bytesize::Csize_t)::Ptr{FreeListRecord} + + entry_data = unsafe_load(entry) + if entry_data.size < bytesize + # The entry is just too small. Return a `null` pointer. + return C_NULL + end + + # The entry's big enough, so we'll use it. If at all possible, we want + # to create a new entry from any unused memory in the entry. + + # Compute the address to return. + data_address = data_pointer(entry) + + # Compute the end of the free memory chunk. + end_address = data_address + entry_data.size + + # Compute the start address of the new free list entry. The data + # prefixed by the block needs to be aligned to a 16-byte boundary, + # but the block itself doesn't. + new_data_address = align_downward(data_address + bytesize) + new_entry_address = new_data_address - sizeof(FreeListRecord) + if new_entry_address < data_address + bytesize + new_entry_address += gc_align + new_data_address += gc_align + end + + # If we can place a new entry just past the allocation, then we should + # by all means do so. + if new_data_address < end_address + # Create a new free list entry. + new_entry_size = Csize_t(end_address) - Csize_t(new_data_address) + new_entry_ptr = Base.unsafe_convert(Ptr{FreeListRecord}, new_entry_address) + unsafe_store!( + new_entry_ptr, + FreeListRecord(new_entry_size, entry_data.next)) + + # Update this entry's `size` field to reflect the new entry's space + # requirements. + unsafe_store!( + @get_field_pointer(entry, :size)::Ptr{Csize_t}, + Csize_t(new_entry_address) - Csize_t(data_address)) + + # Update the free list pointer. + unsafe_store!(entry_ptr, new_entry_ptr) + else + # We can't create a new entry, but we still have to update the free + # list pointer. + unsafe_store!(entry_ptr, entry_data.next) + end + + return entry +end + +# Prepends a free list record to a free list. +function gc_add_to_free_list( + entry::Ptr{FreeListRecord}, + list_ptr::Ptr{Ptr{FreeListRecord}}) + + # Set the `next` pointer to the value stored at the allocation list pointer. + unsafe_store!( + @get_field_pointer(entry, :next)::Ptr{Ptr{FreeListRecord}}, + unsafe_load(list_ptr)) + + # Update the allocation list pointer to point to the entry. + unsafe_store!(list_ptr, entry) +end + +# Tries to allocate a chunk of memory from a free list. +# Returns a null pointer if no sufficiently large chunk of +# memory can be found. +# If the result is non-null, then a free list record is +# returned that has been taken from the free list but not +# yet added to another list. +function gc_take_any_list_entry( + free_list_ptr::Ptr{Ptr{FreeListRecord}}, + bytesize::Csize_t)::Ptr{FreeListRecord} + + # To allocate memory, we will walk the free list until we find a suitable candidate. 
+ while true + free_list_item = unsafe_load(free_list_ptr) + + if free_list_item == C_NULL + return C_NULL + end + + result = gc_take_list_entry(free_list_ptr, free_list_item, bytesize) + if result != C_NULL + return result + end + + free_list_ptr = @get_field_pointer(free_list_item, :next)::Ptr{Ptr{FreeListRecord}} + end +end + +# Tries to allocate a chunk of memory from a free list. +# Returns a null pointer if no sufficiently large chunk of +# memory can be found. +# +# This function is not thread-safe. +function gc_malloc_from_free_list(arena::Ptr{FreeListArena}, bytesize::Csize_t)::Ptr{UInt8} + free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{FreeListRecord}} + allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{FreeListRecord}} + + # Try to take the entry out of the free list. + result_entry = gc_take_any_list_entry(free_list_ptr, bytesize) + if result_entry == C_NULL + # The entry is just too small. Return a `null` pointer. + return C_NULL + end + + # At this point, all we need to do is update the allocation record to + # reflect the fact that it now represents an allocated block instead of + # a free block. + gc_add_to_free_list(result_entry, allocation_list_ptr) + + return data_pointer(result_entry) +end + +# Writes a pointer to a temporary GC frame. This will keep the pointer +# from getting collected until the caller has a chance to add it to its +# own GC frame. +function gc_protect(pointer::Ptr{UInt8}) + if pointer != Base.unsafe_convert(Ptr{UInt8}, C_NULL) + gc_frame = new_gc_frame(UInt32(1)) + unsafe_store!(gc_frame, Base.unsafe_convert(ObjectRef, pointer)) + end +end + +# Tries to allocate a chunk of memory in a particular GC arena. +# Returns a null pointer if no sufficiently large chunk of +# memory can be found. +function gc_malloc_local(arena::Ptr{FreeListArena}, bytesize::Csize_t; acquire_lock=true)::Ptr{UInt8} + # Acquire the arena's lock. + result_ptr = writer_locked(get_lock(arena); acquire_lock=acquire_lock) do + # Allocate a suitable region of memory. + gc_malloc_from_free_list(arena, bytesize) + end + + # If the resulting pointer is non-null, then we'll write it to a temporary GC frame. + # Our reasoning for doing this is that doing so ensures that the allocated memory + # won't get collected by the GC before the caller has a chance to add it to its + # own GC frame. + gc_protect(result_ptr) + return result_ptr +end + +# Transfers a block of free memory from one arena to another and then +# allocates a differently-sized block of memory from the destination +# arena. +function gc_transfer_and_malloc( + from_arena::Ptr{FreeListArena}, + to_arena::Ptr{FreeListArena}, + transfer_bytesize::Csize_t, + alloc_bytesize::Csize_t)::Ptr{UInt8} + + from_free_list = @get_field_pointer(from_arena, :free_list_head)::Ptr{Ptr{FreeListRecord}} + entry = writer_locked(get_lock(from_arena)) do + # Try to take the entry out of the free list. + gc_take_any_list_entry(from_free_list, transfer_bytesize) + end + + if entry == C_NULL + return C_NULL + else + to_free_list = @get_field_pointer(to_arena, :free_list_head)::Ptr{Ptr{FreeListRecord}} + return writer_locked(get_lock(to_arena)) do + gc_add_to_free_list(entry, to_free_list) + gc_malloc_local(to_arena, alloc_bytesize; acquire_lock=false) + end + end +end + +""" + gc_malloc(bytesize::Csize_t)::Ptr{UInt8} + +Allocates a blob of memory that is managed by the garbage collector. +This function is designed to be called by the device. 
+""" +function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} + master_record = get_gc_master_record() + + function allocate() + # Try to allocate in the local arena second. If that doesn't + # work, we'll move on to the global arena, which is bigger but + # is shared by all threads. (We want to minimize contention + # on the global arena's lock.) + local_arena = get_local_arena() + if local_arena != C_NULL + local_ptr = gc_malloc_local(local_arena, bytesize) + if local_ptr != C_NULL + return local_ptr + end + else + # If there is no local arena then we will just have to allocate + # from the global arena directly. + return gc_malloc_local(master_record.global_arena, bytesize) + end + + # Try to use the global arena if all else fails, but only if the chunk + # of memory we want to allocate is sufficiently large. Allocating lots of + # small chunks in the global arena will result in undue contention and slow + # down kernels dramatically. + # + # If we need to allocate a small chunk of memory but the local arena is + # empty, then we will transfer a *much* larger chunk of memory from the global + # arena to the local arena. After that we'll allocate in the local arena. + min_global_alloc_size = Csize_t(256 * (1 << 10)) + if bytesize >= min_global_alloc_size + local_ptr = gc_malloc_local(master_record.global_arena, bytesize) + else + local_ptr = gc_transfer_and_malloc( + master_record.global_arena, + local_arena, + min_global_alloc_size, + bytesize) + end + return local_ptr + end + + # Try to malloc the object without host intervention. + ptr = @perma_safepoint @nocollect allocate() + if ptr != C_NULL + return ptr + end + + # We're out of memory, which means that we need the garbage collector + # to step in. Set a perma-safepoint and acquire the interrupt lock. + ptr = @perma_safepoint writer_locked(get_interrupt_lock()) do + # Try to allocate memory again. This is bound to fail for the + # first thread that acquires the interrupt lock, but it is quite + # likely to succeed if we are *not* in the first thread that + # acquired the garbage collector lock. + ptr2 = allocate() + + if ptr2 == C_NULL + # We are either the first thread to acquire the interrupt lock + # or the additional memory produced by a previous collection has + # already been exhausted. Trigger the garbage collector. + gc_collect_impl() + + # Try to malloc again. + ptr2 = gc_malloc_local(master_record.global_arena, bytesize) + end + ptr2 + end + if ptr != C_NULL + return ptr + end + + # Alright, so that was a spectacular failure. Let's just throw an exception. + @cuprintf("ERROR: Out of GPU GC memory (trying to allocate %i bytes)\n", bytesize) + # throw(OutOfMemoryError()) + return C_NULL +end + +""" + gc_malloc_object(bytesize::Csize_t) + +Allocates an object that is managed by the garbage collector. +This function is designed to be called by the device. +""" +function gc_malloc_object(bytesize::Csize_t) + unsafe_pointer_to_objref(gc_malloc(bytesize)) +end + +# Zero-fills a range of memory. +function zero_fill!(start_ptr::Ptr{UInt8}, size::Csize_t) + ccall(:memset, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), start_ptr, 0, size) +end + +# Zero-fills a range of memory. +function zero_fill!(start_ptr::Ptr{UInt8}, end_ptr::Ptr{UInt8}) + zero_fill!(start_ptr, Csize_t(end_ptr) - Csize_t(start_ptr)) +end + +# Tries to free a block of memory from a particular arena. `record_ptr` +# must point to a pointer to the GC allocation record to free. It will +# be updated to point to the next allocation. 
+# +# This function is designed to be called by the host: it does not +# turn off collections. It can be called by the device, but in that +# case it should be prefixed by the `@nocollect` macro followed by +# a write lock acquisition on the arena's lock. +function gc_free_local( + arena::Ptr{FreeListArena}, + record_ptr::Ptr{Ptr{FreeListRecord}}) + + record = unsafe_load(record_ptr) + next_record_ptr = @get_field_pointer(record, :next) + free_list_head_ptr = @get_field_pointer(arena, :free_list_head) + + # Remove the record from the allocation list. + unsafe_store!(record_ptr, unsafe_load(next_record_ptr)) + + # Add the record to the free list and update its `next` pointer + # (but not in that order). + unsafe_store!(next_record_ptr, unsafe_load(free_list_head_ptr)) + unsafe_store!(free_list_head_ptr, record) + + # Zero-fill the newly freed block of memory. + zero_fill!(data_pointer(record), unsafe_load(@get_field_pointer(record, :size))) +end + +# Like 'gc_collect', but does not acquire the interrupt lock. +function gc_collect_impl() + interrupt_or_wait() + threadfence_system() +end + +""" + gc_collect() + +Triggers a garbage collection phase. This function is designed +to be called by the device rather than by the host. +""" +function gc_collect() + writer_locked(gc_collect_impl, get_interrupt_lock()) +end + +# One megabyte. +const MiB = 1 << 20 + +# A description of a region of memory that has been allocated to the GC heap. +const GCHeapRegion = CUDAdrv.Mem.HostBuffer + +# A description of all memory that has been allocated to the GC heap. +struct GCHeapDescription + # A list of the set of regions that comprise the GC heap. + regions::Array{GCHeapRegion, 1} +end + +GCHeapDescription() = GCHeapDescription([]) + +# A data structure that contains GC configuration parameters. +struct GCConfiguration + # The number of local arenas to create. + local_arena_count::Int + + # The max number of roots that can be stored per thread. + root_buffer_capacity::Int + + # The point at which the global arena is deemed to be starving, i.e., + # it no longer contains enough memory to perform basic allocations. + # If the global arena's free byte count stays below the arena starvation + # threshold after a collection phase, the collector will allocate + # additional memory to the arena such that it is no longer starving. + global_arena_starvation_threshold::Int + + # The initial size of the global arena, in bytes. + global_arena_initial_size::Int + + # The point at which a local arena is deemed to be starving, i.e., + # it no longer contains enough memory to perform basic allocations. + # If a local arena's free byte count stays below the arena starvation + # threshold after a collection phase, the collector will allocate + # additional memory to the arena such that it is no longer starving. + local_arena_starvation_threshold::Int + + # The initial size of a local arena, in bytes. + local_arena_initial_size::Int +end + +# Creates a GC configuration. 
+function GCConfiguration(;
+    local_arena_count::Integer = 8,
+    root_buffer_capacity::Integer = 256,
+    global_arena_starvation_threshold::Integer = 4 * MiB,
+    global_arena_initial_size::Integer = 2 * MiB,
+    local_arena_starvation_threshold::Integer = 1 * MiB,
+    local_arena_initial_size::Integer = 1 * MiB)
+
+    GCConfiguration(
+        local_arena_count,
+        root_buffer_capacity,
+        global_arena_starvation_threshold,
+        global_arena_initial_size,
+        local_arena_starvation_threshold,
+        local_arena_initial_size)
+end
+
+function initial_heap_size(config::GCConfiguration, thread_count::Integer)
+    warp_count = Base.ceil(UInt32, thread_count / CUDAdrv.warpsize(device()))
+    local_arenas_bytesize = sizeof(Ptr{LocalArena}) * config.local_arena_count
+    safepoint_bytesize = sizeof(SafepointState) * warp_count
+    fingerbuf_bytesize = sizeof(Ptr{ObjectRef}) * thread_count
+    rootbuf_bytesize = sizeof(ObjectRef) * config.root_buffer_capacity * thread_count
+
+    result = 0
+    result += local_arenas_bytesize
+    result += safepoint_bytesize
+    result += fingerbuf_bytesize
+    result += rootbuf_bytesize
+    result += config.local_arena_count * config.local_arena_initial_size
+    result += config.global_arena_initial_size
+    return result
+end
+
+# Initializes a GC heap and produces a master record.
+function gc_init!(
+    heap::GCHeapDescription,
+    config::GCConfiguration,
+    thread_count::Integer)::GCMasterRecord
+
+    warp_count = Base.ceil(UInt32, thread_count / CUDAdrv.warpsize(device()))
+
+    master_region = heap.regions[1]
+
+    gc_memory_start_ptr = pointer(master_region)
+    gc_memory_end_ptr = pointer(master_region) + sizeof(master_region)
+
+    # Allocate a local arena pointer buffer.
+    local_arenas_bytesize = sizeof(Ptr{LocalArena}) * config.local_arena_count
+    local_arenas_ptr = Base.unsafe_convert(Ptr{Ptr{LocalArena}}, gc_memory_start_ptr)
+
+    # Allocate the safepoint flag buffer.
+    safepoint_bytesize = sizeof(SafepointState) * warp_count
+    safepoint_ptr = Base.unsafe_convert(Ptr{SafepointState}, local_arenas_ptr + local_arenas_bytesize)
+
+    # Allocate root buffers.
+    fingerbuf_bytesize = sizeof(Ptr{ObjectRef}) * thread_count
+    fingerbuf_ptr = Base.unsafe_convert(Ptr{Ptr{ObjectRef}}, safepoint_ptr + safepoint_bytesize)
+    rootbuf_bytesize = sizeof(ObjectRef) * config.root_buffer_capacity * thread_count
+    rootbuf_ptr = Base.unsafe_convert(Ptr{ObjectRef}, fingerbuf_ptr + fingerbuf_bytesize)
+
+    # Populate the root buffer fingers.
+    for i in 1:thread_count
+        unsafe_store!(fingerbuf_ptr, rootbuf_ptr + (i - 1) * sizeof(ObjectRef) * config.root_buffer_capacity, i)
+    end
+
+    # Compute a pointer to the start of the arena memory.
+    arena_start_ptr = rootbuf_ptr + rootbuf_bytesize
+
+    # Set up local arenas.
+    for i in 1:config.local_arena_count
+        local_arena = make_gc_arena!(LocalArena, arena_start_ptr, Csize_t(config.local_arena_initial_size))
+        unsafe_store!(local_arenas_ptr, local_arena, i)
+        arena_start_ptr += config.local_arena_initial_size
+    end
+
+    # Set up the global arena.
+    global_arena = make_gc_arena!(GlobalArena, arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr))
+
+    return GCMasterRecord(
+        warp_count,
+        UInt32(thread_count),
+        UInt32(config.root_buffer_capacity),
+        UInt32(config.local_arena_count),
+        local_arenas_ptr,
+        global_arena,
+        safepoint_ptr,
+        fingerbuf_ptr,
+        rootbuf_ptr)
+end
+
+# Takes a zero-filled region of memory and turns it into a block
+# managed by the GC, prefixed with an allocation record.
+function make_gc_block!(start_ptr::Ptr{T}, size::Csize_t)::Ptr{FreeListRecord} where T
+    entry = Base.unsafe_convert(Ptr{FreeListRecord}, start_ptr)
+    unsafe_store!(
+        entry,
+        FreeListRecord(
+            Csize_t(start_ptr + size) - Csize_t(data_pointer(entry)),
+            C_NULL))
+    return entry
+end
+
+# Takes a zero-filled region of memory and turns it into an arena
+# managed by the GC, prefixed with an arena record.
+function make_gc_arena!(::Type{FreeListArena}, start_ptr::Ptr{T}, size::Csize_t)::Ptr{FreeListArena} where T
+    # Create a single free list entry.
+    first_entry_ptr = make_gc_block!(start_ptr + sizeof(FreeListArena), size - sizeof(FreeListArena))
+
+    # Set up the arena record.
+    arena = Base.unsafe_convert(Ptr{FreeListArena}, start_ptr)
+    unsafe_store!(
+        arena,
+        FreeListArena(0, first_entry_ptr, C_NULL))
+
+    arena
+end
+
+# Tells if a GC heap contains a particular pointer.
+function contains(heap::GCHeapDescription, ptr::Ptr{T}) where T
+    for region in heap.regions
+        if ptr >= pointer(region) && ptr < pointer(region) + sizeof(region)
+            return true
+        end
+    end
+    return false
+end
+
+# Expands the GC heap by allocating a region of memory and adding it to
+# the list of allocated regions. `size` describes the amount of bytes to
+# allocate. Returns the allocated region.
+function expand!(heap::GCHeapDescription, size::Integer)::GCHeapRegion
+    region = CUDAdrv.Mem.alloc(CUDAdrv.Mem.HostBuffer, size, CUDAdrv.Mem.HOSTALLOC_DEVICEMAP)
+    push!(heap.regions, region)
+    return region
+end
+
+# Frees all memory allocated by a GC heap.
+function free!(heap::GCHeapDescription)
+    for region in heap.regions
+        CUDAdrv.Mem.free(region)
+    end
+end
+
+# A sorted list of all allocation records for allocated blocks.
+# This data structure is primarily useful for rapidly mapping
+# pointers to the allocated blocks that contain them.
+struct SortedAllocationList
+    # An array of pointers to allocation records. The pointers
+    # are all sorted.
+    records::Array{Ptr{FreeListRecord}, 1}
+end
+
+length(alloc_list::SortedAllocationList) = length(alloc_list.records)
+
+# Gets a pointer to the allocation record that manages the memory
+# pointed to by `pointer`. Returns a null pointer if there is no
+# such record.
+function get_record(
+    alloc_list::SortedAllocationList,
+    pointer::Ptr{T})::Ptr{FreeListRecord} where T
+
+    # Deal with these cases quickly so we can assume that the
+    # allocation list is nonempty and `pointer` falls within its range.
+    if length(alloc_list) == 0 ||
+        pointer < data_pointer(alloc_list.records[1]) ||
+        pointer >= data_end_pointer(alloc_list.records[end])
+
+        return C_NULL
+    end
+
+    # To quickly narrow down the search space, we will do a binary search
+    # for the biggest allocation record pointer that is smaller than `pointer`.
+    range_start, range_end = 1, length(alloc_list)
+    while range_end - range_start > 4
+        range_mid = div(range_start + range_end, 2)
+        mid_val = alloc_list.records[range_mid]
+        if mid_val > pointer
+            range_end = range_mid
+        else
+            range_start = range_mid
+        end
+    end
+
+    # Make sure that the pointer actually points to a region of memory
+    # that is managed by the candidate record we found.
+    for record in alloc_list.records[range_start:range_end]
+        if pointer >= data_pointer(record) && pointer < data_end_pointer(record)
+            return record
+        end
+    end
+    return C_NULL
+end
+
+# Iterates through a linked list of allocation records and applies a function
+# to every node in the linked list.
+function iterate_allocation_records(fun::Function, head::Ptr{FreeListRecord}) + while head != C_NULL + fun(head) + head = unsafe_load(head).next + end +end + +# Iterates through all active allocation records in a GC arena. +function iterate_allocated(fun::Function, arena::Ptr{FreeListArena}) + allocation_list_head = unsafe_load(arena).allocation_list_head + iterate_allocation_records(fun, allocation_list_head) +end + +# Iterates through all free allocation records in a GC arena. +function iterate_free(fun::Function, arena::Ptr{FreeListArena}) + free_list_head = unsafe_load(arena).free_list_head + iterate_allocation_records(fun, free_list_head) +end + +# Takes a GC master record and constructs a sorted allocation list +# based on it. +function sort_allocation_list(master_record::GCMasterRecord)::SortedAllocationList + records = [] + iterate_arenas(master_record) do arena + iterate_allocated(arena) do record + push!(records, record) + end + end + sort!(records) + return SortedAllocationList(records) +end + +# Frees all dead blocks in an arena. +function gc_free_garbage(arena::Ptr{FreeListArena}, live_blocks::Set{Ptr{FreeListRecord}}) + record_ptr = @get_field_pointer(arena, :allocation_list_head) + while true + record = unsafe_load(record_ptr) + if record == C_NULL + # We've reached the end of the list. + break + end + + if record in live_blocks + # We found a live block. Proceed to the next block. + record_ptr = @get_field_pointer(record, :next) + else + # We found a dead block. Release it. Don't proceed to the + # next block because the current block will change in the + # next iteration of this loop. + gc_free_local(arena, record_ptr) + end + end +end + +# Compact a GC arena's free list. This function will +# 1. merge adjancent free blocks, and +# 2. reorder free blocks to put small blocks at the front +# of the free list, +# 3. tally the total number of free bytes and return that number. +function gc_compact(arena::Ptr{FreeListArena})::Csize_t + # Let's start by creating a list of all free list records. + records = Ptr{FreeListRecord}[] + iterate_free(arena) do record + push!(records, record) + end + + # We now sort those records and loop through the sorted list, + # merging free list entries as we go along. + sort!(records) + + i = 1 + while i < length(records) + first_record = records[i] + second_record = records[i + 1] + if data_end_pointer(first_record) == Base.unsafe_convert(Ptr{UInt8}, second_record) + # We found two adjacent free list entries. Expand the first + # record's size to encompass both entries, zero-fill the second + # record's header and delete it from the list of records. + new_size = Csize_t(data_end_pointer(second_record)) - Csize_t(data_pointer(first_record)) + zero_fill!(data_end_pointer(first_record), data_pointer(second_record)) + unsafe_store!(@get_field_pointer(first_record, :size), new_size) + deleteat!(records, i + 1) + else + i += 1 + end + end + + # Now sort the records based on size. Put the smallest records first to + # discourage fragmentation. + sort!(records; lt = (x, y) -> unsafe_load(x).size < unsafe_load(y).size) + + # Reconstruct the free list as a linked list. + prev_record_ptr = @get_field_pointer(arena, :free_list_head) + for record in records + unsafe_store!(prev_record_ptr, record) + prev_record_ptr = @get_field_pointer(record, :next) + end + unsafe_store!(prev_record_ptr, C_NULL) + + # Compute the total number of free bytes. 
+ return sum(map(record -> unsafe_load(record).size, records)) +end + +# Expands a GC arena by assigning it an additional heap region. +function gc_expand(arena::Ptr{FreeListArena}, region::GCHeapRegion) + extra_record = make_gc_block!(pointer(region), Csize_t(sizeof(region))) + last_free_list_ptr = @get_field_pointer(arena, :free_list_head) + iterate_free(arena) do record + last_free_list_ptr = @get_field_pointer(record, :next) + end + unsafe_store!(last_free_list_ptr, extra_record) +end + +"""A report of the GC's actions.""" +mutable struct GCReport + """The total wall-clock time of a kernel execution.""" + elapsed_time::Float64 + + """The number of collections that were performed.""" + collection_count::Int + + """The total wall-clock time of all collection polls.""" + collection_poll_time::Float64 + + """The total wall-clock time of all collections.""" + collection_time::Float64 + + """The total amount of additional memory allocated to local pools.""" + extra_local_memory::Csize_t + + """The total amount of additional memory allocated to the global pool.""" + extra_global_memory::Csize_t + + GCReport() = new(0.0, 0, 0.0, 0.0, Csize_t(0), Csize_t(0)) +end + +function show(io::IO, report::GCReport) + print(io, "[wall-clock time: $(@sprintf("%.4f", report.elapsed_time)) s; ") + print(io, "collections: $(report.collection_count); ") + poll_percentage = 100 * report.collection_poll_time / report.elapsed_time + print(io, "total poll time: $(@sprintf("%.4f", report.collection_poll_time)) s ($(@sprintf("%.2f", poll_percentage))%); ") + collection_percentage = 100 * report.collection_time / report.elapsed_time + print(io, "total collection time: $(@sprintf("%.4f", report.collection_time)) s ($(@sprintf("%.2f", collection_percentage))%); ") + print(io, "extra local memory: $(div(report.extra_local_memory, MiB)) MiB; ") + print(io, "extra global memory: $(div(report.extra_global_memory, MiB)) MiB]") +end + +# Collects garbage. This function is designed to be called by the host, +# not by the device. +function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, config::GCConfiguration, report::GCReport) + poll_time = Base.@elapsed begin + # First off, we have to wait for all warps to reach a safepoint. Clear + # safepoint flags and wait for warps to set them again. + for i in 0:(master_record.warp_count - 1) + atomic_compare_exchange!( + master_record.safepoint_flags + i * sizeof(SafepointState), + in_safepoint, + not_in_safepoint) + end + safepoint_count = 0 + while safepoint_count != master_record.warp_count + safepoint_count = 0 + for i in 0:(master_record.warp_count - 1) + state = volatile_load(master_record.safepoint_flags + i * sizeof(SafepointState)) + if state != not_in_safepoint + safepoint_count += 1 + end + end + end + end + + collection_time = Base.@elapsed begin + + # The Julia CPU GC is precise and the information it uses for precise + # garbage collection is stored in memory that we should be able to access. + # However, the way the CPU GC stores field information is incredibly + # complicated and replicating that logic here would be a royal pain to + # implement and maintain. Ideally, the CPU GC would expose an interface that + # allows us to point to an object and ask the GC for all GC-tracked pointers + # it contains. Alas, no such luck: the CPU GC doesn't even have an internal + # function that does that. The CPU GC's logic for finding GC-tracked pointer + # fields is instead fused tightly with its 'mark' loop. 
+ # + # To cope with this, we will simply implement a semi-conservative GC: we precisely + # scan the roots for pointers into the GC heap. We then recursively mark blocks + # that are pointed to by such pointers as live and conservatively scan them for + # more pointers. + # + # Our mark phase is fairly simple: we maintain a worklist of pointers that + # are live and may need to be processed, as well as a set of blocks that are + # live and have already been processed. + live_blocks = Set{Ptr{FreeListRecord}}() + live_worklist = Ptr{ObjectRef}[] + + # Get a sorted allocation list, which will allow us to classify live pointers quickly. + alloc_list = sort_allocation_list(master_record) + + # Add all roots to the worklist. + for i in 1:(master_record.root_buffer_capacity * master_record.thread_count) + root = unsafe_load(master_record.root_buffers, i) + if root != C_NULL + push!(live_worklist, root) + end + end + + # Now process all live pointers until we reach a fixpoint. + while !isempty(live_worklist) + # Pop a pointer from the worklist. + object_ref = pop!(live_worklist) + # Get the block for that pointer. + record = get_record(alloc_list, object_ref) + # Make sure that we haven't visited the block yet. + if record != C_NULL && !(record in live_blocks) + # Mark the block as live. + push!(live_blocks, record) + # Add all pointer-sized, aligned values to the live pointer worklist. + for ptr in data_pointer(record):sizeof(ObjectRef):data_end_pointer(record) - 1 + value = unsafe_load(Base.unsafe_convert(Ptr{ObjectRef}, ptr)) + push!(live_worklist, value) + end + end + end + + # We're done with the mark phase! Time to proceed to the sweep phase. + # The first thing we'll do is iterate through every arena's allocation list and + # free dead blocks. Next, we will compact and reorder free lists to combat + # fragmentation. + iterate_arenas(master_record) do arena + # Free garbage blocks. + gc_free_garbage(arena, live_blocks) + + # Compact the arena. + free_memory = gc_compact(arena) + + # If the amount of free memory in the arena is below the starvation + # limit then we'll expand the GC heap and add the additional memory + # to the arena's free list. + threshold = if arena == master_record.global_arena + config.global_arena_starvation_threshold + else + config.local_arena_starvation_threshold + end + + if free_memory < threshold + region = expand!(heap, threshold) + gc_expand(arena, region) + + if arena == master_record.global_arena + report.extra_global_memory += Csize_t(threshold) + else + report.extra_local_memory += Csize_t(threshold) + end + end + end + end + report.collection_count += 1 + report.collection_time += collection_time + report.collection_poll_time += poll_time +end + +# Examines a keyword argument list and gets either the value +# assigned to a key or a default value. +function get_kwarg_or_default(kwarg_list, key::Symbol, default) + for kwarg in kwarg_list + arg_key, val = kwarg.args + if arg_key == key + return val + end + end + return default +end diff --git a/src/interrupts.jl b/src/interrupts.jl new file mode 100644 index 00000000..5251af4d --- /dev/null +++ b/src/interrupts.jl @@ -0,0 +1,252 @@ +# This file implements a high-level generic device-to-host interrupt +# mechanism. This file also contains non-trivial support infrastructure +# that should either be moved to CUDAdrv or exposed by CUDAnative. +# Note that this support infrastructure is not exported, so it remains +# an implementation detail as opposed to a part of CUDAnative's public +# API. 
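+#
+# Typical usage, as a hedged sketch (`my_kernel` and `my_handler` are
+# hypothetical names; the handler argument and keyword syntax match the
+# `@cuda_interruptible` macro defined below):
+#
+#     my_handler() = println("interrupt handled on the host")
+#
+#     function my_kernel()
+#         interrupt()   # ask the host to run the handler and wait for it
+#         return
+#     end
+#
+#     @cuda_interruptible my_handler threads=32 my_kernel()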
+ +import CUDAdrv: @apicall + +export @cuda_interruptible, interrupt, interrupt_or_wait, wait_for_interrupt + +# Queries a stream for its status. +function query_stream(stream::CUDAdrv.CuStream = CuDefaultStream())::Cint + return ccall( + (:cuStreamQuery, CUDAdrv.libcuda), + Cint, + (CUDAdrv.CuStream_t,), + stream) +end + +# Gets a pointer to a global with a particular name. If the global +# does not exist yet, then it is declared in the global memory address +# space. +@generated function get_global_pointer(::Val{global_name}, ::Type{T})::Ptr{T} where {global_name, T} + T_global = convert(LLVMType, T) + T_result = convert(LLVMType, Ptr{T}) + + # Create a thunk that computes a pointer to the global. + llvm_f, _ = create_function(T_result) + mod = LLVM.parent(llvm_f) + + # Figure out if the global has been defined already. + global_set = LLVM.globals(mod) + global_name_string = String(global_name) + if haskey(global_set, global_name_string) + global_var = global_set[global_name_string] + else + # If the global hasn't been defined already, then we'll define + # it in the global address space, i.e., address space one. + global_var = GlobalVariable(mod, T_global, global_name_string, 1) + linkage!(global_var, LLVM.API.LLVMLinkOnceAnyLinkage) + initializer!(global_var, LLVM.null(T_global)) + end + + # Generate IR that computes the global's address. + Builder(JuliaContext()) do builder + entry = BasicBlock(llvm_f, "entry", JuliaContext()) + position!(builder, entry) + + # Cast the global variable's type to the result type. + result = ptrtoint!(builder, global_var, T_result) + ret!(builder, result) + end + + # Call the function. + call_function(llvm_f, Ptr{T}) +end + +macro cuda_global_ptr(name, type) + return :(get_global_pointer( + $(Val(Symbol(name))), + $(esc(type)))) +end + +# Gets a pointer to the interrupt region. +@inline function get_interrupt_pointer()::Ptr{UInt32} + # Compute a pointer to the global in which a pointer to the + # interrupt state is stored. + ptr = @cuda_global_ptr("interrupt_pointer", Ptr{UInt32}) + # state the pointer, netting us a pointer to the interrupt + # region. + return Base.unsafe_load(ptr) +end + +# The interrupt state is a 32-bit unsigned integer that +# can have one of the following values: +# +# * 0: host is ready to process an interrupt, no interrupt +# is currently being processed. +# * 1: device has requested an interrupt, the interrupt +# has not completed processing yet. +# +const ready = UInt32(0) +const processing = UInt32(1) + +""" + interrupt_or_wait() + +Requests an interrupt and waits until the interrupt completes. +If an interrupt is already running, then this function waits +for that interrupt to complete, but does not request an interrupt +of its own. Returns `true` if an interrupt was successfully +requested by this function; otherwise, `false`. +""" +function interrupt_or_wait()::Bool + state_ptr = get_interrupt_pointer() + prev_state = atomic_compare_exchange!(state_ptr, ready, processing) + wait_for_interrupt() + return prev_state == ready +end + +""" + wait_for_interrupt(fun::Function) + +Waits for the current interrupt to finish, if an interrupt is +currently running. A function is repeatedly executed until the +interrupt finishes. +""" +function wait_for_interrupt(fun::Function) + state_ptr = get_interrupt_pointer() + while volatile_load(state_ptr) == processing + fun() + end +end + +""" + wait_for_interrupt() + +Waits for the current interrupt to finish, if an interrupt is +currently running. 
+""" +function wait_for_interrupt() + wait_for_interrupt() do + end +end + +""" + interrupt() + +Repeatedly requests an interrupt until one is requested successfully. +""" +function interrupt() + while !interrupt_or_wait() + end +end + +# Waits for the current kernel to terminate and handle +# any interrupts that we encounter along the way. +function handle_interrupts(handler::Function, state::Ptr{UInt32}, stream::CuStream = CuDefaultStream()) + while true + # Sleep to save processing power. + sleep(0.001) + + # Query the CUDA stream. + status = query_stream(stream) + if status == CUDAdrv.SUCCESS.code + # The kernel has finished running. We're done here. + return + elseif status == CUDAdrv.ERROR_NOT_READY.code + # The kernel is still running. Check if an interrupt + # needs handling. + if volatile_load(state) == processing + # Run the handler. + handler() + # Set the interrupt state to 'ready'. + volatile_store!(state, ready) + end + + # Continue querying the stream. + else + # Whoa. Something both unexpected and unpleasant seems + # to have happened. Better throw an exception here. + throw(CuError(status)) + end + end +end + +""" + @cuda_interruptible [kwargs...] func(args...) + +High-level interface for executing code on a GPU with support for interrupts. +The `@cuda_interruptible` macro should prefix a call, with `func` a callable function +or object that should return nothing. It will be compiled to a CUDA function upon first +use, and to a certain extent arguments will be converted and anaged automatically using +`cudaconvert`. Finally, a call to `CUDAdrv.cudacall` is performed, scheduling a kernel +launch on the current CUDA context. + +Several keyword arguments are supported that influence kernel compilation and execution. For +more information, refer to the documentation of respectively [`cufunction`](@ref) and +[`CUDAnative.Kernel`](@ref). +""" +macro cuda_interruptible(handler, ex...) + # destructure the `@cuda_interruptible` expression + if length(ex) > 0 && ex[1].head == :tuple + error("The tuple argument to @cuda has been replaced by keywords: `@cuda_interruptible handler threads=... fun(args...)`") + end + call = ex[end] + kwargs = ex[1:end-1] + + # destructure the kernel call + if call.head != :call + throw(ArgumentError("second argument to @cuda_interruptible should be a function call")) + end + f = call.args[1] + args = call.args[2:end] + + code = quote end + env_kwargs, compiler_kwargs, call_kwargs = CUDAnative.split_kwargs(kwargs) + vars, var_exprs = CUDAnative.assign_args!(code, args) + + # Find the stream on which the kernel is to be scheduled. + stream = CuDefaultStream() + for kwarg in call_kwargs + key, val = kwarg.args + if key == :stream + stream = val + end + end + + # convert the arguments, call the compiler and launch the kernel + # while keeping the original arguments alive + push!(code.args, + quote + GC.@preserve $(vars...) begin + # Define a trivial buffer that contains the interrupt state. + local interrupt_buffer = CUDAdrv.Mem.alloc(CUDAdrv.Mem.HostBuffer, sizeof(ready), CUDAdrv.Mem.HOSTALLOC_DEVICEMAP) + local interrupt_pointer = Base.unsafe_convert(Ptr{UInt32}, interrupt_buffer) + unsafe_store!(interrupt_pointer, ready) + local device_interrupt_pointer = Base.unsafe_convert(CuPtr{UInt32}, interrupt_buffer) + + try + # Define a kernel initialization function that sets the + # interrupt state pointer. 
+ local function interrupt_kernel_init(kernel) + try + global_handle = CuGlobal{CuPtr{UInt32}}(kernel.mod, "interrupt_pointer") + set(global_handle, device_interrupt_pointer) + catch exception + # The interrupt pointer may not have been declared (because it is unused). + # In that case, we should do nothing. + if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end + end + + # Standard kernel setup logic. + local kernel_args = CUDAnative.cudaconvert.(($(var_exprs...),)) + local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} + local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) + CUDAnative.prepare_kernel(kernel; init=interrupt_kernel_init, $(map(esc, env_kwargs)...)) + kernel(kernel_args...; $(map(esc, call_kwargs)...)) + + # Handle interrupts. + handle_interrupts($(esc(handler)), interrupt_pointer, $(esc(stream))) + finally + CUDAdrv.Mem.free(interrupt_buffer) + end + end + end) + return code +end diff --git a/test/device/gc.jl b/test/device/gc.jl new file mode 100644 index 00000000..640d5ebf --- /dev/null +++ b/test/device/gc.jl @@ -0,0 +1,70 @@ +@testset "gc" begin + +############################################################################################ + +dummy() = return + +dummy_handler(kernel) = return + +@testset "@cuda gc=true" begin + +@testset "allocate and collect" begin + # This test allocates many very small and very large objects. Both the small + # and large objects become garbage eventually, but small objects need to + # outlive the large objects (and not be collected erroneously) for the test + # to pass. So essentially this test tackles three things: + # + # 1. Allocation works. + # 2. Collection works. + # 3. Collection isn't gung-ho to the point of incorrectness. + # + + mutable struct TempStruct + data::Float32 + end + + @noinline function escape(val) + Base.pointer_from_objref(val) + end + + # Define a kernel that copies values using a temporary struct. + function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + for j in 1:2 + # Allocate a mutable struct and make sure it ends up on the GC heap. + temp = TempStruct(unsafe_load(a, i)) + escape(temp) + + # Allocate a large garbage buffer to force collections. + gc_malloc(Csize_t(256 * 1024)) + + # Use the mutable struct. If its memory has been reclaimed (by accident) + # then we expect the test at the end of this file to fail. + unsafe_store!(b, temp.data, i) + end + + return + end + + thread_count = 64 + + # Allocate two arrays. + source_array = Mem.alloc(Float32, thread_count) + destination_array = Mem.alloc(Float32, thread_count) + source_pointer = Base.unsafe_convert(CuPtr{Float32}, source_array) + destination_pointer = Base.unsafe_convert(CuPtr{Float32}, destination_array) + + # Fill the source and destination arrays. + Mem.upload!(source_array, fill(42.f0, thread_count)) + Mem.upload!(destination_array, zeros(Float32, thread_count)) + + # Run the kernel. 
+ @cuda gc=true threads=thread_count kernel(source_pointer, destination_pointer) + + @test Mem.download(Float32, destination_array, thread_count) == fill(42.f0, thread_count) +end + +end + +end diff --git a/test/device/interrupts.jl b/test/device/interrupts.jl new file mode 100644 index 00000000..a07a57e8 --- /dev/null +++ b/test/device/interrupts.jl @@ -0,0 +1,87 @@ +@testset "interrupts" begin + +############################################################################################ + +dummy() = return + +dummy_handler(kernel) = return + +@testset "@cuda_interruptible" begin + +@test_throws UndefVarError @cuda_interruptible dummy_handler undefined() +@test_throws MethodError @cuda_interruptible dummy_handler dummy(1) + +@testset "compilation params" begin + @cuda_interruptible dummy_handler dummy() + + @test_throws CuError @cuda_interruptible dummy_handler threads=2 maxthreads=1 dummy() + @cuda_interruptible dummy_handler threads=2 dummy() +end + +@testset "count" begin + # This test uses interrupts to increment a host counter and then + # checks that the counter's value equals the number of interrupts. + # This is a useful thing to check because it verifies that interrupts + # are neither skipped nor performed twice. + # + # We will use a sizeable number of threads (128) to give us a better + # shot at detecting concurrency errors, if any. The number of skipped + # interrupts is unlikely to equal the number of additional, unwanted + # interrupts for this many threads. + thread_count = 128 + + # Define a kernel that makes the host count. + function increment_counter() + interrupt() + return + end + + # Configure the interrupt to increment a counter. + global counter = 0 + function handle_interrupt() + global counter + counter += 1 + end + + # Run the kernel. + @cuda_interruptible handle_interrupt threads=thread_count increment_counter() + + # Check that the counter's final value equals the number + # of threads. + @test counter == thread_count +end + +@testset "count in stream" begin + # This test is a copy of the previous test, but it uses a non-default + # CUDA stream. This should Just Work: @cuda_interruptible should + # intercept the `stream=...` argument and pass it to the stream-querying + # logic. All of this should be entirely transparent to the user. + thread_count = 128 + + # Define a kernel that makes the host count. + function increment_counter() + interrupt() + return + end + + # Configure the interrupt to increment a counter. + global counter = 0 + function handle_interrupt() + global counter + counter += 1 + end + + # Define a CUDA stream. + exec_stream = CuStream() + + # Run the kernel. + @cuda_interruptible handle_interrupt threads=thread_count stream=exec_stream increment_counter() + + # Check that the counter's final value equals the number + # of threads. + @test counter == thread_count +end + +end + +end diff --git a/test/device/threading.jl b/test/device/threading.jl new file mode 100644 index 00000000..fa9533b1 --- /dev/null +++ b/test/device/threading.jl @@ -0,0 +1,91 @@ +@testset "threading" begin + +############################################################################################ + +@testset "reader-writer lock" begin + +@testset "writers only" begin + + thread_count = 128 + + # Define a kernel that atomically increments a counter using a lock. 
+ function increment_counter(counter::CUDAnative.DevicePtr{Int32}, lock_state::CUDAnative.DevicePtr{CUDAnative.ReaderWriterLockState}) + lock = ReaderWriterLock(lock_state) + writer_locked(lock) do + unsafe_store!(counter, unsafe_load(counter) + 1) + end + return + end + + # Allocate memory for the counter and the lock. + counter_buf = Mem.alloc(sizeof(Int32)) + Mem.upload!(counter_buf, [Int32(0)]) + counter_pointer = Base.unsafe_convert(CuPtr{Int32}, counter_buf) + + lock_buf = Mem.alloc(sizeof(CUDAnative.ReaderWriterLockState)) + Mem.upload!(lock_buf, [CUDAnative.ReaderWriterLockState(0)]) + lock_pointer = Base.unsafe_convert(CuPtr{CUDAnative.ReaderWriterLockState}, lock_buf) + + # Run the kernel. + @cuda threads=thread_count increment_counter(counter_pointer, lock_pointer) + + # Check that the counter's final value equals the number + # of threads. + @test Mem.download(Int32, counter_buf) == [Int32(thread_count)] + +end + +@testset "readers and writers" begin + + thread_count = 128 + + # Define a kernel. + function mutate_counter_maybe(counter::CUDAnative.DevicePtr{Int32}, lock_state::CUDAnative.DevicePtr{CUDAnative.ReaderWriterLockState}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + lock = ReaderWriterLock(lock_state) + # Read the previous counter and update the current counter. + # Do this many times. + if i % 16 == 0 + # Some threads get to atomically increment the counter. + writer_locked(lock) do + unsafe_store!(counter, unsafe_load(counter) + 1) + end + else + # All the other threads acquire the lock in reader mode + # and check that the counter's value doesn't change. + reader_locked(lock) do + counter_ptr = convert(Ptr{Int32}, convert(Csize_t, counter)) + counter_val = CUDAnative.volatile_load(counter_ptr) + j = 0 + while j < 10 + if CUDAnative.volatile_load(counter_ptr) != counter_val + throw(ErrorException("oh no")) + end + j += 1 + end + end + end + return + end + + # Allocate memory for the counter and the lock. + counter_buf = Mem.alloc(sizeof(Int32)) + Mem.upload!(counter_buf, [Int32(0)]) + counter_pointer = Base.unsafe_convert(CuPtr{Int32}, counter_buf) + + lock_buf = Mem.alloc(sizeof(CUDAnative.ReaderWriterLockState)) + Mem.upload!(lock_buf, [CUDAnative.ReaderWriterLockState(0)]) + lock_pointer = Base.unsafe_convert(CuPtr{CUDAnative.ReaderWriterLockState}, lock_buf) + + # Run the kernel. + @cuda threads=thread_count mutate_counter_maybe(counter_pointer, lock_pointer) + + # Check that the counter's final value equals the number + # of threads. + @test Mem.download(Int32, counter_buf) == [Int32(thread_count / 16)] + +end + +end + +end diff --git a/test/runtests.jl b/test/runtests.jl index f4346620..d124cef9 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -68,9 +68,12 @@ else else include("device/codegen.jl") include("device/execution.jl") + include("device/interrupts.jl") include("device/pointer.jl") include("device/array.jl") include("device/cuda.jl") + include("device/threading.jl") + include("device/gc.jl") include("examples.jl") end