From 5739881e2ca70811fd299978262e6accbad38650 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 21 Feb 2019 21:23:42 +0100 Subject: [PATCH 001/146] Implement a lowering for the intrinsics generated by 'LateLowerGCFrame' --- src/compiler/optim.jl | 106 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index b9ddf32a..305df00d 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -357,6 +357,112 @@ function lower_gc_frame!(fun::LLVM.Function) return changed end +# Visits all calls to a particular intrinsic in a given LLVM module. +function visit_intrinsic(visit_call::Function, name::AbstractString, mod::LLVM.Module) + if haskey(functions(mod), name) + func = functions(mod)[name] + + for use in uses(func) + call = user(use)::LLVM.CallInst + visit_call(call) + end + end +end + +# Lowers the GC intrinsics produce by the LateLowerGCFrame pass. These +# intrinsics are the last point at which we can intervene in the pipeline +# before the passes that deal with them become CPU-specific. +function lower_final_gc_intrinsics!(mod::LLVM.Module) + ctx = global_ctx::CompilerContext + changed = false + + # We'll start off with 'julia.gc_alloc_bytes'. This intrinsic allocates + # store for an object, including headroom, but does not set the object's + # tag. + visit_intrinsic("julia.gc_alloc_bytes", mod) do call + # Decode the call. + ops = collect(operands(call)) + sz = ops[2] + + # We need to reserve a single pointer of headroom for the tag. + # (LateLowerGCFrame depends on us doing that.) + headroom = Runtime.tag_size + + # Call the allocation function and bump the resulting pointer + # so the headroom sits just in front of the returned pointer. 
+ let builder = Builder(JuliaContext()) + position!(builder, call) + ptr = call!(builder, Runtime.get(:gc_pool_alloc), [ConstantInt(Int32(headroom) + sz, JuliaContext())]) + bumped_ptr = gep!(builder, ptr, [ConstantInt(1, JuliaContext())]) + replace_uses!(call, bumped_ptr) + dispose(builder) + end + + changed = true + end + + # Next up: 'julia.new_gc_frame'. This intrinsic allocates a new GC frame. + # We'll lower it as an alloca and hope SSA construction and DCE passes + # get rid of the alloca. This is a reasonable thing to hope for because + # all intrinsics that may cause the GC frame to escape will be replaced by + # nops. + visit_intrinsic("julia.new_gc_frame", mod) do call + new_gc_frame = functions(mod)["julia.new_gc_frame"] + + # Decode the call. + ops = collect(operands(call)) + sz = ops[1] + + # Call the allocation function and bump the resulting pointer + # so the headroom sits just in front of the returned pointer. + let builder = Builder(JuliaContext()) + position!(builder, call) + ptr = array_alloca!(builder, eltype(return_type(new_gc_frame)), [sz]) + replace_uses!(call, ptr) + dispose(builder) + end + + changed = true + end + + # The 'julia.get_gc_frame_slot' is closely related to the previous + # intrinisc. Specifically, 'julia.get_gc_frame_slot' gets the address of + # a slot in the GC frame. We can simply turn this intrinsic into a GEP. + visit_intrinsic("julia.get_gc_frame_slot", mod) do call + # Decode the call. + ops = collect(operands(call)) + frame = ops[1] + offset = ops[2] + + # Call the allocation function and bump the resulting pointer + # so the headroom sits just in front of the returned pointer. + let builder = Builder(JuliaContext()) + position!(builder, call) + ptr = gep!(builder, frame, [offset]) + replace_uses!(call, ptr) + dispose(builder) + end + + changed = true + end + + # The 'julia.push_gc_frame' registers a GC frame with the GC. We + # don't have a GC, so we can just delete calls to this intrinsic! 
+ visit_intrinsic("julia.push_gc_frame", mod) do call + unsafe_delete!(LLVM.parent(call), call) + changed = true + end + + # The 'julia.pop_gc_frame' unregisters a GC frame with the GC, so + # we can just delete calls to this intrinsic, too. + visit_intrinsic("julia.pop_gc_frame", mod) do call + unsafe_delete!(LLVM.parent(call), call) + changed = true + end + + return changed +end + # lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible. # # this assumes and checks that the TLS is unused, which should be the case for most GPU code From 61b9f94529acee4a091b75139f41b998ada9d9fd Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 22 Feb 2019 11:47:09 +0100 Subject: [PATCH 002/146] Also lower 'julia.queue_gc_root' --- src/compiler/optim.jl | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 305df00d..91069ba3 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -358,7 +358,7 @@ function lower_gc_frame!(fun::LLVM.Function) end # Visits all calls to a particular intrinsic in a given LLVM module. -function visit_intrinsic(visit_call::Function, name::AbstractString, mod::LLVM.Module) +function visit_calls_to(visit_call::Function, name::AbstractString, mod::LLVM.Module) if haskey(functions(mod), name) func = functions(mod)[name] @@ -369,6 +369,17 @@ function visit_intrinsic(visit_call::Function, name::AbstractString, mod::LLVM.M end end +# Deletes all calls to a particular intrinsic in a given LLVM module. +# Returns a Boolean that tells if any calls were actually deleted. +function delete_calls_to!(name::AbstractString, mod::LLVM.Module)::Bool + changed = false + visit_calls_to(name, mod) do call + unsafe_delete!(LLVM.parent(call), call) + changed = true + end + return changed +end + # Lowers the GC intrinsics produce by the LateLowerGCFrame pass. 
These # intrinsics are the last point at which we can intervene in the pipeline # before the passes that deal with them become CPU-specific. @@ -379,7 +390,7 @@ function lower_final_gc_intrinsics!(mod::LLVM.Module) # We'll start off with 'julia.gc_alloc_bytes'. This intrinsic allocates # store for an object, including headroom, but does not set the object's # tag. - visit_intrinsic("julia.gc_alloc_bytes", mod) do call + visit_calls_to("julia.gc_alloc_bytes", mod) do call # Decode the call. ops = collect(operands(call)) sz = ops[2] @@ -406,7 +417,7 @@ function lower_final_gc_intrinsics!(mod::LLVM.Module) # get rid of the alloca. This is a reasonable thing to hope for because # all intrinsics that may cause the GC frame to escape will be replaced by # nops. - visit_intrinsic("julia.new_gc_frame", mod) do call + visit_calls_to("julia.new_gc_frame", mod) do call new_gc_frame = functions(mod)["julia.new_gc_frame"] # Decode the call. @@ -428,7 +439,7 @@ function lower_final_gc_intrinsics!(mod::LLVM.Module) # The 'julia.get_gc_frame_slot' is closely related to the previous # intrinisc. Specifically, 'julia.get_gc_frame_slot' gets the address of # a slot in the GC frame. We can simply turn this intrinsic into a GEP. - visit_intrinsic("julia.get_gc_frame_slot", mod) do call + visit_calls_to("julia.get_gc_frame_slot", mod) do call # Decode the call. ops = collect(operands(call)) frame = ops[1] @@ -448,17 +459,14 @@ function lower_final_gc_intrinsics!(mod::LLVM.Module) # The 'julia.push_gc_frame' registers a GC frame with the GC. We # don't have a GC, so we can just delete calls to this intrinsic! - visit_intrinsic("julia.push_gc_frame", mod) do call - unsafe_delete!(LLVM.parent(call), call) - changed = true - end + changed |= delete_calls_to!("julia.push_gc_frame", mod) # The 'julia.pop_gc_frame' unregisters a GC frame with the GC, so # we can just delete calls to this intrinsic, too. 
- visit_intrinsic("julia.pop_gc_frame", mod) do call - unsafe_delete!(LLVM.parent(call), call) - changed = true - end + changed |= delete_calls_to!("julia.pop_gc_frame", mod) + + # Ditto for 'julia.queue_gc_root'. + changed |= delete_calls_to!("julia.queue_gc_root", mod) return changed end From a921f3a139da947c2f60e58e0444278e91facf00 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 22 Feb 2019 12:55:40 +0100 Subject: [PATCH 003/146] Fix correctness bugs in the new GC lowering pass --- src/compiler/optim.jl | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 91069ba3..647e83c4 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -364,7 +364,7 @@ function visit_calls_to(visit_call::Function, name::AbstractString, mod::LLVM.Mo for use in uses(func) call = user(use)::LLVM.CallInst - visit_call(call) + visit_call(call, func) end end end @@ -373,7 +373,7 @@ end # Returns a Boolean that tells if any calls were actually deleted. function delete_calls_to!(name::AbstractString, mod::LLVM.Module)::Bool changed = false - visit_calls_to(name, mod) do call + visit_calls_to(name, mod) do call, _ unsafe_delete!(LLVM.parent(call), call) changed = true end @@ -390,10 +390,14 @@ function lower_final_gc_intrinsics!(mod::LLVM.Module) # We'll start off with 'julia.gc_alloc_bytes'. This intrinsic allocates # store for an object, including headroom, but does not set the object's # tag. - visit_calls_to("julia.gc_alloc_bytes", mod) do call + visit_calls_to("julia.gc_alloc_bytes", mod) do call, gc_alloc_bytes + gc_alloc_bytes_ft = eltype(llvmtype(gc_alloc_bytes))::LLVM.FunctionType + T_ret = return_type(gc_alloc_bytes_ft)::LLVM.PointerType + T_bitcast = LLVM.PointerType(T_ret, LLVM.addrspace(T_ret)) + # Decode the call. ops = collect(operands(call)) - sz = ops[2] + size = ops[2] # We need to reserve a single pointer of headroom for the tag. 
# (LateLowerGCFrame depends on us doing that.) @@ -403,9 +407,12 @@ function lower_final_gc_intrinsics!(mod::LLVM.Module) # so the headroom sits just in front of the returned pointer. let builder = Builder(JuliaContext()) position!(builder, call) - ptr = call!(builder, Runtime.get(:gc_pool_alloc), [ConstantInt(Int32(headroom) + sz, JuliaContext())]) - bumped_ptr = gep!(builder, ptr, [ConstantInt(1, JuliaContext())]) + total_size = add!(builder, size, ConstantInt(Int32(headroom), JuliaContext())) + ptr = call!(builder, Runtime.get(:gc_pool_alloc), [total_size]) + cast_ptr = bitcast!(builder, ptr, T_bitcast) + bumped_ptr = gep!(builder, cast_ptr, [ConstantInt(Int32(1), JuliaContext())]) replace_uses!(call, bumped_ptr) + unsafe_delete!(LLVM.parent(call), call) dispose(builder) end @@ -417,19 +424,22 @@ function lower_final_gc_intrinsics!(mod::LLVM.Module) # get rid of the alloca. This is a reasonable thing to hope for because # all intrinsics that may cause the GC frame to escape will be replaced by # nops. - visit_calls_to("julia.new_gc_frame", mod) do call - new_gc_frame = functions(mod)["julia.new_gc_frame"] + visit_calls_to("julia.new_gc_frame", mod) do call, new_gc_frame + new_gc_frame_ft = eltype(llvmtype(new_gc_frame))::LLVM.FunctionType + T_ret = return_type(new_gc_frame_ft)::LLVM.PointerType + T_alloca = eltype(T_ret) # Decode the call. ops = collect(operands(call)) - sz = ops[1] + size = ops[1] # Call the allocation function and bump the resulting pointer # so the headroom sits just in front of the returned pointer. let builder = Builder(JuliaContext()) position!(builder, call) - ptr = array_alloca!(builder, eltype(return_type(new_gc_frame)), [sz]) + ptr = array_alloca!(builder, T_alloca, size) replace_uses!(call, ptr) + unsafe_delete!(LLVM.parent(call), call) dispose(builder) end @@ -439,7 +449,7 @@ function lower_final_gc_intrinsics!(mod::LLVM.Module) # The 'julia.get_gc_frame_slot' is closely related to the previous # intrinisc. 
Specifically, 'julia.get_gc_frame_slot' gets the address of # a slot in the GC frame. We can simply turn this intrinsic into a GEP. - visit_calls_to("julia.get_gc_frame_slot", mod) do call + visit_calls_to("julia.get_gc_frame_slot", mod) do call, _ # Decode the call. ops = collect(operands(call)) frame = ops[1] @@ -451,6 +461,7 @@ function lower_final_gc_intrinsics!(mod::LLVM.Module) position!(builder, call) ptr = gep!(builder, frame, [offset]) replace_uses!(call, ptr) + unsafe_delete!(LLVM.parent(call), call) dispose(builder) end From 80af54b760c427cf716adad65aefb22f79bd194a Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 27 Feb 2019 11:04:45 +0100 Subject: [PATCH 004/146] Use the new GC intrinsic lowering Note: these changes depend on the 'configurable-lowering-2' branch of my fork of the julia repo (jonathanvdc/julia). The lowering scheme won't work unless that version of Julia is used. --- src/compiler/optim.jl | 60 +++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 647e83c4..55b16ca6 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -27,7 +27,7 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) ModulePassManager() do pm initialize!(pm) - add!(pm, FunctionPass("LowerGCFrame", lower_gc_frame!)) + add!(pm, FunctionPass("LowerGCFrame", eager_lower_gc_frame!)) aggressive_dce!(pm) # remove dead uses of ptls add!(pm, ModulePass("LowerPTLS", lower_ptls!)) run!(pm, mod) @@ -45,15 +45,13 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) initialize!(pm) ccall(:jl_add_optimization_passes, Cvoid, (LLVM.API.LLVMPassManagerRef, Cint, Cint), - LLVM.ref(pm), Base.JLOptions().opt_level, #=lower_intrinsics=# 0) + LLVM.ref(pm), Base.JLOptions().opt_level, #=lower_intrinsics=# 1) run!(pm, mod) end ModulePassManager() do pm initialize!(pm) - - # lower intrinsics - add!(pm, 
FunctionPass("LowerGCFrame", lower_gc_frame!)) + add!(pm, ModulePass("FinalLowerGCGPU", lower_final_gc_intrinsics!)) aggressive_dce!(pm) # remove dead uses of ptls add!(pm, ModulePass("LowerPTLS", lower_ptls!)) @@ -298,6 +296,30 @@ function fixup_metadata!(f::LLVM.Function) end end +# Visits all calls to a particular intrinsic in a given LLVM module. +function visit_calls_to(visit_call::Function, name::AbstractString, mod::LLVM.Module) + if haskey(functions(mod), name) + func = functions(mod)[name] + + for use in uses(func) + call = user(use)::LLVM.CallInst + visit_call(call, func) + end + end +end + +# Deletes all calls to a particular intrinsic in a given LLVM module. +# Returns a Boolean that tells if any calls were actually deleted. +function delete_calls_to!(name::AbstractString, mod::LLVM.Module)::Bool + changed = false + visit_calls_to(name, mod) do call, _ + unsafe_delete!(LLVM.parent(call), call) + changed = true + end + return changed +end + + # lower object allocations to to PTX malloc # # this is a PoC implementation that is very simple: allocate, and never free. it also runs @@ -306,7 +328,7 @@ end # is currently very architecture/CPU specific: hard-coded pool sizes, TLS references, etc. # such IR is hard to clean-up, so we probably will need to have the GC lowering pass emit # lower-level intrinsics which then can be lowered to architecture-specific code. -function lower_gc_frame!(fun::LLVM.Function) +function eager_lower_gc_frame!(fun::LLVM.Function) job = current_job::CompilerJob mod = LLVM.parent(fun) changed = false @@ -357,34 +379,10 @@ function lower_gc_frame!(fun::LLVM.Function) return changed end -# Visits all calls to a particular intrinsic in a given LLVM module. 
-function visit_calls_to(visit_call::Function, name::AbstractString, mod::LLVM.Module) - if haskey(functions(mod), name) - func = functions(mod)[name] - - for use in uses(func) - call = user(use)::LLVM.CallInst - visit_call(call, func) - end - end -end - -# Deletes all calls to a particular intrinsic in a given LLVM module. -# Returns a Boolean that tells if any calls were actually deleted. -function delete_calls_to!(name::AbstractString, mod::LLVM.Module)::Bool - changed = false - visit_calls_to(name, mod) do call, _ - unsafe_delete!(LLVM.parent(call), call) - changed = true - end - return changed -end - -# Lowers the GC intrinsics produce by the LateLowerGCFrame pass. These +# Lowers the GC intrinsics produced by the LateLowerGCFrame pass. These # intrinsics are the last point at which we can intervene in the pipeline # before the passes that deal with them become CPU-specific. function lower_final_gc_intrinsics!(mod::LLVM.Module) - ctx = global_ctx::CompilerContext changed = false # We'll start off with 'julia.gc_alloc_bytes'. This intrinsic allocates From f177a271a3e79e0823e6c4257ee3f3e1b27f0d6e Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 28 Feb 2019 15:33:15 +0100 Subject: [PATCH 005/146] Add a simple unified memory example --- examples/shared-memory.jl | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 examples/shared-memory.jl diff --git a/examples/shared-memory.jl b/examples/shared-memory.jl new file mode 100644 index 00000000..e0fede72 --- /dev/null +++ b/examples/shared-memory.jl @@ -0,0 +1,31 @@ +using CUDAdrv, CUDAnative, CuArrays + +using Test + +# Allocates an array of host memory that is page-locked and accessible +# to the device. Maps the allocation into the CUDA address space. +# Returns a (host array, CuArray) pair. The former can be used by +# the host to access the array, the latter can be used by the device. 
+function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} + # Allocate memory that is accessible to both the host and the device. + device_buffer = Mem.alloc(prod(dims) * sizeof(T), true) + + # Wrap the memory in an array for the host. + host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(device_buffer.ptr), dims; own = false) + + # Initialize the array's contents. + fill!(host_array, init) + + return host_array, CuArray{T, N}(device_buffer, dims; own = false) +end + +# Allocate a shared array. +dims = (2,4) +host_array, device_array = alloc_shared_array(dims, Int32(42)) + +# Write some values to the array. +host_array[1, 2] = 10 +host_array[2, 1] = 0 + +# Check that the host's version of the array is the same as the device's. +@test host_array == Array(device_array) From 5fd8a0a83875434321e2b1c73877c406ceecd79d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 28 Feb 2019 16:14:34 +0100 Subject: [PATCH 006/146] Add a host-to-device communication example --- examples/host-comm.jl | 77 +++++++++++++++++++++++++++++++++++++++ examples/shared-memory.jl | 11 +++++- 2 files changed, 86 insertions(+), 2 deletions(-) create mode 100644 examples/host-comm.jl diff --git a/examples/host-comm.jl b/examples/host-comm.jl new file mode 100644 index 00000000..2467b680 --- /dev/null +++ b/examples/host-comm.jl @@ -0,0 +1,77 @@ +using CUDAdrv, CUDAnative, CuArrays +import CUDAdrv: @apicall +using Test + +# Allocates an array of host memory that is page-locked and accessible +# to the device. Maps the allocation into the CUDA address space. +# Returns a (host array, CuArray) pair. The former can be used by +# the host to access the array, the latter can be used by the device. +function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} + # Allocate memory that is accessible to both the host and the device. 
+ bytesize = prod(dims) * sizeof(T) + ptr_ref = Ref{Ptr{Cvoid}}() + @apicall( + :cuMemAllocHost, + (Ptr{Ptr{Cvoid}}, Csize_t), + ptr_ref, bytesize) + device_buffer = CUDAdrv.Mem.Buffer(ptr_ref[], bytesize, CuCurrentContext()) + + # Wrap the memory in an array for the host. + host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(ptr_ref[]), dims; own = false) + + # Initialize the array's contents. + fill!(host_array, init) + + return host_array, CuArray{T, N}(device_buffer, dims; own = false) +end + +# This example shows that devices can communicate with the host +# and vice-versa *during* the execution of a kernel. +# +# What happens is, in chronological order: +# +# 1. A buffer is zero-initialized by the host. +# 2. A kernel is started on the device; said kernel +# waits for the buffer to become nonzero. +# 3. The host makes the buffer nonzero. +# 4. The kernel exists once the buffer is nonzero. +# + +function spin(a) + i = threadIdx().x + blockDim().x * (blockIdx().x-1) + # Make sure that 'a[i]' is actually zero when we get started. + if a[i] != 0.f0 + return + end + + # We wait for the host to set 'a[i]' to a nonzero value. + while true + ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ()) + if a[i] != 0.f0 + break + end + end + # Next, we set 'a[i]' to some magic value. + a[i] = 42.f0 + return +end + +# Allocate a shared array. +dims = (3,4) +host_array, device_array = alloc_shared_array(dims, 0.f0) + +# Launch the kernel. +@cuda threads=prod(dims) spin(device_array) + +# Go to sleep for a few milliseconds, to make sure +# that the kernel will have started already. +sleep(0.2) + +# Fill the array with ones now to unblock the kernel. +fill!(host_array, 1.f0) + +# Wait for the kernel to exit. +synchronize() + +# Check that the array has been set to the magic value. 
+@test host_array == fill(42.f0, dims) diff --git a/examples/shared-memory.jl b/examples/shared-memory.jl index e0fede72..9b946f73 100644 --- a/examples/shared-memory.jl +++ b/examples/shared-memory.jl @@ -1,4 +1,5 @@ using CUDAdrv, CUDAnative, CuArrays +import CUDAdrv: @apicall using Test @@ -8,10 +9,16 @@ using Test # the host to access the array, the latter can be used by the device. function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} # Allocate memory that is accessible to both the host and the device. - device_buffer = Mem.alloc(prod(dims) * sizeof(T), true) + bytesize = prod(dims) * sizeof(T) + ptr_ref = Ref{Ptr{Cvoid}}() + @apicall( + :cuMemAllocHost, + (Ptr{Ptr{Cvoid}}, Csize_t), + ptr_ref, bytesize) + device_buffer = CUDAdrv.Mem.Buffer(ptr_ref[], bytesize, CuCurrentContext()) # Wrap the memory in an array for the host. - host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(device_buffer.ptr), dims; own = false) + host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(ptr_ref[]), dims; own = false) # Initialize the array's contents. fill!(host_array, init) From 1c250c74e23603a568f60c4303daa736587fdad0 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 28 Feb 2019 17:32:50 +0100 Subject: [PATCH 007/146] Fix an outdated comment --- examples/host-comm.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/host-comm.jl b/examples/host-comm.jl index 2467b680..0f33e550 100644 --- a/examples/host-comm.jl +++ b/examples/host-comm.jl @@ -34,7 +34,8 @@ end # 2. A kernel is started on the device; said kernel # waits for the buffer to become nonzero. # 3. The host makes the buffer nonzero. -# 4. The kernel exists once the buffer is nonzero. +# 4. The kernel sets the buffer to a magic value and exits +# once the buffer is nonzero. 
# function spin(a) From f8e6c4b8c266de29e40a8c2fc2e56a7110d54efc Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 1 Mar 2019 18:03:08 +0100 Subject: [PATCH 008/146] Add a kwarg to '@cuda' that serves as a hook for kernel setup The 'init' kwarg to '@cuda' allows users to define custom kernel initialization logic, which is run just prior to the kernel. The main use case for this kwarg right now is setting up globals. --- src/execution.jl | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index aea26da4..10f9faa9 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -8,7 +8,7 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nearest_warpsize # split keyword arguments to `@cuda` into ones affecting the macro itself, the compiler and # the code it generates, or the execution function split_kwargs(kwargs) - macro_kws = [:dynamic] + macro_kws = [:dynamic, :init] compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs] call_kws = [:cooperative, :blocks, :threads, :shmem, :stream] macro_kwargs = [] @@ -137,13 +137,14 @@ macro cuda(ex...) # handle keyword arguments that influence the macro's behavior dynamic = false + env_kwargs = [] for kwarg in macro_kwargs key,val = kwarg.args if key == :dynamic isa(val, Bool) || throw(ArgumentError("`dynamic` keyword argument to @cuda should be a constant value")) dynamic = val::Bool else - throw(ArgumentError("Unsupported keyword argument '$key'")) + push!(env_kwargs, kwarg) end end @@ -159,6 +160,7 @@ macro cuda(ex...) # we're in kernel land already, so no need to cudaconvert arguments local kernel_tt = Tuple{$((:(Core.Typeof($var)) for var in var_exprs)...)} local kernel = dynamic_cufunction($(esc(f)), kernel_tt) + prepare_kernel(kernel; $(map(esc, env_kwargs)...)) kernel($(var_exprs...); $(map(esc, call_kwargs)...)) end) else @@ -173,6 +175,7 @@ macro cuda(ex...) 
local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} local kernel = cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) + prepare_kernel(kernel; $(map(esc, env_kwargs)...)) kernel(kernel_args...; $(map(esc, call_kwargs)...)) end end) @@ -436,9 +439,25 @@ end return ex end +""" + prepare_kernel(kernel::Kernel{F,TT}; init::Function=nop_init_kernel) + +Prepares a kernel for execution by setting up an environment for that kernel. +This function should be invoked just prior to running the kernel. Its +functionality is included in [`@cuda`](@ref). + +The 'init' keyword argument is a function that takes a kernel as argument and +sets up an environment for the kernel. +""" +function prepare_kernel(kernel::Kernel{F,TT}; init::Function=nop_init_kernel) where {F,TT} + # Just call the 'init' function for now. + init(kernel) +end ## device-side API +# There doesn't seem to be a way to access the documentation for the call-syntax, +# so attach it to the type """ dynamic_cufunction(f, tt=Tuple{}) @@ -493,3 +512,8 @@ function nearest_warpsize(dev::CuDevice, threads::Integer) ws = CUDAdrv.warpsize(dev) return threads + (ws - threads % ws) % ws end + +function nop_init_kernel(kernel::Kernel{F,TT}) where {F,TT} + # Do nothing. + return +end \ No newline at end of file From 5426bece9e42aae80f0a3626189d2bb317a5ae5d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 1 Mar 2019 18:13:00 +0100 Subject: [PATCH 009/146] Add an example that initializes a kernel global --- examples/global-data.jl | 77 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 examples/global-data.jl diff --git a/examples/global-data.jl b/examples/global-data.jl new file mode 100644 index 00000000..2939612a --- /dev/null +++ b/examples/global-data.jl @@ -0,0 +1,77 @@ +using CUDAdrv, CUDAnative, LLVM, LLVM.Interop +using Test + +# This example shows that CUDAnative kernels can include global +# data, which may be set by the host. 
+ +# Gets a pointer to a global with a particular name. If the global +# does not exist yet, then it is declared in the global memory address +# space. +@generated function get_global_pointer(::Val{global_name}, ::Type{T}) where {global_name, T} + T_global = convert(LLVMType, T) + T_result = convert(LLVMType, Ptr{T}) + + # Create a thunk that computes a pointer to the global. + llvm_f, _ = create_function(T_result) + mod = LLVM.parent(llvm_f) + + # Figure out if the global has been defined already. + globalSet = LLVM.globals(mod) + global_name_string = String(global_name) + if haskey(globalSet, global_name_string) + global_var = globalSet[global_name_string] + else + # If the global hasn't been defined already, then we'll define + # it in the global address space, i.e., address space one. + global_var = GlobalVariable(mod, T_global, global_name_string, 1) + LLVM.initializer!(global_var, LLVM.null(T_global)) + end + + # Generate IR that computes the global's address. + Builder(JuliaContext()) do builder + entry = BasicBlock(llvm_f, "entry", JuliaContext()) + position!(builder, entry) + + # Cast the global variable's type to the result type. + result = ptrtoint!(builder, global_var, T_result) + ret!(builder, result) + end + + # Call the function. + call_function(llvm_f, Ptr{T}) +end + +macro cuda_global_ptr(name, type) + return :(get_global_pointer( + $(Val(Symbol(name))), + $(esc(type)))) +end + +# Define a kernel that copies the global's value into an array. +function kernel(a::CUDAnative.DevicePtr{Float32}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + + ptr = @cuda_global_ptr("test_global", Float32) + Base.unsafe_store!(a, Base.unsafe_load(ptr), i) + return +end + +magic = 42.f0 + +# Define a kernel initialization function that sets the global +# to the magic value. +function kernel_init(kernel) + global_handle = CuGlobal{Float32}(kernel.mod, "test_global") + set(global_handle, magic) +end + +# Allocate a buffer on the GPU. 
+len = 12 +d_a = Mem.alloc(Float32, len) +ptr = Base.unsafe_convert(CuPtr{Float32}, d_a) + +# Run the kernel. +@cuda threads=len init=kernel_init kernel(ptr) + +# Test that the buffer has indeed been filled with the magic value. +@test Mem.download(Float32, d_a, len) == repeat([magic], len) From 537bfca209dbe9353cdc9bd0ce32a5f68a968b8d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 1 Mar 2019 19:18:03 +0100 Subject: [PATCH 010/146] Include an atomic cmpxchg example --- examples/atomic-exchange.jl | 95 +++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 examples/atomic-exchange.jl diff --git a/examples/atomic-exchange.jl b/examples/atomic-exchange.jl new file mode 100644 index 00000000..f200022d --- /dev/null +++ b/examples/atomic-exchange.jl @@ -0,0 +1,95 @@ +using CUDAdrv, CUDAnative, CUDAatomics, LLVM, LLVM.Interop +using Test + +# This example shows that it is possible to use LLVM's atomic compare +# and exchange instructions from CUDAnative kernels. + +# Gets a pointer to a global with a particular name. If the global +# does not exist yet, then it is declared in the global memory address +# space. +@generated function atomic_compare_exchange!(ptr::TPtr, cmp::T, new::T) where {TPtr,T} + T_ptr = convert(LLVMType, TPtr) + T_val = convert(LLVMType, T) + + # Create a thunk that performs the compare and exchange. + llvm_f, _ = create_function(T_val, [T_ptr, T_val, T_val]) + mod = LLVM.parent(llvm_f) + + # Generate IR for the thunk. + Builder(JuliaContext()) do builder + entry = BasicBlock(llvm_f, "entry", JuliaContext()) + position!(builder, entry) + + # Cast the pointer to an actual pointer. + ptr_val = parameters(llvm_f)[1] + if !isa(ptr_val, LLVM.PointerType) + ptr_val = inttoptr!( + builder, + ptr_val, + LLVM.PointerType(T_val)) + end + + # Perform an atomic compare and exchange. + # TODO: find a way to express the sequential consistency ordering + # that is less brittle than `UInt32(7)`. 
+ seq_cst = UInt32(7) + cmpxchg_val = atomic_cmpxchg!( + builder, + ptr_val, + parameters(llvm_f)[2], + parameters(llvm_f)[3], + seq_cst, + seq_cst, + false) + + result = extract_value!(builder, cmpxchg_val, 0) + ret!(builder, result) + end + + # Call the function. + call_function(llvm_f, T, Tuple{TPtr, T, T}, :((ptr, cmp, new))) +end + +# A store that is implemented using an atomic compare and exchange. +# This is overkill as a store implementation, but it shows that +# atomic compare and exchange works. +function wacky_store!(ptr::CUDAnative.DevicePtr{T}, val::T, index::Integer) where T + atomic_compare_exchange!( + ptr + (index - 1) * sizeof(T), + unsafe_load(ptr, index), + val) +end + +# A kernel that swaps the contents of two buffers using atomic compare +# and exchange instructions. +function vswap(a::CUDAnative.DevicePtr{UInt32}, b::CUDAnative.DevicePtr{UInt32}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + a_val = unsafe_load(a, i) + b_val = unsafe_load(b, i) + wacky_store!(b, a_val, i) + wacky_store!(a, b_val, i) + return +end + +# Decide on buffer dimensions. +dims = (12,) +len = prod(dims) + +# Fill two buffers with random garbage. +a = UInt32.(round.(rand(Float32, dims) * 100)) +b = UInt32.(round.(rand(Float32, dims) * 100)) + +# Allocate buffers on the GPU. +d_a = Mem.alloc(UInt32, len) +Mem.upload!(d_a, a) +a_ptr = Base.unsafe_convert(CuPtr{UInt32}, d_a) +d_b = Mem.alloc(UInt32, len) +Mem.upload!(d_b, b) +b_ptr = Base.unsafe_convert(CuPtr{UInt32}, d_b) + +# Run the kernel. +@cuda threads=len vswap(a_ptr, b_ptr) + +# Test that the buffers have indeed been swapped. 
+@test Mem.download(UInt32, d_a, len) == b +@test Mem.download(UInt32, d_b, len) == a From 614d04b6b55569f24ff29894a7e2ea0503e0837d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 4 Mar 2019 13:32:06 +0100 Subject: [PATCH 011/146] Create a fully-featured interrupt example --- examples/interrupt.jl | 297 ++++++++++++++++++++++++++++++++++++++++++ src/execution.jl | 1 + 2 files changed, 298 insertions(+) create mode 100644 examples/interrupt.jl diff --git a/examples/interrupt.jl b/examples/interrupt.jl new file mode 100644 index 00000000..1c264900 --- /dev/null +++ b/examples/interrupt.jl @@ -0,0 +1,297 @@ +using CUDAdrv, CUDAnative, LLVM, LLVM.Interop +import CUDAdrv: @apicall +using Test + +# Allocates an array of host memory that is page-locked and accessible +# to the device. Maps the allocation into the CUDA address space. +# Returns a (host array, device buffer) pair. The former can be used by +# the host to access the array, the latter can be used by the device. +function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} + # Allocate memory that is accessible to both the host and the device. + bytesize = prod(dims) * sizeof(T) + ptr_ref = Ref{Ptr{Cvoid}}() + @apicall( + :cuMemAllocHost, + (Ptr{Ptr{Cvoid}}, Csize_t), + ptr_ref, bytesize) + + device_buffer = CUDAdrv.Mem.Buffer(convert(CuPtr{T}, convert(Csize_t, ptr_ref[])), bytesize, CuCurrentContext()) + + # Wrap the memory in an array for the host. + host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(ptr_ref[]), dims; own = false) + + # Initialize the array's contents. + fill!(host_array, init) + + return host_array, device_buffer +end + +# Queries a stream for its status. +function query_stream(stream::CUDAdrv.CuStream_t = C_NULL)::Cint + return ccall( + (:cuStreamQuery, CUDAdrv.libcuda), + Cint, + (CUDAdrv.CuStream_t,), + stream) +end + +# This example shows that it is possible to use LLVM's atomic compare +# and exchange instructions from CUDAnative kernels. 
+ +# Gets a pointer to a global with a particular name. If the global +# does not exist yet, then it is declared in the global memory address +# space. +@generated function atomic_compare_exchange!(ptr::TPtr, cmp::T, new::T)::T where {TPtr,T} + ptr_type = convert(LLVMType, TPtr) + lt = string(convert(LLVMType, T)) + if isa(ptr_type, LLVM.PointerType) + ir = """ + %result = cmpxchg volatile $lt* %0, $lt %1, $lt %2 seq_cst seq_cst + %rv = extractvalue { $lt, i1 } %result, 0 + ret $lt %rv + """ + else + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + %result = cmpxchg volatile $lt* %ptr, $lt %1, $lt %2 seq_cst seq_cst + %rv = extractvalue { $lt, i1 } %result, 0 + ret $lt %rv + """ + end + :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T, $T}, ptr, cmp, new)) +end + +# Loads a value from a pointer. +@generated function volatile_load(ptr::Ptr{T})::T where T + ptr_type = string(convert(LLVMType, Ptr{T})) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + %rv = load volatile $lt, $lt* %ptr + ret $lt %rv + """ + :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T})}, ptr)) +end + +# Stores a value at a particular address. +@generated function volatile_store!(ptr::Ptr{T}, value::T) where T + ptr_type = string(convert(LLVMType, Ptr{T})) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + store volatile $lt %1, $lt* %ptr + ret void + """ + :(Core.Intrinsics.llvmcall($ir, Cvoid, Tuple{$(Ptr{T}), $T}, ptr, value)) +end + +# Gets a pointer to a global with a particular name. If the global +# does not exist yet, then it is declared in the global memory address +# space. +@generated function get_global_pointer(::Val{global_name}, ::Type{T})::Ptr{T} where {global_name, T} + T_global = convert(LLVMType, T) + T_result = convert(LLVMType, Ptr{T}) + + # Create a thunk that computes a pointer to the global. 
+ llvm_f, _ = create_function(T_result) + mod = LLVM.parent(llvm_f) + + # Figure out if the global has been defined already. + globalSet = LLVM.globals(mod) + global_name_string = String(global_name) + if haskey(globalSet, global_name_string) + global_var = globalSet[global_name_string] + else + # If the global hasn't been defined already, then we'll define + # it in the global address space, i.e., address space one. + global_var = GlobalVariable(mod, T_global, global_name_string, 1) + LLVM.initializer!(global_var, LLVM.null(T_global)) + end + + # Generate IR that computes the global's address. + Builder(JuliaContext()) do builder + entry = BasicBlock(llvm_f, "entry", JuliaContext()) + position!(builder, entry) + + # Cast the global variable's type to the result type. + result = ptrtoint!(builder, global_var, T_result) + ret!(builder, result) + end + + # Call the function. + call_function(llvm_f, Ptr{T}) +end + +macro cuda_global_ptr(name, type) + return :(get_global_pointer( + $(Val(Symbol(name))), + $(esc(type)))) +end + +# Gets a pointer to the interrupt region. +@inline function get_interrupt_pointer()::Ptr{UInt32} + # Compute a pointer to the global in which a pointer to the + # interrupt state is stored. + ptr = @cuda_global_ptr("interrupt_pointer", Ptr{UInt32}) + # state the pointer, netting us a pointer to the interrupt + # region. + return Base.unsafe_load(ptr) +end + +# The interrupt state is a 32-bit unsigned integer that +# can have one of the following values: +# +# * 0: host is ready to process an interrupt, no interrupt +# is currently being processed. +# * 1: device has requested an interrupt, the interrupt +# has not completed processing yet. +# +const ready = UInt32(0) +const processing = UInt32(1) + +# Requests an interrupt and waits until the interrupt +# completes. If an interrupt is already running, then +# nothing happens. Returns `true` if an interrupt was +# successfully started by this function; otherwise, +# `false`. 
+function interrupt_or_wait()::Bool + state_ptr = get_interrupt_pointer() + prev_state = atomic_compare_exchange!(state_ptr, ready, processing) + wait_for_interrupt() + return prev_state == ready +end + +# Waits for the current interrupt to finish, if an +# interrupt is currently running. +function wait_for_interrupt() + state_ptr = get_interrupt_pointer() + while volatile_load(state_ptr) == processing + end +end + +# Repeatedly requests an interrupt until one is requested +# successfully. +function interrupt() + while !interrupt_or_wait() + end +end + +# Waits for the current kernel to terminate and handle +# any interrupts that we encounter along the way. +function handle_interrupts(handler::Function, state::Ptr{UInt32}, stream::CUDAdrv.CuStream_t = C_NULL) + while true + # Sleep to save processing power. + sleep(0.001) + + # Query the CUDA stream. + status = query_stream(stream) + if status == CUDAdrv.SUCCESS.code + # The kernel has finished running. We're done here. + return + elseif status == CUDAdrv.ERROR_NOT_READY.code + # The kernel is still running. Check if an interrupt + # needs handling. + if volatile_load(state) == processing + # Run the handler. + handler() + # Set the interrupt state to 'ready'. + volatile_store!(state, ready) + end + + # Continue querying the stream. + else + # Whoa. Something both unexpected and unpleasant seems + # to have happened. Better throw an exception here. + throw(CuError(status)) + end + end +end + +""" + @cuda_interruptible [kwargs...] func(args...) + +High-level interface for executing code on a GPU with support for interrups. +The `@cuda_interruptible` macro should prefix a call, with `func` a callable function +or object that should return nothing. It will be compiled to a CUDA function upon first +use, and to a certain extent arguments will be converted and anaged automatically using +`cudaconvert`. Finally, a call to `CUDAdrv.cudacall` is performed, scheduling a kernel +launch on the current CUDA context. 
+ +Several keyword arguments are supported that influence kernel compilation and execution. For +more information, refer to the documentation of respectively [`cufunction`](@ref) and +[`CUDAnative.Kernel`](@ref). +""" +macro cuda_interruptible(handler, ex...) + # destructure the `@cuda_interruptible` expression + if length(ex) > 0 && ex[1].head == :tuple + error("The tuple argument to @cuda has been replaced by keywords: `@cuda_interruptible threads=... fun(args...)`") + end + call = ex[end] + kwargs = ex[1:end-1] + + # destructure the kernel call + if call.head != :call + throw(ArgumentError("second argument to @cuda_interruptible should be a function call")) + end + f = call.args[1] + args = call.args[2:end] + + code = quote end + compiler_kwargs, call_kwargs, env_kwargs = CUDAnative.split_kwargs(kwargs) + vars, var_exprs = CUDAnative.assign_args!(code, args) + + # convert the arguments, call the compiler and launch the kernel + # while keeping the original arguments alive + push!(code.args, + quote + GC.@preserve $(vars...) begin + # Define a trivial buffer that contains the interrupt state. + local host_array, device_buffer = alloc_shared_array((1,), ready) + + # Define a kernel initialization function that sets the + # interrupt state pointer. + local function interrupt_kernel_init(kernel) + try + global_handle = CuGlobal{CuPtr{UInt32}}(kernel.mod, "interrupt_pointer") + set(global_handle, CuPtr{UInt32}(device_buffer.ptr)) + catch exception + # The interrupt pointer may not have been declared (because it is unused). + # In that case, we should do nothing. + if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end + end + + # Standard kernel setup logic. 
+ local kernel_args = CUDAnative.cudaconvert.(($(var_exprs...),)) + local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} + local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) + CUDAnative.prepare_kernel(kernel; init=interrupt_kernel_init, $(map(esc, env_kwargs)...)) + kernel(kernel_args...; $(map(esc, call_kwargs)...)) + + # Handle interrupts. + handle_interrupts($(esc(handler)), pointer(host_array, 1)) + end + end) + return code +end + +# Define a kernel that invokes the host to do some work. +function kernel() + interrupt() + return +end + +thread_count = 64 + +# Run the kernel. +global counter = 0 +function handle_interrupt() + global counter + counter += 1 +end + +@cuda_interruptible handle_interrupt threads=thread_count kernel() + +@test counter == thread_count diff --git a/src/execution.jl b/src/execution.jl index 10f9faa9..ecfbefe5 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -104,6 +104,7 @@ kernel to determine the launch configuration. A host-side kernel launch is done kernel_args = cudaconvert.(args) kernel_tt = Tuple{Core.Typeof.(kernel_args)...} kernel = cufunction(f, kernel_tt; compilation_kwargs) + prepare_kernel(kernel; environment_kwargs) kernel(kernel_args...; launch_kwargs) end From 6ed1acf7d51f1fbe2c825b3f2ead0bbd95582e1b Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 4 Mar 2019 14:01:07 +0100 Subject: [PATCH 012/146] Update interrupt example to include memory transfer during interrupts --- examples/interrupt.jl | 54 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/examples/interrupt.jl b/examples/interrupt.jl index 1c264900..564f77a7 100644 --- a/examples/interrupt.jl +++ b/examples/interrupt.jl @@ -27,7 +27,7 @@ function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} end # Queries a stream for its status. 
-function query_stream(stream::CUDAdrv.CuStream_t = C_NULL)::Cint +function query_stream(stream::CUDAdrv.CuStream = CuDefaultStream())::Cint return ccall( (:cuStreamQuery, CUDAdrv.libcuda), Cint, @@ -178,7 +178,7 @@ end # Waits for the current kernel to terminate and handle # any interrupts that we encounter along the way. -function handle_interrupts(handler::Function, state::Ptr{UInt32}, stream::CUDAdrv.CuStream_t = C_NULL) +function handle_interrupts(handler::Function, state::Ptr{UInt32}, stream::CuStream = CuDefaultStream()) while true # Sleep to save processing power. sleep(0.001) @@ -240,6 +240,15 @@ macro cuda_interruptible(handler, ex...) compiler_kwargs, call_kwargs, env_kwargs = CUDAnative.split_kwargs(kwargs) vars, var_exprs = CUDAnative.assign_args!(code, args) + # Find the stream on which the kernel is to be scheduled. + stream = CuDefaultStream() + for kwarg in call_kwargs + key, val = kwarg.args + if key == :stream + stream = val + end + end + # convert the arguments, call the compiler and launch the kernel # while keeping the original arguments alive push!(code.args, @@ -271,27 +280,50 @@ macro cuda_interruptible(handler, ex...) kernel(kernel_args...; $(map(esc, call_kwargs)...)) # Handle interrupts. - handle_interrupts($(esc(handler)), pointer(host_array, 1)) + handle_interrupts($(esc(handler)), pointer(host_array, 1), $(esc(stream))) end end) return code end -# Define a kernel that invokes the host to do some work. -function kernel() +# Define a kernel that copies some data from one array to another. +# The host is invoked to populate the source array. +function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x interrupt() + threadfence_system() + Base.unsafe_store!(b, Base.unsafe_load(a, i), i) return end thread_count = 64 -# Run the kernel. -global counter = 0 +# Allocate two arrays. 
+source_array = Mem.alloc(Float32, thread_count) +destination_array = Mem.alloc(Float32, thread_count) +source_pointer = Base.unsafe_convert(CuPtr{Float32}, source_array) +destination_pointer = Base.unsafe_convert(CuPtr{Float32}, destination_array) + +# Zero-fill the source and destination arrays. +Mem.upload!(source_array, zeros(Float32, thread_count)) +Mem.upload!(destination_array, zeros(Float32, thread_count)) + +# Define one stream for kernel execution and another for +# data transfer. +data_stream = CuStream() +exec_stream = CuStream() + +# Define a magic value. +magic = 42.f0 + +# Configure the interrupt to fill the input array with the magic value. function handle_interrupt() - global counter - counter += 1 + Mem.upload!(source_array, fill(magic, thread_count), data_stream; async = true) + synchronize(data_stream) end -@cuda_interruptible handle_interrupt threads=thread_count kernel() +# Run the kernel. +@cuda_interruptible handle_interrupt threads=thread_count stream=exec_stream kernel(source_pointer, destination_pointer) -@test counter == thread_count +# Check that the destination buffer is as expected. 
+@test Mem.download(Float32, destination_array, thread_count) == fill(magic, thread_count) From d960a9164b6363dbad631c52ebd03cdd9bdf6d54 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 4 Mar 2019 14:50:53 +0100 Subject: [PATCH 013/146] Define a high-level interrupt interface --- src/CUDAnative.jl | 1 + src/interrupts.jl | 296 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 297 insertions(+) create mode 100644 src/interrupts.jl diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl index fc0bbb60..30bfa9c1 100644 --- a/src/CUDAnative.jl +++ b/src/CUDAnative.jl @@ -34,6 +34,7 @@ include(joinpath("device", "runtime.jl")) include("compiler.jl") include("execution.jl") +include("interrupts.jl") include("reflection.jl") include("deprecated.jl") diff --git a/src/interrupts.jl b/src/interrupts.jl new file mode 100644 index 00000000..303bb209 --- /dev/null +++ b/src/interrupts.jl @@ -0,0 +1,296 @@ +# This file implements a high-level generic device-to-host interrupt +# mechanism. This file also contains non-trivial support infrastructure +# that should either be moved to CUDAdrv or exposed by CUDAnative. +# Note that this support infrastructure is not exported, so it remains +# an implementation detail as opposed to a part of CUDAnative's public +# API. + +import CUDAdrv: @apicall + +export @cuda_interruptible, interrupt, interrupt_or_wait, wait_for_interrupt + +# Allocates an array of host memory that is page-locked and accessible +# to the device. Maps the allocation into the CUDA address space. +# Returns a (host array, device buffer) pair. The former can be used by +# the host to access the array, the latter can be used by the device. +function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} + # Allocate memory that is accessible to both the host and the device. 
+ bytesize = prod(dims) * sizeof(T) + ptr_ref = Ref{Ptr{Cvoid}}() + @apicall( + :cuMemAllocHost, + (Ptr{Ptr{Cvoid}}, Csize_t), + ptr_ref, bytesize) + + device_buffer = CUDAdrv.Mem.Buffer(convert(CuPtr{T}, convert(Csize_t, ptr_ref[])), bytesize, CuCurrentContext()) + + # Wrap the memory in an array for the host. + host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(ptr_ref[]), dims; own = false) + + # Initialize the array's contents. + fill!(host_array, init) + + return host_array, device_buffer +end + +# Frees an array of host memory. +function free_shared_array(buffer::Mem.Buffer) + ptr = convert(Ptr{Cvoid}, convert(Csize_t, buffer.ptr)) + @apicall( + :cuMemFreeHost, + (Ptr{Cvoid},), + ptr) +end + +# Queries a stream for its status. +function query_stream(stream::CUDAdrv.CuStream = CuDefaultStream())::Cint + return ccall( + (:cuStreamQuery, CUDAdrv.libcuda), + Cint, + (CUDAdrv.CuStream_t,), + stream) +end + +# Gets a pointer to a global with a particular name. If the global +# does not exist yet, then it is declared in the global memory address +# space. +@generated function atomic_compare_exchange!(ptr::Ptr{T}, cmp::T, new::T)::T where T + ptr_type = convert(LLVMType, Ptr{T}) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + %result = cmpxchg volatile $lt* %ptr, $lt %1, $lt %2 seq_cst seq_cst + %rv = extractvalue { $lt, i1 } %result, 0 + ret $lt %rv + """ + :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T, $T}, ptr, cmp, new)) +end + +# Loads a value from a pointer. +@generated function volatile_load(ptr::Ptr{T})::T where T + ptr_type = string(convert(LLVMType, Ptr{T})) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + %rv = load volatile $lt, $lt* %ptr + ret $lt %rv + """ + :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T})}, ptr)) +end + +# Stores a value at a particular address. 
+@generated function volatile_store!(ptr::Ptr{T}, value::T) where T + ptr_type = string(convert(LLVMType, Ptr{T})) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + store volatile $lt %1, $lt* %ptr + ret void + """ + :(Core.Intrinsics.llvmcall($ir, Cvoid, Tuple{$(Ptr{T}), $T}, ptr, value)) +end + +# Gets a pointer to a global with a particular name. If the global +# does not exist yet, then it is declared in the global memory address +# space. +@generated function get_global_pointer(::Val{global_name}, ::Type{T})::Ptr{T} where {global_name, T} + T_global = convert(LLVMType, T) + T_result = convert(LLVMType, Ptr{T}) + + # Create a thunk that computes a pointer to the global. + llvm_f, _ = create_function(T_result) + mod = LLVM.parent(llvm_f) + + # Figure out if the global has been defined already. + globalSet = LLVM.globals(mod) + global_name_string = String(global_name) + if haskey(globalSet, global_name_string) + global_var = globalSet[global_name_string] + else + # If the global hasn't been defined already, then we'll define + # it in the global address space, i.e., address space one. + global_var = GlobalVariable(mod, T_global, global_name_string, 1) + LLVM.initializer!(global_var, LLVM.null(T_global)) + end + + # Generate IR that computes the global's address. + Builder(JuliaContext()) do builder + entry = BasicBlock(llvm_f, "entry", JuliaContext()) + position!(builder, entry) + + # Cast the global variable's type to the result type. + result = ptrtoint!(builder, global_var, T_result) + ret!(builder, result) + end + + # Call the function. + call_function(llvm_f, Ptr{T}) +end + +macro cuda_global_ptr(name, type) + return :(get_global_pointer( + $(Val(Symbol(name))), + $(esc(type)))) +end + +# Gets a pointer to the interrupt region. +@inline function get_interrupt_pointer()::Ptr{UInt32} + # Compute a pointer to the global in which a pointer to the + # interrupt state is stored. 
+ ptr = @cuda_global_ptr("interrupt_pointer", Ptr{UInt32}) + # state the pointer, netting us a pointer to the interrupt + # region. + return Base.unsafe_load(ptr) +end + +# The interrupt state is a 32-bit unsigned integer that +# can have one of the following values: +# +# * 0: host is ready to process an interrupt, no interrupt +# is currently being processed. +# * 1: device has requested an interrupt, the interrupt +# has not completed processing yet. +# +const ready = UInt32(0) +const processing = UInt32(1) + +# Requests an interrupt and waits until the interrupt +# completes. If an interrupt is already running, then +# nothing happens. Returns `true` if an interrupt was +# successfully started by this function; otherwise, +# `false`. +function interrupt_or_wait()::Bool + state_ptr = get_interrupt_pointer() + prev_state = atomic_compare_exchange!(state_ptr, ready, processing) + wait_for_interrupt() + return prev_state == ready +end + +# Waits for the current interrupt to finish, if an +# interrupt is currently running. +function wait_for_interrupt() + state_ptr = get_interrupt_pointer() + while volatile_load(state_ptr) == processing + end +end + +# Repeatedly requests an interrupt until one is requested +# successfully. +function interrupt() + while !interrupt_or_wait() + end +end + +# Waits for the current kernel to terminate and handle +# any interrupts that we encounter along the way. +function handle_interrupts(handler::Function, state::Ptr{UInt32}, stream::CuStream = CuDefaultStream()) + while true + # Sleep to save processing power. + sleep(0.001) + + # Query the CUDA stream. + status = query_stream(stream) + if status == CUDAdrv.SUCCESS.code + # The kernel has finished running. We're done here. + return + elseif status == CUDAdrv.ERROR_NOT_READY.code + # The kernel is still running. Check if an interrupt + # needs handling. + if volatile_load(state) == processing + # Run the handler. + handler() + # Set the interrupt state to 'ready'. 
+ volatile_store!(state, ready) + end + + # Continue querying the stream. + else + # Whoa. Something both unexpected and unpleasant seems + # to have happened. Better throw an exception here. + throw(CuError(status)) + end + end +end + +""" + @cuda_interruptible [kwargs...] func(args...) + +High-level interface for executing code on a GPU with support for interrups. +The `@cuda_interruptible` macro should prefix a call, with `func` a callable function +or object that should return nothing. It will be compiled to a CUDA function upon first +use, and to a certain extent arguments will be converted and anaged automatically using +`cudaconvert`. Finally, a call to `CUDAdrv.cudacall` is performed, scheduling a kernel +launch on the current CUDA context. + +Several keyword arguments are supported that influence kernel compilation and execution. For +more information, refer to the documentation of respectively [`cufunction`](@ref) and +[`CUDAnative.Kernel`](@ref). +""" +macro cuda_interruptible(handler, ex...) + # destructure the `@cuda_interruptible` expression + if length(ex) > 0 && ex[1].head == :tuple + error("The tuple argument to @cuda has been replaced by keywords: `@cuda_interruptible handler threads=... fun(args...)`") + end + call = ex[end] + kwargs = ex[1:end-1] + + # destructure the kernel call + if call.head != :call + throw(ArgumentError("second argument to @cuda_interruptible should be a function call")) + end + f = call.args[1] + args = call.args[2:end] + + code = quote end + compiler_kwargs, call_kwargs, env_kwargs = CUDAnative.split_kwargs(kwargs) + vars, var_exprs = CUDAnative.assign_args!(code, args) + + # Find the stream on which the kernel is to be scheduled. + stream = CuDefaultStream() + for kwarg in call_kwargs + key, val = kwarg.args + if key == :stream + stream = val + end + end + + # convert the arguments, call the compiler and launch the kernel + # while keeping the original arguments alive + push!(code.args, + quote + GC.@preserve $(vars...) 
begin + # Define a trivial buffer that contains the interrupt state. + local host_array, device_buffer = alloc_shared_array((1,), ready) + + try + # Define a kernel initialization function that sets the + # interrupt state pointer. + local function interrupt_kernel_init(kernel) + try + global_handle = CuGlobal{CuPtr{UInt32}}(kernel.mod, "interrupt_pointer") + set(global_handle, CuPtr{UInt32}(device_buffer.ptr)) + catch exception + # The interrupt pointer may not have been declared (because it is unused). + # In that case, we should do nothing. + if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end + end + + # Standard kernel setup logic. + local kernel_args = CUDAnative.cudaconvert.(($(var_exprs...),)) + local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} + local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) + CUDAnative.prepare_kernel(kernel; init=interrupt_kernel_init, $(map(esc, env_kwargs)...)) + kernel(kernel_args...; $(map(esc, call_kwargs)...)) + + # Handle interrupts. + handle_interrupts($(esc(handler)), pointer(host_array, 1), $(esc(stream))) + finally + free_shared_array(device_buffer) + end + end + end) + return code +end From 7c627c09aa5787f2049ecf64ad6c6a8484b994fc Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 4 Mar 2019 14:51:01 +0100 Subject: [PATCH 014/146] Refactor interrupt examples --- examples/interrupt-memory.jl | 44 +++++ examples/interrupt.jl | 324 +---------------------------------- 2 files changed, 53 insertions(+), 315 deletions(-) create mode 100644 examples/interrupt-memory.jl diff --git a/examples/interrupt-memory.jl b/examples/interrupt-memory.jl new file mode 100644 index 00000000..ac68e622 --- /dev/null +++ b/examples/interrupt-memory.jl @@ -0,0 +1,44 @@ +using CUDAdrv, CUDAnative +using Test + +# Define a kernel that copies some data from one array to another. +# The host is invoked to populate the source array. 
+function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + interrupt_or_wait() + threadfence_system() + Base.unsafe_store!(b, Base.unsafe_load(a, i), i) + return +end + +thread_count = 64 + +# Allocate two arrays. +source_array = Mem.alloc(Float32, thread_count) +destination_array = Mem.alloc(Float32, thread_count) +source_pointer = Base.unsafe_convert(CuPtr{Float32}, source_array) +destination_pointer = Base.unsafe_convert(CuPtr{Float32}, destination_array) + +# Zero-fill the source and destination arrays. +Mem.upload!(source_array, zeros(Float32, thread_count)) +Mem.upload!(destination_array, zeros(Float32, thread_count)) + +# Define one stream for kernel execution and another for +# data transfer. +data_stream = CuStream() +exec_stream = CuStream() + +# Define a magic value. +magic = 42.f0 + +# Configure the interrupt to fill the input array with the magic value. +function handle_interrupt() + Mem.upload!(source_array, fill(magic, thread_count), data_stream; async = true) + synchronize(data_stream) +end + +# Run the kernel. +@cuda_interruptible handle_interrupt threads=thread_count stream=exec_stream kernel(source_pointer, destination_pointer) + +# Check that the destination buffer is as expected. +@test Mem.download(Float32, destination_array, thread_count) == fill(magic, thread_count) diff --git a/examples/interrupt.jl b/examples/interrupt.jl index 564f77a7..fd5bf155 100644 --- a/examples/interrupt.jl +++ b/examples/interrupt.jl @@ -1,329 +1,23 @@ -using CUDAdrv, CUDAnative, LLVM, LLVM.Interop -import CUDAdrv: @apicall +using CUDAdrv, CUDAnative using Test -# Allocates an array of host memory that is page-locked and accessible -# to the device. Maps the allocation into the CUDA address space. -# Returns a (host array, device buffer) pair. The former can be used by -# the host to access the array, the latter can be used by the device. 
-function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} - # Allocate memory that is accessible to both the host and the device. - bytesize = prod(dims) * sizeof(T) - ptr_ref = Ref{Ptr{Cvoid}}() - @apicall( - :cuMemAllocHost, - (Ptr{Ptr{Cvoid}}, Csize_t), - ptr_ref, bytesize) - - device_buffer = CUDAdrv.Mem.Buffer(convert(CuPtr{T}, convert(Csize_t, ptr_ref[])), bytesize, CuCurrentContext()) - - # Wrap the memory in an array for the host. - host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(ptr_ref[]), dims; own = false) - - # Initialize the array's contents. - fill!(host_array, init) - - return host_array, device_buffer -end - -# Queries a stream for its status. -function query_stream(stream::CUDAdrv.CuStream = CuDefaultStream())::Cint - return ccall( - (:cuStreamQuery, CUDAdrv.libcuda), - Cint, - (CUDAdrv.CuStream_t,), - stream) -end - -# This example shows that it is possible to use LLVM's atomic compare -# and exchange instructions from CUDAnative kernels. - -# Gets a pointer to a global with a particular name. If the global -# does not exist yet, then it is declared in the global memory address -# space. -@generated function atomic_compare_exchange!(ptr::TPtr, cmp::T, new::T)::T where {TPtr,T} - ptr_type = convert(LLVMType, TPtr) - lt = string(convert(LLVMType, T)) - if isa(ptr_type, LLVM.PointerType) - ir = """ - %result = cmpxchg volatile $lt* %0, $lt %1, $lt %2 seq_cst seq_cst - %rv = extractvalue { $lt, i1 } %result, 0 - ret $lt %rv - """ - else - ir = """ - %ptr = inttoptr $ptr_type %0 to $lt* - %result = cmpxchg volatile $lt* %ptr, $lt %1, $lt %2 seq_cst seq_cst - %rv = extractvalue { $lt, i1 } %result, 0 - ret $lt %rv - """ - end - :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T, $T}, ptr, cmp, new)) -end - -# Loads a value from a pointer. 
-@generated function volatile_load(ptr::Ptr{T})::T where T - ptr_type = string(convert(LLVMType, Ptr{T})) - lt = string(convert(LLVMType, T)) - ir = """ - %ptr = inttoptr $ptr_type %0 to $lt* - %rv = load volatile $lt, $lt* %ptr - ret $lt %rv - """ - :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T})}, ptr)) -end - -# Stores a value at a particular address. -@generated function volatile_store!(ptr::Ptr{T}, value::T) where T - ptr_type = string(convert(LLVMType, Ptr{T})) - lt = string(convert(LLVMType, T)) - ir = """ - %ptr = inttoptr $ptr_type %0 to $lt* - store volatile $lt %1, $lt* %ptr - ret void - """ - :(Core.Intrinsics.llvmcall($ir, Cvoid, Tuple{$(Ptr{T}), $T}, ptr, value)) -end - -# Gets a pointer to a global with a particular name. If the global -# does not exist yet, then it is declared in the global memory address -# space. -@generated function get_global_pointer(::Val{global_name}, ::Type{T})::Ptr{T} where {global_name, T} - T_global = convert(LLVMType, T) - T_result = convert(LLVMType, Ptr{T}) - - # Create a thunk that computes a pointer to the global. - llvm_f, _ = create_function(T_result) - mod = LLVM.parent(llvm_f) - - # Figure out if the global has been defined already. - globalSet = LLVM.globals(mod) - global_name_string = String(global_name) - if haskey(globalSet, global_name_string) - global_var = globalSet[global_name_string] - else - # If the global hasn't been defined already, then we'll define - # it in the global address space, i.e., address space one. - global_var = GlobalVariable(mod, T_global, global_name_string, 1) - LLVM.initializer!(global_var, LLVM.null(T_global)) - end - - # Generate IR that computes the global's address. - Builder(JuliaContext()) do builder - entry = BasicBlock(llvm_f, "entry", JuliaContext()) - position!(builder, entry) - - # Cast the global variable's type to the result type. - result = ptrtoint!(builder, global_var, T_result) - ret!(builder, result) - end - - # Call the function. 
- call_function(llvm_f, Ptr{T}) -end - -macro cuda_global_ptr(name, type) - return :(get_global_pointer( - $(Val(Symbol(name))), - $(esc(type)))) -end - -# Gets a pointer to the interrupt region. -@inline function get_interrupt_pointer()::Ptr{UInt32} - # Compute a pointer to the global in which a pointer to the - # interrupt state is stored. - ptr = @cuda_global_ptr("interrupt_pointer", Ptr{UInt32}) - # state the pointer, netting us a pointer to the interrupt - # region. - return Base.unsafe_load(ptr) -end - -# The interrupt state is a 32-bit unsigned integer that -# can have one of the following values: -# -# * 0: host is ready to process an interrupt, no interrupt -# is currently being processed. -# * 1: device has requested an interrupt, the interrupt -# has not completed processing yet. -# -const ready = UInt32(0) -const processing = UInt32(1) - -# Requests an interrupt and waits until the interrupt -# completes. If an interrupt is already running, then -# nothing happens. Returns `true` if an interrupt was -# successfully started by this function; otherwise, -# `false`. -function interrupt_or_wait()::Bool - state_ptr = get_interrupt_pointer() - prev_state = atomic_compare_exchange!(state_ptr, ready, processing) - wait_for_interrupt() - return prev_state == ready -end - -# Waits for the current interrupt to finish, if an -# interrupt is currently running. -function wait_for_interrupt() - state_ptr = get_interrupt_pointer() - while volatile_load(state_ptr) == processing - end -end - -# Repeatedly requests an interrupt until one is requested -# successfully. -function interrupt() - while !interrupt_or_wait() - end -end - -# Waits for the current kernel to terminate and handle -# any interrupts that we encounter along the way. -function handle_interrupts(handler::Function, state::Ptr{UInt32}, stream::CuStream = CuDefaultStream()) - while true - # Sleep to save processing power. - sleep(0.001) - - # Query the CUDA stream. 
- status = query_stream(stream) - if status == CUDAdrv.SUCCESS.code - # The kernel has finished running. We're done here. - return - elseif status == CUDAdrv.ERROR_NOT_READY.code - # The kernel is still running. Check if an interrupt - # needs handling. - if volatile_load(state) == processing - # Run the handler. - handler() - # Set the interrupt state to 'ready'. - volatile_store!(state, ready) - end - - # Continue querying the stream. - else - # Whoa. Something both unexpected and unpleasant seems - # to have happened. Better throw an exception here. - throw(CuError(status)) - end - end -end - -""" - @cuda_interruptible [kwargs...] func(args...) - -High-level interface for executing code on a GPU with support for interrups. -The `@cuda_interruptible` macro should prefix a call, with `func` a callable function -or object that should return nothing. It will be compiled to a CUDA function upon first -use, and to a certain extent arguments will be converted and anaged automatically using -`cudaconvert`. Finally, a call to `CUDAdrv.cudacall` is performed, scheduling a kernel -launch on the current CUDA context. - -Several keyword arguments are supported that influence kernel compilation and execution. For -more information, refer to the documentation of respectively [`cufunction`](@ref) and -[`CUDAnative.Kernel`](@ref). -""" -macro cuda_interruptible(handler, ex...) - # destructure the `@cuda_interruptible` expression - if length(ex) > 0 && ex[1].head == :tuple - error("The tuple argument to @cuda has been replaced by keywords: `@cuda_interruptible threads=... 
fun(args...)`") - end - call = ex[end] - kwargs = ex[1:end-1] - - # destructure the kernel call - if call.head != :call - throw(ArgumentError("second argument to @cuda_interruptible should be a function call")) - end - f = call.args[1] - args = call.args[2:end] - - code = quote end - compiler_kwargs, call_kwargs, env_kwargs = CUDAnative.split_kwargs(kwargs) - vars, var_exprs = CUDAnative.assign_args!(code, args) - - # Find the stream on which the kernel is to be scheduled. - stream = CuDefaultStream() - for kwarg in call_kwargs - key, val = kwarg.args - if key == :stream - stream = val - end - end - - # convert the arguments, call the compiler and launch the kernel - # while keeping the original arguments alive - push!(code.args, - quote - GC.@preserve $(vars...) begin - # Define a trivial buffer that contains the interrupt state. - local host_array, device_buffer = alloc_shared_array((1,), ready) - - # Define a kernel initialization function that sets the - # interrupt state pointer. - local function interrupt_kernel_init(kernel) - try - global_handle = CuGlobal{CuPtr{UInt32}}(kernel.mod, "interrupt_pointer") - set(global_handle, CuPtr{UInt32}(device_buffer.ptr)) - catch exception - # The interrupt pointer may not have been declared (because it is unused). - # In that case, we should do nothing. - if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code - rethrow() - end - end - end - - # Standard kernel setup logic. - local kernel_args = CUDAnative.cudaconvert.(($(var_exprs...),)) - local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} - local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) - CUDAnative.prepare_kernel(kernel; init=interrupt_kernel_init, $(map(esc, env_kwargs)...)) - kernel(kernel_args...; $(map(esc, call_kwargs)...)) - - # Handle interrupts. 
- handle_interrupts($(esc(handler)), pointer(host_array, 1), $(esc(stream))) - end - end) - return code -end - -# Define a kernel that copies some data from one array to another. -# The host is invoked to populate the source array. -function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32}) - i = (blockIdx().x-1) * blockDim().x + threadIdx().x +# Define a kernel that makes the host count. +function kernel() interrupt() - threadfence_system() - Base.unsafe_store!(b, Base.unsafe_load(a, i), i) return end thread_count = 64 -# Allocate two arrays. -source_array = Mem.alloc(Float32, thread_count) -destination_array = Mem.alloc(Float32, thread_count) -source_pointer = Base.unsafe_convert(CuPtr{Float32}, source_array) -destination_pointer = Base.unsafe_convert(CuPtr{Float32}, destination_array) - -# Zero-fill the source and destination arrays. -Mem.upload!(source_array, zeros(Float32, thread_count)) -Mem.upload!(destination_array, zeros(Float32, thread_count)) - -# Define one stream for kernel execution and another for -# data transfer. -data_stream = CuStream() -exec_stream = CuStream() - -# Define a magic value. -magic = 42.f0 - -# Configure the interrupt to fill the input array with the magic value. +# Configure the interrupt to increment a counter. +global counter = 0 function handle_interrupt() - Mem.upload!(source_array, fill(magic, thread_count), data_stream; async = true) - synchronize(data_stream) + global counter + counter += 1 end # Run the kernel. -@cuda_interruptible handle_interrupt threads=thread_count stream=exec_stream kernel(source_pointer, destination_pointer) +@cuda_interruptible handle_interrupt threads=thread_count kernel() # Check that the destination buffer is as expected. 
-@test Mem.download(Float32, destination_array, thread_count) == fill(magic, thread_count) +@test counter == thread_count From c45f33df55b40b27370819a2c8caf1ce2749a6ce Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 4 Mar 2019 14:55:51 +0100 Subject: [PATCH 015/146] Document interrupt API --- src/interrupts.jl | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/src/interrupts.jl b/src/interrupts.jl index 303bb209..2eb6e7b1 100644 --- a/src/interrupts.jl +++ b/src/interrupts.jl @@ -154,11 +154,15 @@ end const ready = UInt32(0) const processing = UInt32(1) -# Requests an interrupt and waits until the interrupt -# completes. If an interrupt is already running, then -# nothing happens. Returns `true` if an interrupt was -# successfully started by this function; otherwise, -# `false`. +""" + interrupt_or_wait() + +Requests an interrupt and waits until the interrupt completes. +If an interrupt is already running, then this function waits +for that interrupt to complete, but does not request an interrupt +of its own. Returns `true` if an interrupt was successfully +requested by this function; otherwise, `false`. +""" function interrupt_or_wait()::Bool state_ptr = get_interrupt_pointer() prev_state = atomic_compare_exchange!(state_ptr, ready, processing) @@ -166,16 +170,23 @@ function interrupt_or_wait()::Bool return prev_state == ready end -# Waits for the current interrupt to finish, if an -# interrupt is currently running. +""" + wait_for_interrupt() + +Waits for the current interrupt to finish, if an interrupt is +currently running. +""" function wait_for_interrupt() state_ptr = get_interrupt_pointer() while volatile_load(state_ptr) == processing end end -# Repeatedly requests an interrupt until one is requested -# successfully. +""" + interrupt() + +Repeatedly requests an interrupt until one is requested successfully. 
+""" function interrupt() while !interrupt_or_wait() end From 7c6906b065ee265de296dac17ab4cc8ddc362fd5 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 4 Mar 2019 15:19:41 +0100 Subject: [PATCH 016/146] Define interrupt tests --- examples/interrupt.jl | 3 ++- test/device/interrupts.jl | 57 +++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 1 + 3 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 test/device/interrupts.jl diff --git a/examples/interrupt.jl b/examples/interrupt.jl index fd5bf155..a1c8f81e 100644 --- a/examples/interrupt.jl +++ b/examples/interrupt.jl @@ -19,5 +19,6 @@ end # Run the kernel. @cuda_interruptible handle_interrupt threads=thread_count kernel() -# Check that the destination buffer is as expected. +# Check that the counter's final value equals the number +# of threads. @test counter == thread_count diff --git a/test/device/interrupts.jl b/test/device/interrupts.jl new file mode 100644 index 00000000..0e6d8ab4 --- /dev/null +++ b/test/device/interrupts.jl @@ -0,0 +1,57 @@ +@testset "interrupts" begin + +############################################################################################ + +dummy() = return + +dummy_handler(kernel) = return + +@testset "@cuda_interruptible" begin + +@test_throws UndefVarError @cuda_interruptible dummy_handler undefined() +@test_throws MethodError @cuda_interruptible dummy_handler dummy(1) + +@testset "compilation params" begin + @cuda_interruptible dummy_handler dummy() + + @test_throws CuError @cuda_interruptible dummy_handler threads=2 maxthreads=1 dummy() + @cuda_interruptible dummy_handler threads=2 dummy() +end + +@testset "count" begin + + # This test uses interrupts to increment a host counter and then + # checks that the counter's value equals the number of interrupts. + # This is a useful thing to check because it verifies that interrupts + # are neither skipped nor performed twice. 
+ # + # We will use a sizeable number of threads (128) to give us a better + # shot at detecting concurrency errors, if any. The number of skipped + # interrupts is unlikely to equal the number of additional, unwanted + # interrupts for this many threads. + thread_count = 128 + + # Define a kernel that makes the host count. + function increment_counter() + interrupt() + return + end + + # Configure the interrupt to increment a counter. + global counter = 0 + function handle_interrupt() + global counter + counter += 1 + end + + # Run the kernel. + @cuda_interruptible handle_interrupt threads=thread_count increment_counter() + + # Check that the counter's final value equals the number + # of threads. + @test counter == thread_count +end + +end + +end diff --git a/test/runtests.jl b/test/runtests.jl index 0ca46096..f382330d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -65,6 +65,7 @@ if CUDAnative.configured else include("device/codegen.jl") include("device/execution.jl") + include("device/interrupts.jl") include("device/pointer.jl") include("device/array.jl") include("device/cuda.jl") From 9c73a28da01015ce9b552b008689de3361a55558 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 4 Mar 2019 15:27:43 +0100 Subject: [PATCH 017/146] Add another interrupt test --- test/device/interrupts.jl | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/test/device/interrupts.jl b/test/device/interrupts.jl index 0e6d8ab4..a07a57e8 100644 --- a/test/device/interrupts.jl +++ b/test/device/interrupts.jl @@ -19,7 +19,6 @@ dummy_handler(kernel) = return end @testset "count" begin - # This test uses interrupts to increment a host counter and then # checks that the counter's value equals the number of interrupts. 
# This is a useful thing to check because it verifies that interrupts @@ -52,6 +51,37 @@ end @test counter == thread_count end +@testset "count in stream" begin + # This test is a copy of the previous test, but it uses a non-default + # CUDA stream. This should Just Work: @cuda_interruptible should + # intercept the `stream=...` argument and pass it to the stream-querying + # logic. All of this should be entirely transparent to the user. + thread_count = 128 + + # Define a kernel that makes the host count. + function increment_counter() + interrupt() + return + end + + # Configure the interrupt to increment a counter. + global counter = 0 + function handle_interrupt() + global counter + counter += 1 + end + + # Define a CUDA stream. + exec_stream = CuStream() + + # Run the kernel. + @cuda_interruptible handle_interrupt threads=thread_count stream=exec_stream increment_counter() + + # Check that the counter's final value equals the number + # of threads. + @test counter == thread_count +end + end end From 47439eb7371cd15610d61c38ab157e5ed8ed8178 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 4 Mar 2019 15:34:56 +0100 Subject: [PATCH 018/146] Remove experimental examples I built these examples mostly as experiments. Their core logic ended up in 'interrupts.jl', which is cleverly designed to expose a high-level interface. The examples deleted by this commit are not: they're low-level and kind of hacky. 
--- examples/atomic-exchange.jl | 95 ------------------------------------- examples/global-data.jl | 77 ------------------------------ examples/host-comm.jl | 78 ------------------------------ examples/shared-memory.jl | 38 --------------- 4 files changed, 288 deletions(-) delete mode 100644 examples/atomic-exchange.jl delete mode 100644 examples/global-data.jl delete mode 100644 examples/host-comm.jl delete mode 100644 examples/shared-memory.jl diff --git a/examples/atomic-exchange.jl b/examples/atomic-exchange.jl deleted file mode 100644 index f200022d..00000000 --- a/examples/atomic-exchange.jl +++ /dev/null @@ -1,95 +0,0 @@ -using CUDAdrv, CUDAnative, CUDAatomics, LLVM, LLVM.Interop -using Test - -# This example shows that it is possible to use LLVM's atomic compare -# and exchange instructions from CUDAnative kernels. - -# Gets a pointer to a global with a particular name. If the global -# does not exist yet, then it is declared in the global memory address -# space. -@generated function atomic_compare_exchange!(ptr::TPtr, cmp::T, new::T) where {TPtr,T} - T_ptr = convert(LLVMType, TPtr) - T_val = convert(LLVMType, T) - - # Create a thunk that performs the compare and exchange. - llvm_f, _ = create_function(T_val, [T_ptr, T_val, T_val]) - mod = LLVM.parent(llvm_f) - - # Generate IR for the thunk. - Builder(JuliaContext()) do builder - entry = BasicBlock(llvm_f, "entry", JuliaContext()) - position!(builder, entry) - - # Cast the pointer to an actual pointer. - ptr_val = parameters(llvm_f)[1] - if !isa(ptr_val, LLVM.PointerType) - ptr_val = inttoptr!( - builder, - ptr_val, - LLVM.PointerType(T_val)) - end - - # Perform an atomic compare and exchange. - # TODO: find a way to express the sequential consistency ordering - # that is less brittle than `UInt32(7)`. 
- seq_cst = UInt32(7) - cmpxchg_val = atomic_cmpxchg!( - builder, - ptr_val, - parameters(llvm_f)[2], - parameters(llvm_f)[3], - seq_cst, - seq_cst, - false) - - result = extract_value!(builder, cmpxchg_val, 0) - ret!(builder, result) - end - - # Call the function. - call_function(llvm_f, T, Tuple{TPtr, T, T}, :((ptr, cmp, new))) -end - -# A store that is implemented using an atomic compare and exchange. -# This is overkill as a store implementation, but it shows that -# atomic compare and exchange works. -function wacky_store!(ptr::CUDAnative.DevicePtr{T}, val::T, index::Integer) where T - atomic_compare_exchange!( - ptr + (index - 1) * sizeof(T), - unsafe_load(ptr, index), - val) -end - -# A kernel that swaps the contents of two buffers using atomic compare -# and exchange instructions. -function vswap(a::CUDAnative.DevicePtr{UInt32}, b::CUDAnative.DevicePtr{UInt32}) - i = (blockIdx().x-1) * blockDim().x + threadIdx().x - a_val = unsafe_load(a, i) - b_val = unsafe_load(b, i) - wacky_store!(b, a_val, i) - wacky_store!(a, b_val, i) - return -end - -# Decide on buffer dimensions. -dims = (12,) -len = prod(dims) - -# Fill two buffers with random garbage. -a = UInt32.(round.(rand(Float32, dims) * 100)) -b = UInt32.(round.(rand(Float32, dims) * 100)) - -# Allocate buffers on the GPU. -d_a = Mem.alloc(UInt32, len) -Mem.upload!(d_a, a) -a_ptr = Base.unsafe_convert(CuPtr{UInt32}, d_a) -d_b = Mem.alloc(UInt32, len) -Mem.upload!(d_b, b) -b_ptr = Base.unsafe_convert(CuPtr{UInt32}, d_b) - -# Run the kernel. -@cuda threads=len vswap(a_ptr, b_ptr) - -# Test that the buffers have indeed been swapped. 
-@test Mem.download(UInt32, d_a, len) == b -@test Mem.download(UInt32, d_b, len) == a diff --git a/examples/global-data.jl b/examples/global-data.jl deleted file mode 100644 index 2939612a..00000000 --- a/examples/global-data.jl +++ /dev/null @@ -1,77 +0,0 @@ -using CUDAdrv, CUDAnative, LLVM, LLVM.Interop -using Test - -# This example shows that CUDAnative kernels can include global -# data, which may be set by the host. - -# Gets a pointer to a global with a particular name. If the global -# does not exist yet, then it is declared in the global memory address -# space. -@generated function get_global_pointer(::Val{global_name}, ::Type{T}) where {global_name, T} - T_global = convert(LLVMType, T) - T_result = convert(LLVMType, Ptr{T}) - - # Create a thunk that computes a pointer to the global. - llvm_f, _ = create_function(T_result) - mod = LLVM.parent(llvm_f) - - # Figure out if the global has been defined already. - globalSet = LLVM.globals(mod) - global_name_string = String(global_name) - if haskey(globalSet, global_name_string) - global_var = globalSet[global_name_string] - else - # If the global hasn't been defined already, then we'll define - # it in the global address space, i.e., address space one. - global_var = GlobalVariable(mod, T_global, global_name_string, 1) - LLVM.initializer!(global_var, LLVM.null(T_global)) - end - - # Generate IR that computes the global's address. - Builder(JuliaContext()) do builder - entry = BasicBlock(llvm_f, "entry", JuliaContext()) - position!(builder, entry) - - # Cast the global variable's type to the result type. - result = ptrtoint!(builder, global_var, T_result) - ret!(builder, result) - end - - # Call the function. - call_function(llvm_f, Ptr{T}) -end - -macro cuda_global_ptr(name, type) - return :(get_global_pointer( - $(Val(Symbol(name))), - $(esc(type)))) -end - -# Define a kernel that copies the global's value into an array. 
-function kernel(a::CUDAnative.DevicePtr{Float32}) - i = (blockIdx().x-1) * blockDim().x + threadIdx().x - - ptr = @cuda_global_ptr("test_global", Float32) - Base.unsafe_store!(a, Base.unsafe_load(ptr), i) - return -end - -magic = 42.f0 - -# Define a kernel initialization function that sets the global -# to the magic value. -function kernel_init(kernel) - global_handle = CuGlobal{Float32}(kernel.mod, "test_global") - set(global_handle, magic) -end - -# Allocate a buffer on the GPU. -len = 12 -d_a = Mem.alloc(Float32, len) -ptr = Base.unsafe_convert(CuPtr{Float32}, d_a) - -# Run the kernel. -@cuda threads=len init=kernel_init kernel(ptr) - -# Test that the buffer has indeed been filled with the magic value. -@test Mem.download(Float32, d_a, len) == repeat([magic], len) diff --git a/examples/host-comm.jl b/examples/host-comm.jl deleted file mode 100644 index 0f33e550..00000000 --- a/examples/host-comm.jl +++ /dev/null @@ -1,78 +0,0 @@ -using CUDAdrv, CUDAnative, CuArrays -import CUDAdrv: @apicall -using Test - -# Allocates an array of host memory that is page-locked and accessible -# to the device. Maps the allocation into the CUDA address space. -# Returns a (host array, CuArray) pair. The former can be used by -# the host to access the array, the latter can be used by the device. -function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} - # Allocate memory that is accessible to both the host and the device. - bytesize = prod(dims) * sizeof(T) - ptr_ref = Ref{Ptr{Cvoid}}() - @apicall( - :cuMemAllocHost, - (Ptr{Ptr{Cvoid}}, Csize_t), - ptr_ref, bytesize) - device_buffer = CUDAdrv.Mem.Buffer(ptr_ref[], bytesize, CuCurrentContext()) - - # Wrap the memory in an array for the host. - host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(ptr_ref[]), dims; own = false) - - # Initialize the array's contents. 
- fill!(host_array, init) - - return host_array, CuArray{T, N}(device_buffer, dims; own = false) -end - -# This example shows that devices can communicate with the host -# and vice-versa *during* the execution of a kernel. -# -# What happens is, in chronological order: -# -# 1. A buffer is zero-initialized by the host. -# 2. A kernel is started on the device; said kernel -# waits for the buffer to become nonzero. -# 3. The host makes the buffer nonzero. -# 4. The kernel sets the buffer to a magic value and exits -# once the buffer is nonzero. -# - -function spin(a) - i = threadIdx().x + blockDim().x * (blockIdx().x-1) - # Make sure that 'a[i]' is actually zero when we get started. - if a[i] != 0.f0 - return - end - - # We wait for the host to set 'a[i]' to a nonzero value. - while true - ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ()) - if a[i] != 0.f0 - break - end - end - # Next, we set 'a[i]' to some magic value. - a[i] = 42.f0 - return -end - -# Allocate a shared array. -dims = (3,4) -host_array, device_array = alloc_shared_array(dims, 0.f0) - -# Launch the kernel. -@cuda threads=prod(dims) spin(device_array) - -# Go to sleep for a few milliseconds, to make sure -# that the kernel will have started already. -sleep(0.2) - -# Fill the array with ones now to unblock the kernel. -fill!(host_array, 1.f0) - -# Wait for the kernel to exit. -synchronize() - -# Check that the array has been set to the magic value. -@test host_array == fill(42.f0, dims) diff --git a/examples/shared-memory.jl b/examples/shared-memory.jl deleted file mode 100644 index 9b946f73..00000000 --- a/examples/shared-memory.jl +++ /dev/null @@ -1,38 +0,0 @@ -using CUDAdrv, CUDAnative, CuArrays -import CUDAdrv: @apicall - -using Test - -# Allocates an array of host memory that is page-locked and accessible -# to the device. Maps the allocation into the CUDA address space. -# Returns a (host array, CuArray) pair. 
The former can be used by -# the host to access the array, the latter can be used by the device. -function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} - # Allocate memory that is accessible to both the host and the device. - bytesize = prod(dims) * sizeof(T) - ptr_ref = Ref{Ptr{Cvoid}}() - @apicall( - :cuMemAllocHost, - (Ptr{Ptr{Cvoid}}, Csize_t), - ptr_ref, bytesize) - device_buffer = CUDAdrv.Mem.Buffer(ptr_ref[], bytesize, CuCurrentContext()) - - # Wrap the memory in an array for the host. - host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(ptr_ref[]), dims; own = false) - - # Initialize the array's contents. - fill!(host_array, init) - - return host_array, CuArray{T, N}(device_buffer, dims; own = false) -end - -# Allocate a shared array. -dims = (2,4) -host_array, device_array = alloc_shared_array(dims, Int32(42)) - -# Write some values to the array. -host_array[1, 2] = 10 -host_array[2, 1] = 0 - -# Check that the host's version of the array is the same as the device's. -@test host_array == Array(device_array) From 297bedc4b206c42bd12f691ec37e1b1ce9d22743 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 5 Mar 2019 14:09:38 +0100 Subject: [PATCH 019/146] Implement a reader-writer lock --- examples/lock.jl | 31 ++++++++ src/CUDAnative.jl | 4 + src/device/threading.jl | 159 +++++++++++++++++++++++++++++++++++++++ src/interrupts.jl | 39 ---------- test/device/threading.jl | 91 ++++++++++++++++++++++ test/runtests.jl | 2 + 6 files changed, 287 insertions(+), 39 deletions(-) create mode 100644 examples/lock.jl create mode 100644 src/device/threading.jl create mode 100644 test/device/threading.jl diff --git a/examples/lock.jl b/examples/lock.jl new file mode 100644 index 00000000..b4269a7b --- /dev/null +++ b/examples/lock.jl @@ -0,0 +1,31 @@ +using CUDAdrv, CUDAnative +using Test + +thread_count = 128 + +# Define a kernel that atomically increments a counter using a lock. 
+function increment_counter(counter::CUDAnative.DevicePtr{Int32}, lock_state::CUDAnative.DevicePtr{CUDAnative.ReaderWriterLockState}) + lock = ReaderWriterLock(lock_state) + writer_locked(lock) do + unsafe_store!(counter, unsafe_load(counter) + 1) + end + return +end + +# Allocate memory for the counter and the lock. +counter_buf = Mem.alloc(sizeof(Int32)) +Mem.upload!(counter_buf, [Int32(0)]) +counter_pointer = Base.unsafe_convert(CuPtr{Int32}, counter_buf) + +lock_buf = Mem.alloc(sizeof(CUDAnative.ReaderWriterLockState)) +Mem.upload!(lock_buf, [CUDAnative.ReaderWriterLockState(0)]) +lock_pointer = Base.unsafe_convert(CuPtr{CUDAnative.ReaderWriterLockState}, lock_buf) + +# @device_code_warntype increment_counter(counter_pointer, lock_pointer) + +# Run the kernel. +@cuda threads=thread_count increment_counter(counter_pointer, lock_pointer) + +# Check that the counter's final value equals the number +# of threads. +@test Mem.download(Int32, counter_buf) == [Int32(thread_count)] diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl index 30bfa9c1..6f18eeb1 100644 --- a/src/CUDAnative.jl +++ b/src/CUDAnative.jl @@ -31,6 +31,10 @@ include(joinpath("device", "array.jl")) include(joinpath("device", "cuda.jl")) include(joinpath("device", "llvm.jl")) include(joinpath("device", "runtime.jl")) +include(joinpath("device", "libdevice.jl")) +include(joinpath("device", "cuda_intrinsics.jl")) +include(joinpath("device", "runtime_intrinsics.jl")) +include(joinpath("device", "threading.jl")) include("compiler.jl") include("execution.jl") diff --git a/src/device/threading.jl b/src/device/threading.jl new file mode 100644 index 00000000..8bbeadf9 --- /dev/null +++ b/src/device/threading.jl @@ -0,0 +1,159 @@ +# This file implements threading primitives that work for CUDAnative kernels. + +export ReaderWriterLock, reader_locked, writer_locked + +# Gets a pointer to a global with a particular name. 
If the global +# does not exist yet, then it is declared in the global memory address +# space. +@generated function atomic_compare_exchange!(ptr::Ptr{T}, cmp::T, new::T)::T where T + ptr_type = convert(LLVMType, Ptr{T}) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + %result = cmpxchg volatile $lt* %ptr, $lt %1, $lt %2 seq_cst seq_cst + %rv = extractvalue { $lt, i1 } %result, 0 + ret $lt %rv + """ + :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T, $T}, ptr, cmp, new)) +end + +# Atomically adds a value to a variable pointed to by a pointer. +# Returns the previous value stored in that value. +@generated function atomic_add!(lhs::Ptr{T}, rhs::T)::T where T + ptr_type = convert(LLVMType, Ptr{T}) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + %rv = atomicrmw volatile add $lt* %ptr, $lt %1 seq_cst + ret $lt %rv + """ + :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T}, lhs, rhs)) +end + +# Loads a value from a pointer. +@generated function volatile_load(ptr::Ptr{T})::T where T + ptr_type = string(convert(LLVMType, Ptr{T})) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + %rv = load volatile $lt, $lt* %ptr + ret $lt %rv + """ + :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T})}, ptr)) +end + +# Stores a value at a particular address. +@generated function volatile_store!(ptr::Ptr{T}, value::T) where T + ptr_type = string(convert(LLVMType, Ptr{T})) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + store volatile $lt %1, $lt* %ptr + ret void + """ + :(Core.Intrinsics.llvmcall($ir, Cvoid, Tuple{$(Ptr{T}), $T}, ptr, value)) +end + +const ReaderWriterLockState = Int64 + +""" +A reader-writer lock: a lock that supports concurrent access for +read operations and exclusive access for write operations. +""" +struct ReaderWriterLock + # A pointer to the reader-writer lock's state. 
The state + # is a counter that can be in one of the following states: + # + # * > 0: the lock is acquired by one or more readers. + # The state counter describes the number of readers + # that have acquired the lock. + # + # * = 0: the lock is idle. + # + # * < 0: the lock is acquired by a single writer. + # + state_ptr::Ptr{ReaderWriterLockState} +end + +ReaderWriterLock(state_ptr::DevicePtr{ReaderWriterLockState}) = ReaderWriterLock( + convert(Ptr{ReaderWriterLockState}, convert(Csize_t, state_ptr))) + +const max_rw_lock_readers = (1 << (sizeof(ReaderWriterLockState) * 8 - 1)) + +# Serializes execution of a function within a warp, to combat thread +# divergence-related deadlocks. +function warp_serialized(func::Function) + # Get the current thread's ID. + thread_id = threadIdx().x - 1 + + # Get the size of a warp. + size = warpsize() + + local result + i = 0 + while i < size + if thread_id % size == i + result = func() + end + i += 1 + end + return result +end + +""" + reader_locked(func::Function, lock::ReaderWriterLock) + +Acquires a reader-writer lock in reader mode, runs `func` while the lock is +acquired and releases the lock again. +""" +function reader_locked(func::Function, lock::ReaderWriterLock) + warp_serialized() do + while true + # Increment the reader count. If the lock is in write-acquired mode, + # then the lock will stay in that mode (unless the reader count is + # exceeded, but that is virtually impossible). Otherwise, the lock + # will end up in read-acquired mode. + previous_state = atomic_add!(lock.state_ptr, 1) + + # If the lock was in the idle or read-acquired state, then + # it is now in read-acquired mode. + if previous_state >= 0 + # Run the function. + result = func() + # Decrement the reader count to release the reader lock. + atomic_add!(lock.state_ptr, -1) + # We're done here. + return result + end + + # Decrement the reader count and try again. 
+ atomic_add!(lock.state_ptr, -1) + end + end +end + +""" + writer_locked(func::Function, lock::ReaderWriterLock) + +Acquires a reader-writer lock in writer mode, runs `func` while the lock is +acquired and releases the lock again. +""" +function writer_locked(func::Function, lock::ReaderWriterLock) + warp_serialized() do + # Try to move the lock from 'idle' to 'write-acquired'. + while atomic_compare_exchange!(lock.state_ptr, 0, -max_rw_lock_readers) != 0 + end + + # We acquired the lock. Run the function. + result = func() + + # Release the lock by atomically adding `max_rw_lock_readers` to the + # lock's state. It's important that we use an atomic add instead of a + # simple store because a store might cause a race condition with `read_locked` + # that'll put us in a deadlock state. + atomic_add!(lock.state_ptr, max_rw_lock_readers) + + # We're done here. + return result + end +end diff --git a/src/interrupts.jl b/src/interrupts.jl index 2eb6e7b1..03068387 100644 --- a/src/interrupts.jl +++ b/src/interrupts.jl @@ -51,45 +51,6 @@ function query_stream(stream::CUDAdrv.CuStream = CuDefaultStream())::Cint stream) end -# Gets a pointer to a global with a particular name. If the global -# does not exist yet, then it is declared in the global memory address -# space. -@generated function atomic_compare_exchange!(ptr::Ptr{T}, cmp::T, new::T)::T where T - ptr_type = convert(LLVMType, Ptr{T}) - lt = string(convert(LLVMType, T)) - ir = """ - %ptr = inttoptr $ptr_type %0 to $lt* - %result = cmpxchg volatile $lt* %ptr, $lt %1, $lt %2 seq_cst seq_cst - %rv = extractvalue { $lt, i1 } %result, 0 - ret $lt %rv - """ - :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T, $T}, ptr, cmp, new)) -end - -# Loads a value from a pointer. 
-@generated function volatile_load(ptr::Ptr{T})::T where T - ptr_type = string(convert(LLVMType, Ptr{T})) - lt = string(convert(LLVMType, T)) - ir = """ - %ptr = inttoptr $ptr_type %0 to $lt* - %rv = load volatile $lt, $lt* %ptr - ret $lt %rv - """ - :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T})}, ptr)) -end - -# Stores a value at a particular address. -@generated function volatile_store!(ptr::Ptr{T}, value::T) where T - ptr_type = string(convert(LLVMType, Ptr{T})) - lt = string(convert(LLVMType, T)) - ir = """ - %ptr = inttoptr $ptr_type %0 to $lt* - store volatile $lt %1, $lt* %ptr - ret void - """ - :(Core.Intrinsics.llvmcall($ir, Cvoid, Tuple{$(Ptr{T}), $T}, ptr, value)) -end - # Gets a pointer to a global with a particular name. If the global # does not exist yet, then it is declared in the global memory address # space. diff --git a/test/device/threading.jl b/test/device/threading.jl new file mode 100644 index 00000000..fa9533b1 --- /dev/null +++ b/test/device/threading.jl @@ -0,0 +1,91 @@ +@testset "threading" begin + +############################################################################################ + +@testset "reader-writer lock" begin + +@testset "writers only" begin + + thread_count = 128 + + # Define a kernel that atomically increments a counter using a lock. + function increment_counter(counter::CUDAnative.DevicePtr{Int32}, lock_state::CUDAnative.DevicePtr{CUDAnative.ReaderWriterLockState}) + lock = ReaderWriterLock(lock_state) + writer_locked(lock) do + unsafe_store!(counter, unsafe_load(counter) + 1) + end + return + end + + # Allocate memory for the counter and the lock. 
+ counter_buf = Mem.alloc(sizeof(Int32)) + Mem.upload!(counter_buf, [Int32(0)]) + counter_pointer = Base.unsafe_convert(CuPtr{Int32}, counter_buf) + + lock_buf = Mem.alloc(sizeof(CUDAnative.ReaderWriterLockState)) + Mem.upload!(lock_buf, [CUDAnative.ReaderWriterLockState(0)]) + lock_pointer = Base.unsafe_convert(CuPtr{CUDAnative.ReaderWriterLockState}, lock_buf) + + # Run the kernel. + @cuda threads=thread_count increment_counter(counter_pointer, lock_pointer) + + # Check that the counter's final value equals the number + # of threads. + @test Mem.download(Int32, counter_buf) == [Int32(thread_count)] + +end + +@testset "readers and writers" begin + + thread_count = 128 + + # Define a kernel. + function mutate_counter_maybe(counter::CUDAnative.DevicePtr{Int32}, lock_state::CUDAnative.DevicePtr{CUDAnative.ReaderWriterLockState}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + lock = ReaderWriterLock(lock_state) + # Read the previous counter and update the current counter. + # Do this many times. + if i % 16 == 0 + # Some threads get to atomically increment the counter. + writer_locked(lock) do + unsafe_store!(counter, unsafe_load(counter) + 1) + end + else + # All the other threads acquire the lock in reader mode + # and check that the counter's value doesn't change. + reader_locked(lock) do + counter_ptr = convert(Ptr{Int32}, convert(Csize_t, counter)) + counter_val = CUDAnative.volatile_load(counter_ptr) + j = 0 + while j < 10 + if CUDAnative.volatile_load(counter_ptr) != counter_val + throw(ErrorException("oh no")) + end + j += 1 + end + end + end + return + end + + # Allocate memory for the counter and the lock. 
+ counter_buf = Mem.alloc(sizeof(Int32)) + Mem.upload!(counter_buf, [Int32(0)]) + counter_pointer = Base.unsafe_convert(CuPtr{Int32}, counter_buf) + + lock_buf = Mem.alloc(sizeof(CUDAnative.ReaderWriterLockState)) + Mem.upload!(lock_buf, [CUDAnative.ReaderWriterLockState(0)]) + lock_pointer = Base.unsafe_convert(CuPtr{CUDAnative.ReaderWriterLockState}, lock_buf) + + # Run the kernel. + @cuda threads=thread_count mutate_counter_maybe(counter_pointer, lock_pointer) + + # Check that the counter's final value equals the number + # of threads. + @test Mem.download(Int32, counter_buf) == [Int32(thread_count / 16)] + +end + +end + +end diff --git a/test/runtests.jl b/test/runtests.jl index f382330d..05e1687f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -69,6 +69,8 @@ if CUDAnative.configured include("device/pointer.jl") include("device/array.jl") include("device/cuda.jl") + include("device/intrinsics.jl") + include("device/threading.jl") #include("examples.jl") end From cfb6dd8e79385f1fa95136b57f748b5a4fbe012f Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 5 Mar 2019 18:07:30 +0100 Subject: [PATCH 020/146] Create an allocator prototype for the GC --- examples/gc-malloc.jl | 30 ++++ src/CUDAnative.jl | 1 + src/gc.jl | 325 ++++++++++++++++++++++++++++++++++++++++++ src/interrupts.jl | 2 +- 4 files changed, 357 insertions(+), 1 deletion(-) create mode 100644 examples/gc-malloc.jl create mode 100644 src/gc.jl diff --git a/examples/gc-malloc.jl b/examples/gc-malloc.jl new file mode 100644 index 00000000..597ed2ae --- /dev/null +++ b/examples/gc-malloc.jl @@ -0,0 +1,30 @@ +using CUDAdrv, CUDAnative +using Test + +# Define a kernel that copies values using a temporary buffer. 
+function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + buffer = Base.unsafe_convert(Ptr{Float32}, gc_malloc(sizeof(Float32) * Csize_t(16))) + + unsafe_store!(buffer, unsafe_load(a, i), i % 13) + unsafe_store!(b, unsafe_load(buffer, i % 13), i) + + return +end + +thread_count = 64 + +# Allocate two arrays. +source_array = Mem.alloc(Float32, thread_count) +destination_array = Mem.alloc(Float32, thread_count) +source_pointer = Base.unsafe_convert(CuPtr{Float32}, source_array) +destination_pointer = Base.unsafe_convert(CuPtr{Float32}, destination_array) + +# Fill the source and destination arrays. +Mem.upload!(source_array, fill(42.f0, thread_count)) +Mem.upload!(destination_array, zeros(Float32, thread_count)) + +# Run the kernel. +@cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) + +@test Mem.download(Float32, destination_array, thread_count) == fill(42.f0, thread_count) diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl index 6f18eeb1..38d6dd3c 100644 --- a/src/CUDAnative.jl +++ b/src/CUDAnative.jl @@ -39,6 +39,7 @@ include(joinpath("device", "threading.jl")) include("compiler.jl") include("execution.jl") include("interrupts.jl") +include("gc.jl") include("reflection.jl") include("deprecated.jl") diff --git a/src/gc.jl b/src/gc.jl new file mode 100644 index 00000000..fb595e8e --- /dev/null +++ b/src/gc.jl @@ -0,0 +1,325 @@ +# This file contains a GC implementation for CUDAnative kernels. +# +# CURRENT STATE OF THE GC +# +# Simple memory allocation is underway. Memory allocation currently +# uses a simple free-list. +# +# END GOAL +# +# The CUDAnative GC is a precise, non-moving, mark-and-sweep GC that runs +# on the host. The device may trigger the GC via an interrupt. +# +# Some GPU-related GC implementation details: +# +# * GC memory is shared by the host and device. +# * Every thread gets a fixed region of memory for storing GC roots in. 
+# * When the device runs out of GC memory, it requests an interrupt
+# to mark and sweep.
+
+export @cuda_gc, gc_malloc
+
+# An entry in the GC's free list. Every entry is placed at the
+# start of a free memory chunk. The `next` pointer of a GC free
+# list entry is aligned to a 16-byte boundary.
+struct GCFreeListEntry
+    # The size of the entry. This size does not include the entry's
+    # `size` field, but it does include the `next` field.
+    size::Csize_t
+    # A pointer to the next entry in the free list.
+    next::Ptr{GCFreeListEntry}
+end
+
+@generated function get_field_pointer_impl(base_pointer::Ptr{TBase}, ::Val{field_name}) where {TBase, field_name}
+    index = Base.fieldindex(TBase, field_name)
+    offset = Base.fieldoffset(TBase, index)
+    type = Core.fieldtype(TBase, index)
+    :(Base.unsafe_convert(Ptr{$type}, base_pointer + $(offset)))
+end
+
+# Gets a pointer to a particular field.
+macro get_field_pointer(base_pointer, field_name)
+    :(get_field_pointer_impl($(esc(base_pointer)), Val($field_name)))
+end
+
+# A data structure that contains information relevant
+# to the GC's inner workings.
+struct GCMemoryInfo
+    # The head of the free list.
+    free_list_head::Ptr{GCFreeListEntry}
+end
+
+# Gets the global GC interrupt lock.
+@inline function get_interrupt_lock()::ReaderWriterLock
+    return ReaderWriterLock(@cuda_global_ptr("gc_interrupt_lock", ReaderWriterLockState))
+end
+
+# Gets a pointer to the global GC info data structure pointer.
+@inline function get_gc_info_pointer()::Ptr{Ptr{GCMemoryInfo}}
+    return @cuda_global_ptr("gc_info_pointer", Ptr{GCMemoryInfo})
+end
+
+const gc_align = Csize_t(16)
+
+# Aligns a pointer to an alignment boundary.
+function align_to_boundary(address::Ptr{T}, alignment::Csize_t = gc_align)::Ptr{T} where T + address_int = Base.convert(Csize_t, address) + remainder = address_int % alignment + if remainder == Csize_t(0) + return address + else + return address + alignment - remainder + end +end + +# Tries to use a free-list entry to allocate a chunk of data of size `bytesize`. +# Updates the free list if the allocation succeeds. Returns a null pointer otherwise. +function gc_use_free_list_entry(entry_ptr::Ptr{Ptr{GCFreeListEntry}}, entry::Ptr{GCFreeListEntry}, bytesize::Csize_t)::Ptr{UInt8} + entry_data = unsafe_load(entry) + if entry_data.size < bytesize + # The entry is just too small. Return a `null` pointer. + return C_NULL + end + + # The entry's big enough, so we'll use it. If at all possible, we want + # to create a new entry from any unused memory in the entry. + + # Compute the address to return. + data_address = Base.unsafe_convert(Ptr{UInt8}, entry) + sizeof(Csize_t) + + # Compute the end of the free memory chunk. + end_address = data_address + entry_data.size + + # Compute the start address of the new free list entry. The `next` + # field of that entry needs to be aligned to a 16-byte boundary, + # but the `size` field doesn't. + new_data_address = align_to_boundary(data_address + bytesize) + new_entry_address = new_data_address - sizeof(Csize_t) + if new_entry_address < data_address + bytesize + new_entry_address += gc_align + end + + # If we can place a new entry just past the allocation, then we should + # by all means do so. + if new_entry_address + sizeof(GCFreeListEntry) < end_address + # Create a new free list entry. + new_entry_size = Csize_t(end_address) - Csize_t(new_data_address) + new_entry_ptr = Base.unsafe_convert(Ptr{GCFreeListEntry}, new_entry_address) + unsafe_store!( + new_entry_ptr, + GCFreeListEntry(new_entry_size, entry_data.next)) + + # Update this entry's `size` field to reflect the new entry's space + # requirements. 
+ unsafe_store!( + @get_field_pointer(entry, :size)::Ptr{Csize_t}, + entry_data.size - new_entry_size - sizeof(GCFreeListEntry)) + + # Update the free list pointer. + unsafe_store!(entry_ptr, new_entry_ptr) + else + # We can't create a new entry, but we still have to update the free + # list pointer. + unsafe_store!(entry_ptr, entry_data.next) + end + + return data_address +end + +# Tries to allocate a chunk of memory from a free list. +# Returns a null pointer if no sufficiently large chunk of +# memory can be found. +# +# `free_list_ptr` is a pointer to the head of the free list. +# +# This function is not thread-safe. +function gc_malloc_from_free_list(free_list_ptr::Ptr{Ptr{GCFreeListEntry}}, bytesize::Csize_t)::Ptr{UInt8} + # To allocate memory, we will walk the free list until we find a suitable candidate. + while free_list_ptr != C_NULL + free_list_item = unsafe_load(free_list_ptr) + + if free_list_item == C_NULL + break + end + + result = gc_use_free_list_entry(free_list_ptr, free_list_item, bytesize) + if result != C_NULL + return result + end + + free_list_ptr = @get_field_pointer(free_list_item, :next)::Ptr{Ptr{GCFreeListEntry}} + end + return C_NULL +end + +# Tries to allocate a chunk of memory. +# Returns a null pointer if no sufficiently large chunk of +# memory can be found. +function gc_malloc_local(gc_info::Ptr{GCMemoryInfo}, bytesize::Csize_t)::Ptr{UInt8} + # TODO: reader-lock on the interrupt lock and writer-lock on the GC's + # lock. + writer_locked(get_interrupt_lock()) do + free_list_ptr = @get_field_pointer(gc_info, :free_list_head)::Ptr{Ptr{GCFreeListEntry}} + return gc_malloc_from_free_list(free_list_ptr, bytesize) + end +end + +# Allocates a blob of memory that is managed by the garbage collector. +# This function is designed to be called by the device. +function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} + gc_info = unsafe_load(get_gc_info_pointer()) + + # Try to malloc the object without host intervention. 
+ ptr = gc_malloc_local(gc_info, bytesize) + if ptr != C_NULL + return ptr + end + + # We're out of memory. Ask the host to step in. + writer_locked(get_interrupt_lock()) do + interrupt_or_wait() + end + + # Try to malloc again. + ptr = gc_malloc_local(gc_info, bytesize) + if ptr != C_NULL + return ptr + end + + # Alright, so that was a spectacular failure. Let's just throw an exception. + @cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", bytesize) + # throw(OutOfMemoryError()) + return C_NULL +end + +# Set the initial size of the chunk of memory allocated to the +# GC to 16MiB. +const initial_gc_memory_size = 16 * (1 << 20) + +# Initializes GC memory. +function gc_init(buffer::Array{UInt8, 1}) + buffer_ptr = pointer(buffer, 1) + + # Create a single free list entry. + first_entry_ptr = Base.unsafe_convert(Ptr{GCFreeListEntry}, buffer_ptr + sizeof(GCMemoryInfo)) + unsafe_store!( + first_entry_ptr, + GCFreeListEntry( + length(buffer) - sizeof(Csize_t) - sizeof(GCMemoryInfo), + C_NULL)) + + # Set up the main GC data structure. + gc_info = Base.unsafe_convert(Ptr{GCMemoryInfo}, buffer_ptr) + unsafe_store!( + gc_info, + GCMemoryInfo(first_entry_ptr)) +end + +# Triggers a GC collection. +function gc_collect(info::Ptr{GCMemoryInfo}) + println("GC collections are not implemented yet.") +end + +""" + @cuda_gc [kwargs...] func(args...) + +High-level interface for executing code on a GPU with GC support. +The `@cuda_gc` macro should prefix a call, with `func` a callable function +or object that should return nothing. It will be compiled to a CUDA function upon first +use, and to a certain extent arguments will be converted and anaged automatically using +`cudaconvert`. Finally, a call to `CUDAdrv.cudacall` is performed, scheduling a kernel +launch on the current CUDA context. + +Several keyword arguments are supported that influence kernel compilation and execution. 
For +more information, refer to the documentation of respectively [`cufunction`](@ref) and +[`CUDAnative.Kernel`](@ref). +""" +macro cuda_gc(ex...) + # destructure the `@cuda_gc` expression + if length(ex) > 0 && ex[1].head == :tuple + error("The tuple argument to @cuda has been replaced by keywords: `@cuda_gc threads=... fun(args...)`") + end + call = ex[end] + kwargs = ex[1:end-1] + + # destructure the kernel call + if call.head != :call + throw(ArgumentError("second argument to @cuda_gc should be a function call")) + end + f = call.args[1] + args = call.args[2:end] + + code = quote end + compiler_kwargs, call_kwargs, env_kwargs = CUDAnative.split_kwargs(kwargs) + vars, var_exprs = CUDAnative.assign_args!(code, args) + + # Find the stream on which the kernel is to be scheduled. + stream = CuDefaultStream() + for kwarg in call_kwargs + key, val = kwarg.args + if key == :stream + stream = val + end + end + + # convert the arguments, call the compiler and launch the kernel + # while keeping the original arguments alive + push!(code.args, + quote + GC.@preserve $(vars...) begin + # Define a trivial buffer that contains the interrupt state. + local host_interrupt_array, device_interrupt_buffer = alloc_shared_array((1,), ready) + + # Allocate a shared buffer for GC memory. + local host_gc_array, device_gc_buffer = alloc_shared_array((initial_gc_memory_size,), UInt8(0)) + gc_init(host_gc_array) + + # Define a kernel initialization function. + local function kernel_init(kernel) + # Set the interrupt state pointer. + try + global_handle = CuGlobal{CuPtr{UInt32}}(kernel.mod, "interrupt_pointer") + set(global_handle, CuPtr{UInt32}(device_interrupt_buffer.ptr)) + catch exception + # The interrupt pointer may not have been declared (because it is unused). + # In that case, we should do nothing. + if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end + + # Set the GC state pointer. 
+ try + global_handle = CuGlobal{CuPtr{GCMemoryInfo}}(kernel.mod, "gc_info_pointer") + set(global_handle, CuPtr{GCMemoryInfo}(device_gc_buffer.ptr)) + catch exception + # The GC info pointer may not have been declared (because it is unused). + # In that case, we should do nothing. + if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end + end + + local function handle_interrupt() + gc_collect(Ptr{GCMemoryInfo}(pointer(host_gc_array, 1))) + end + + try + # Standard kernel setup logic. + local kernel_args = CUDAnative.cudaconvert.(($(var_exprs...),)) + local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} + local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) + CUDAnative.prepare_kernel(kernel; init=kernel_init, $(map(esc, env_kwargs)...)) + kernel(kernel_args...; $(map(esc, call_kwargs)...)) + + # Handle interrupts. + handle_interrupts(handle_interrupt, pointer(host_interrupt_array, 1), $(esc(stream))) + finally + free_shared_array(device_interrupt_buffer) + free_shared_array(device_gc_buffer) + end + end + end) + return code +end diff --git a/src/interrupts.jl b/src/interrupts.jl index 03068387..333545bf 100644 --- a/src/interrupts.jl +++ b/src/interrupts.jl @@ -187,7 +187,7 @@ end """ @cuda_interruptible [kwargs...] func(args...) -High-level interface for executing code on a GPU with support for interrups. +High-level interface for executing code on a GPU with support for interrupts. The `@cuda_interruptible` macro should prefix a call, with `func` a callable function or object that should return nothing. 
It will be compiled to a CUDA function upon first use, and to a certain extent arguments will be converted and anaged automatically using From 279f6ff30a71146a40c48315b7141b3d959edc96 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 11:38:49 +0100 Subject: [PATCH 021/146] Rename 'GCFreeListEntry' to 'GCAllocationRecord' --- src/gc.jl | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index fb595e8e..65f7942b 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -5,6 +5,13 @@ # Simple memory allocation is underway. Memory allocation currently # uses a simple free-list. # +# MEMORY ALLOCATION +# +# The GC's allocator uses free lists, i.e., the allocator maintains +# a list of all blocks that have not been allocated. Additionally, +# the allocator also maintains a list of all allocated blocks, so +# the collector knows which blocks it can free. +# # END GOAL # # The CUDAnative GC is a precise, non-moving, mark-and-sweep GC that runs @@ -19,15 +26,14 @@ export @cuda_gc, gc_malloc -# An entry in the GC's free list. Every entry is placed at the -# start of an free memory chunk. The `next` pointer of a GC free -# list entry is aligned to a 16-byte boundary. -struct GCFreeListEntry - # The size of the entry. This size does not include the entry's - # `size` field, but it does include the `next` field. +# A data structure that precedes every chunk of memory that has been +# allocated or put into the free list. +struct GCAllocationRecord + # The size of the memory region this allocation record precedes. + # This size does not include the allocation record itself. size::Csize_t # A pointer to the next entry in the free list. - next::Ptr{GCFreeListEntry} + next::Ptr{GCAllocationRecord} end @generated function get_field_pointer_impl(base_pointer::Ptr{TBase}, ::Val{field_name}) where {TBase, field_name} @@ -46,7 +52,7 @@ end # to the GC's inner workings. 
struct GCMemoryInfo # The head of the free list. - free_list_head::Ptr{GCFreeListEntry} + free_list_head::Ptr{GCAllocationRecord} end # Gets the global GC interrupt lock. @@ -74,7 +80,7 @@ end # Tries to use a free-list entry to allocate a chunk of data of size `bytesize`. # Updates the free list if the allocation succeeds. Returns a null pointer otherwise. -function gc_use_free_list_entry(entry_ptr::Ptr{Ptr{GCFreeListEntry}}, entry::Ptr{GCFreeListEntry}, bytesize::Csize_t)::Ptr{UInt8} +function gc_use_free_list_entry(entry_ptr::Ptr{Ptr{GCAllocationRecord}}, entry::Ptr{GCAllocationRecord}, bytesize::Csize_t)::Ptr{UInt8} entry_data = unsafe_load(entry) if entry_data.size < bytesize # The entry is just too small. Return a `null` pointer. @@ -101,19 +107,19 @@ function gc_use_free_list_entry(entry_ptr::Ptr{Ptr{GCFreeListEntry}}, entry::Ptr # If we can place a new entry just past the allocation, then we should # by all means do so. - if new_entry_address + sizeof(GCFreeListEntry) < end_address + if new_entry_address + sizeof(GCAllocationRecord) < end_address # Create a new free list entry. new_entry_size = Csize_t(end_address) - Csize_t(new_data_address) - new_entry_ptr = Base.unsafe_convert(Ptr{GCFreeListEntry}, new_entry_address) + new_entry_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, new_entry_address) unsafe_store!( new_entry_ptr, - GCFreeListEntry(new_entry_size, entry_data.next)) + GCAllocationRecord(new_entry_size, entry_data.next)) # Update this entry's `size` field to reflect the new entry's space # requirements. unsafe_store!( @get_field_pointer(entry, :size)::Ptr{Csize_t}, - entry_data.size - new_entry_size - sizeof(GCFreeListEntry)) + entry_data.size - new_entry_size - sizeof(GCAllocationRecord)) # Update the free list pointer. unsafe_store!(entry_ptr, new_entry_ptr) @@ -133,7 +139,7 @@ end # `free_list_ptr` is a pointer to the head of the free list. # # This function is not thread-safe. 
-function gc_malloc_from_free_list(free_list_ptr::Ptr{Ptr{GCFreeListEntry}}, bytesize::Csize_t)::Ptr{UInt8} +function gc_malloc_from_free_list(free_list_ptr::Ptr{Ptr{GCAllocationRecord}}, bytesize::Csize_t)::Ptr{UInt8} # To allocate memory, we will walk the free list until we find a suitable candidate. while free_list_ptr != C_NULL free_list_item = unsafe_load(free_list_ptr) @@ -147,7 +153,7 @@ function gc_malloc_from_free_list(free_list_ptr::Ptr{Ptr{GCFreeListEntry}}, byte return result end - free_list_ptr = @get_field_pointer(free_list_item, :next)::Ptr{Ptr{GCFreeListEntry}} + free_list_ptr = @get_field_pointer(free_list_item, :next)::Ptr{Ptr{GCAllocationRecord}} end return C_NULL end @@ -159,7 +165,7 @@ function gc_malloc_local(gc_info::Ptr{GCMemoryInfo}, bytesize::Csize_t)::Ptr{UIn # TODO: reader-lock on the interrupt lock and writer-lock on the GC's # lock. writer_locked(get_interrupt_lock()) do - free_list_ptr = @get_field_pointer(gc_info, :free_list_head)::Ptr{Ptr{GCFreeListEntry}} + free_list_ptr = @get_field_pointer(gc_info, :free_list_head)::Ptr{Ptr{GCAllocationRecord}} return gc_malloc_from_free_list(free_list_ptr, bytesize) end end @@ -201,10 +207,10 @@ function gc_init(buffer::Array{UInt8, 1}) buffer_ptr = pointer(buffer, 1) # Create a single free list entry. 
- first_entry_ptr = Base.unsafe_convert(Ptr{GCFreeListEntry}, buffer_ptr + sizeof(GCMemoryInfo)) + first_entry_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, buffer_ptr + sizeof(GCMemoryInfo)) unsafe_store!( first_entry_ptr, - GCFreeListEntry( + GCAllocationRecord( length(buffer) - sizeof(Csize_t) - sizeof(GCMemoryInfo), C_NULL)) From c0c06e2882e64bbc11c265c982137880d7ed307a Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 11:47:13 +0100 Subject: [PATCH 022/146] Avoid partially overwriting allocation records --- src/gc.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 65f7942b..e6939247 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -91,16 +91,16 @@ function gc_use_free_list_entry(entry_ptr::Ptr{Ptr{GCAllocationRecord}}, entry:: # to create a new entry from any unused memory in the entry. # Compute the address to return. - data_address = Base.unsafe_convert(Ptr{UInt8}, entry) + sizeof(Csize_t) + data_address = Base.unsafe_convert(Ptr{UInt8}, entry) + sizeof(GCAllocationRecord) # Compute the end of the free memory chunk. end_address = data_address + entry_data.size - # Compute the start address of the new free list entry. The `next` - # field of that entry needs to be aligned to a 16-byte boundary, - # but the `size` field doesn't. + # Compute the start address of the new free list entry. The data + # prefixed by the block needs to be aligned to a 16-byte boundary, + # but the block itself doesn't. new_data_address = align_to_boundary(data_address + bytesize) - new_entry_address = new_data_address - sizeof(Csize_t) + new_entry_address = new_data_address - sizeof(GCAllocationRecord) if new_entry_address < data_address + bytesize new_entry_address += gc_align end @@ -119,7 +119,7 @@ function gc_use_free_list_entry(entry_ptr::Ptr{Ptr{GCAllocationRecord}}, entry:: # requirements. 
unsafe_store!( @get_field_pointer(entry, :size)::Ptr{Csize_t}, - entry_data.size - new_entry_size - sizeof(GCAllocationRecord)) + Csize_t(new_entry_address) - Csize_t(data_address)) # Update the free list pointer. unsafe_store!(entry_ptr, new_entry_ptr) @@ -211,7 +211,7 @@ function gc_init(buffer::Array{UInt8, 1}) unsafe_store!( first_entry_ptr, GCAllocationRecord( - length(buffer) - sizeof(Csize_t) - sizeof(GCMemoryInfo), + length(buffer) - sizeof(GCAllocationRecord) - sizeof(GCMemoryInfo), C_NULL)) # Set up the main GC data structure. From 79dc0d434adf14175a92a715ec051fefb04a6f85 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 11:58:20 +0100 Subject: [PATCH 023/146] Refactor GC collection triggering logic --- src/gc.jl | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index e6939247..63e13a40 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -32,7 +32,11 @@ struct GCAllocationRecord # The size of the memory region this allocation record precedes. # This size does not include the allocation record itself. size::Csize_t - # A pointer to the next entry in the free list. + + # A pointer to the next allocation record in the list. If this + # allocation record is part of the free list, then this pointer + # points to the next free list entry; otherwise, it points to the + # next entry in the list of allocated blocks. next::Ptr{GCAllocationRecord} end @@ -170,8 +174,12 @@ function gc_malloc_local(gc_info::Ptr{GCMemoryInfo}, bytesize::Csize_t)::Ptr{UIn end end -# Allocates a blob of memory that is managed by the garbage collector. -# This function is designed to be called by the device. +""" + gc_malloc(bytesize::Csize_t)::Ptr{UInt8} + +Allocates a blob of memory that is managed by the garbage collector. +This function is designed to be called by the device. 
+""" function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} gc_info = unsafe_load(get_gc_info_pointer()) @@ -182,9 +190,7 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} end # We're out of memory. Ask the host to step in. - writer_locked(get_interrupt_lock()) do - interrupt_or_wait() - end + gc_collect() # Try to malloc again. ptr = gc_malloc_local(gc_info, bytesize) @@ -198,6 +204,19 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} return C_NULL end +""" + gc_collect() + +Triggers a garbage collection phase. This function is designed +to be called by the device rather than by the host. +""" +function gc_collect() + writer_locked(get_interrupt_lock()) do + interrupt_or_wait() + threadfence_system() + end +end + # Set the initial size of the chunk of memory allocated to the # GC to 16MiB. const initial_gc_memory_size = 16 * (1 << 20) @@ -221,8 +240,9 @@ function gc_init(buffer::Array{UInt8, 1}) GCMemoryInfo(first_entry_ptr)) end -# Triggers a GC collection. -function gc_collect(info::Ptr{GCMemoryInfo}) +# Collects garbage. This function is designed to be called by +# the host, not by the device. +function gc_collect_impl(info::Ptr{GCMemoryInfo}) println("GC collections are not implemented yet.") end @@ -308,7 +328,7 @@ macro cuda_gc(ex...) end local function handle_interrupt() - gc_collect(Ptr{GCMemoryInfo}(pointer(host_gc_array, 1))) + gc_collect_impl(Ptr{GCMemoryInfo}(pointer(host_gc_array, 1))) end try From 563c3c052827e3f1edd33813c60c8ec232ab1ea7 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 12:08:25 +0100 Subject: [PATCH 024/146] Have the GC maintain a list of allocated blocks --- src/gc.jl | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 63e13a40..77dedbaf 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -57,6 +57,9 @@ end struct GCMemoryInfo # The head of the free list. free_list_head::Ptr{GCAllocationRecord} + + # The head of the allocation list. 
+ allocation_list_head::Ptr{GCAllocationRecord} end # Gets the global GC interrupt lock. @@ -84,7 +87,12 @@ end # Tries to use a free-list entry to allocate a chunk of data of size `bytesize`. # Updates the free list if the allocation succeeds. Returns a null pointer otherwise. -function gc_use_free_list_entry(entry_ptr::Ptr{Ptr{GCAllocationRecord}}, entry::Ptr{GCAllocationRecord}, bytesize::Csize_t)::Ptr{UInt8} +function gc_use_free_list_entry( + entry_ptr::Ptr{Ptr{GCAllocationRecord}}, + allocation_list_ptr::Ptr{Ptr{GCAllocationRecord}}, + entry::Ptr{GCAllocationRecord}, + bytesize::Csize_t,)::Ptr{UInt8} + entry_data = unsafe_load(entry) if entry_data.size < bytesize # The entry is just too small. Return a `null` pointer. @@ -133,6 +141,18 @@ function gc_use_free_list_entry(entry_ptr::Ptr{Ptr{GCAllocationRecord}}, entry:: unsafe_store!(entry_ptr, entry_data.next) end + # At this point, all we need to do is update the allocation record to + # reflect the fact that it now represents an allocated block instead of + # a free block. + + # Set the `next` pointer to the value stored at the allocation list pointer. + unsafe_store!( + @get_field_pointer(entry, :next)::Ptr{Ptr{GCAllocationRecord}}, + unsafe_load(allocation_list_ptr)) + + # Update the allocation list pointer to point to the entry. + unsafe_store!(allocation_list_ptr, entry) + return data_address end @@ -141,9 +161,13 @@ end # memory can be found. # # `free_list_ptr` is a pointer to the head of the free list. +# `allocation_list_ptr` is a pointer to the head of the allocation list. # # This function is not thread-safe. -function gc_malloc_from_free_list(free_list_ptr::Ptr{Ptr{GCAllocationRecord}}, bytesize::Csize_t)::Ptr{UInt8} +function gc_malloc_from_free_list( + free_list_ptr::Ptr{Ptr{GCAllocationRecord}}, + allocation_list_ptr::Ptr{Ptr{GCAllocationRecord}}, + bytesize::Csize_t)::Ptr{UInt8} # To allocate memory, we will walk the free list until we find a suitable candidate. 
while free_list_ptr != C_NULL free_list_item = unsafe_load(free_list_ptr) @@ -152,7 +176,7 @@ function gc_malloc_from_free_list(free_list_ptr::Ptr{Ptr{GCAllocationRecord}}, b break end - result = gc_use_free_list_entry(free_list_ptr, free_list_item, bytesize) + result = gc_use_free_list_entry(free_list_ptr, allocation_list_ptr, free_list_item, bytesize) if result != C_NULL return result end @@ -170,7 +194,8 @@ function gc_malloc_local(gc_info::Ptr{GCMemoryInfo}, bytesize::Csize_t)::Ptr{UIn # lock. writer_locked(get_interrupt_lock()) do free_list_ptr = @get_field_pointer(gc_info, :free_list_head)::Ptr{Ptr{GCAllocationRecord}} - return gc_malloc_from_free_list(free_list_ptr, bytesize) + allocation_list_ptr = @get_field_pointer(gc_info, :allocation_list_head)::Ptr{Ptr{GCAllocationRecord}} + return gc_malloc_from_free_list(free_list_ptr, allocation_list_ptr, bytesize) end end @@ -237,7 +262,7 @@ function gc_init(buffer::Array{UInt8, 1}) gc_info = Base.unsafe_convert(Ptr{GCMemoryInfo}, buffer_ptr) unsafe_store!( gc_info, - GCMemoryInfo(first_entry_ptr)) + GCMemoryInfo(first_entry_ptr, C_NULL)) end # Collects garbage. This function is designed to be called by From 4ffc62febe59eb55e517b44d3a1c33942f262cfd Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 12:35:48 +0100 Subject: [PATCH 025/146] Introduce the notion of a GC master record --- src/gc.jl | 63 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 77dedbaf..8496fd1d 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -52,9 +52,10 @@ macro get_field_pointer(base_pointer, field_name) :(get_field_pointer_impl($(esc(base_pointer)), Val($field_name))) end -# A data structure that contains information relevant -# to the GC's inner workings. -struct GCMemoryInfo +# A data structure that describes a single GC "arena", i.e., +# a section of the heap that is managed by the GC. 
Every arena +# has its own free list and allocation list. +struct GCArenaRecord # The head of the free list. free_list_head::Ptr{GCAllocationRecord} @@ -62,14 +63,22 @@ struct GCMemoryInfo allocation_list_head::Ptr{GCAllocationRecord} end +# A data structure that contains global GC info. This data +# structure is designed to be immutable: it should not be changed +# once the host has set it up. +struct GCMasterRecord + # A pointer to the global GC arena. + global_arena::Ptr{GCArenaRecord} +end + # Gets the global GC interrupt lock. @inline function get_interrupt_lock()::ReaderWriterLock return ReaderWriterLock(@cuda_global_ptr("gc_interrupt_lock", ReaderWriterLockState)) end -# Gets a pointer to the global GC info data structure pointer. -@inline function get_gc_info_pointer()::Ptr{Ptr{GCMemoryInfo}} - return @cuda_global_ptr("gc_info_pointer", Ptr{GCMemoryInfo}) +# Gets a pointer to the GC master record. +@inline function get_gc_master_record()::Ptr{GCMasterRecord} + return @cuda_global_ptr("gc_master_record", GCMasterRecord) end const gc_align = Csize_t(16) @@ -186,15 +195,15 @@ function gc_malloc_from_free_list( return C_NULL end -# Tries to allocate a chunk of memory. +# Tries to allocate a chunk of memory in a particular GC arena. # Returns a null pointer if no sufficiently large chunk of # memory can be found. -function gc_malloc_local(gc_info::Ptr{GCMemoryInfo}, bytesize::Csize_t)::Ptr{UInt8} +function gc_malloc_local(arena::Ptr{GCArenaRecord}, bytesize::Csize_t)::Ptr{UInt8} # TODO: reader-lock on the interrupt lock and writer-lock on the GC's # lock. 
writer_locked(get_interrupt_lock()) do - free_list_ptr = @get_field_pointer(gc_info, :free_list_head)::Ptr{Ptr{GCAllocationRecord}} - allocation_list_ptr = @get_field_pointer(gc_info, :allocation_list_head)::Ptr{Ptr{GCAllocationRecord}} + free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{GCAllocationRecord}} + allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{GCAllocationRecord}} return gc_malloc_from_free_list(free_list_ptr, allocation_list_ptr, bytesize) end end @@ -206,10 +215,10 @@ Allocates a blob of memory that is managed by the garbage collector. This function is designed to be called by the device. """ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} - gc_info = unsafe_load(get_gc_info_pointer()) + master_record = unsafe_load(get_gc_master_record()) # Try to malloc the object without host intervention. - ptr = gc_malloc_local(gc_info, bytesize) + ptr = gc_malloc_local(master_record.global_arena, bytesize) if ptr != C_NULL return ptr end @@ -218,7 +227,7 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} gc_collect() # Try to malloc again. - ptr = gc_malloc_local(gc_info, bytesize) + ptr = gc_malloc_local(master_record.global_arena, bytesize) if ptr != C_NULL return ptr end @@ -246,28 +255,30 @@ end # GC to 16MiB. const initial_gc_memory_size = 16 * (1 << 20) -# Initializes GC memory. -function gc_init(buffer::Array{UInt8, 1}) +# Initializes GC memory and produces a master record. +function gc_init(buffer::Array{UInt8, 1})::GCMasterRecord buffer_ptr = pointer(buffer, 1) # Create a single free list entry. 
- first_entry_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, buffer_ptr + sizeof(GCMemoryInfo)) + first_entry_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, buffer_ptr + sizeof(GCArenaRecord)) unsafe_store!( first_entry_ptr, GCAllocationRecord( - length(buffer) - sizeof(GCAllocationRecord) - sizeof(GCMemoryInfo), + length(buffer) - sizeof(GCAllocationRecord) - sizeof(GCArenaRecord), C_NULL)) # Set up the main GC data structure. - gc_info = Base.unsafe_convert(Ptr{GCMemoryInfo}, buffer_ptr) + global_arena = Base.unsafe_convert(Ptr{GCArenaRecord}, buffer_ptr) unsafe_store!( - gc_info, - GCMemoryInfo(first_entry_ptr, C_NULL)) + global_arena, + GCArenaRecord(first_entry_ptr, C_NULL)) + + return GCMasterRecord(global_arena) end # Collects garbage. This function is designed to be called by # the host, not by the device. -function gc_collect_impl(info::Ptr{GCMemoryInfo}) +function gc_collect_impl(master_record::GCMasterRecord) println("GC collections are not implemented yet.") end @@ -323,7 +334,7 @@ macro cuda_gc(ex...) # Allocate a shared buffer for GC memory. local host_gc_array, device_gc_buffer = alloc_shared_array((initial_gc_memory_size,), UInt8(0)) - gc_init(host_gc_array) + local master_record = gc_init(host_gc_array) # Define a kernel initialization function. local function kernel_init(kernel) @@ -339,10 +350,10 @@ macro cuda_gc(ex...) end end - # Set the GC state pointer. + # Set the GC master record. try - global_handle = CuGlobal{CuPtr{GCMemoryInfo}}(kernel.mod, "gc_info_pointer") - set(global_handle, CuPtr{GCMemoryInfo}(device_gc_buffer.ptr)) + global_handle = CuGlobal{GCMasterRecord}(kernel.mod, "gc_master_record") + set(global_handle, master_record) catch exception # The GC info pointer may not have been declared (because it is unused). # In that case, we should do nothing. @@ -353,7 +364,7 @@ macro cuda_gc(ex...) 
end local function handle_interrupt() - gc_collect_impl(Ptr{GCMemoryInfo}(pointer(host_gc_array, 1))) + gc_collect_impl(master_record) end try From 454a6ef134a71fa15dd7947a7374d2c2c4dcdfa4 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 12:54:29 +0100 Subject: [PATCH 026/146] Reserve GC memory for GC frames --- src/gc.jl | 67 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 20 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 8496fd1d..b549bad8 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -63,12 +63,21 @@ struct GCArenaRecord allocation_list_head::Ptr{GCAllocationRecord} end +# A reference to a Julia object. +const ObjectRef = Ptr{Nothing} + # A data structure that contains global GC info. This data # structure is designed to be immutable: it should not be changed # once the host has set it up. struct GCMasterRecord # A pointer to the global GC arena. global_arena::Ptr{GCArenaRecord} + + # The size of a GC root buffer. + root_buffer_size::Csize_t + + # A pointer to a list of buffers that can be used to store GC roots in. + root_buffers::Ptr{ObjectRef} end # Gets the global GC interrupt lock. @@ -251,29 +260,37 @@ function gc_collect() end end -# Set the initial size of the chunk of memory allocated to the -# GC to 16MiB. -const initial_gc_memory_size = 16 * (1 << 20) +# The initial size of the GC heap, currently 16 MiB. +const initial_gc_heap_size = 16 * (1 << 20) + +# The default size of a root buffer, i.e., the max number of +# roots that can be stored per thread. Currently set to +# 256 roots. That's 2 KiB of roots per thread. +const default_root_buffer_size = 256 # Initializes GC memory and produces a master record. -function gc_init(buffer::Array{UInt8, 1})::GCMasterRecord - buffer_ptr = pointer(buffer, 1) +function gc_init(buffer::Array{UInt8, 1}, thread_count::Integer; root_buffer_size::Integer = default_root_buffer_size)::GCMasterRecord + # Compute the total size of all root buffers. 
+ total_root_buffer_size = sizeof(ObjectRef) * default_root_buffer_size * thread_count + root_buffer_ptr = Base.unsafe_convert(Ptr{ObjectRef}, pointer(buffer, 1)) + + # Compute a pointer to the start of the heap. + heap_start_ptr = pointer(buffer, total_root_buffer_size + 1) + global_arena_size = length(buffer) - total_root_buffer_size - sizeof(GCAllocationRecord) - sizeof(GCArenaRecord) # Create a single free list entry. - first_entry_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, buffer_ptr + sizeof(GCArenaRecord)) + first_entry_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, heap_start_ptr + sizeof(GCArenaRecord)) unsafe_store!( first_entry_ptr, - GCAllocationRecord( - length(buffer) - sizeof(GCAllocationRecord) - sizeof(GCArenaRecord), - C_NULL)) + GCAllocationRecord(global_arena_size, C_NULL)) # Set up the main GC data structure. - global_arena = Base.unsafe_convert(Ptr{GCArenaRecord}, buffer_ptr) + global_arena = Base.unsafe_convert(Ptr{GCArenaRecord}, heap_start_ptr) unsafe_store!( global_arena, GCArenaRecord(first_entry_ptr, C_NULL)) - return GCMasterRecord(global_arena) + return GCMasterRecord(global_arena, root_buffer_size, root_buffer_ptr) end # Collects garbage. This function is designed to be called by @@ -282,6 +299,18 @@ function gc_collect_impl(master_record::GCMasterRecord) println("GC collections are not implemented yet.") end +# Examines a keyword argument list and gets either the value +# assigned to a key or a default value. +function get_kwarg_or_default(kwarg_list, key::Symbol, default) + for kwarg in kwarg_list + arg_key, val = kwarg.args + if arg_key == key + return val + end + end + return default +end + """ @cuda_gc [kwargs...] func(args...) @@ -316,13 +345,10 @@ macro cuda_gc(ex...) vars, var_exprs = CUDAnative.assign_args!(code, args) # Find the stream on which the kernel is to be scheduled. 
- stream = CuDefaultStream() - for kwarg in call_kwargs - key, val = kwarg.args - if key == :stream - stream = val - end - end + stream = get_kwarg_or_default(call_kwargs, :stream, CuDefaultStream()) + + # Get the total number of threads. + thread_count = get_kwarg_or_default(call_kwargs, :threads, 1) # convert the arguments, call the compiler and launch the kernel # while keeping the original arguments alive @@ -333,8 +359,9 @@ macro cuda_gc(ex...) local host_interrupt_array, device_interrupt_buffer = alloc_shared_array((1,), ready) # Allocate a shared buffer for GC memory. - local host_gc_array, device_gc_buffer = alloc_shared_array((initial_gc_memory_size,), UInt8(0)) - local master_record = gc_init(host_gc_array) + local gc_memory_size = initial_gc_heap_size + sizeof(ObjectRef) * default_root_buffer_size * $(esc(thread_count)) + local host_gc_array, device_gc_buffer = alloc_shared_array((gc_memory_size,), UInt8(0)) + local master_record = gc_init(host_gc_array, $(esc(thread_count))) # Define a kernel initialization function. local function kernel_init(kernel) From 24c184fe6bc5e35cd902740739fa5c563b9d76ca Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 13:02:38 +0100 Subject: [PATCH 027/146] Have the GC allocate memory for root buffer sizes --- src/gc.jl | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index b549bad8..013f29e7 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -73,10 +73,16 @@ struct GCMasterRecord # A pointer to the global GC arena. global_arena::Ptr{GCArenaRecord} - # The size of a GC root buffer. - root_buffer_size::Csize_t + # The maximum size of a GC root buffer, i.e., the maximum number + # of roots per thread. + root_buffer_capacity::Csize_t + + # A pointer to a buffer that describes the number of elements + # currently in each root buffer. + root_buffer_sizes::Ptr{Csize_t} # A pointer to a list of buffers that can be used to store GC roots in. 
+ # These root buffers are partitioned into GC frames later on. root_buffers::Ptr{ObjectRef} end @@ -263,20 +269,25 @@ end # The initial size of the GC heap, currently 16 MiB. const initial_gc_heap_size = 16 * (1 << 20) -# The default size of a root buffer, i.e., the max number of +# The default capacity of a root buffer, i.e., the max number of # roots that can be stored per thread. Currently set to # 256 roots. That's 2 KiB of roots per thread. -const default_root_buffer_size = 256 +const default_root_buffer_capacity = 256 # Initializes GC memory and produces a master record. -function gc_init(buffer::Array{UInt8, 1}, thread_count::Integer; root_buffer_size::Integer = default_root_buffer_size)::GCMasterRecord - # Compute the total size of all root buffers. - total_root_buffer_size = sizeof(ObjectRef) * default_root_buffer_size * thread_count - root_buffer_ptr = Base.unsafe_convert(Ptr{ObjectRef}, pointer(buffer, 1)) +function gc_init(buffer::Array{UInt8, 1}, thread_count::Integer; root_buffer_capacity::Integer = default_root_buffer_capacity)::GCMasterRecord + gc_memory_start_ptr = pointer(buffer, 1) + gc_memory_end_ptr = pointer(buffer, length(buffer)) + + # Set up root buffers. + sizebuf_bytesize = sizeof(Csize_t) * thread_count + sizebuf_ptr = gc_memory_start_ptr + rootbuf_bytesize = sizeof(ObjectRef) * default_root_buffer_capacity * thread_count + rootbuf_ptr = Base.unsafe_convert(Ptr{ObjectRef}, sizebuf_ptr + sizebuf_bytesize) # Compute a pointer to the start of the heap. - heap_start_ptr = pointer(buffer, total_root_buffer_size + 1) - global_arena_size = length(buffer) - total_root_buffer_size - sizeof(GCAllocationRecord) - sizeof(GCArenaRecord) + heap_start_ptr = rootbuf_ptr + rootbuf_bytesize + global_arena_size = Csize_t(gc_memory_end_ptr) - Csize_t(heap_start_ptr) - sizeof(GCAllocationRecord) - sizeof(GCArenaRecord) # Create a single free list entry. 
first_entry_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, heap_start_ptr + sizeof(GCArenaRecord)) @@ -290,7 +301,7 @@ function gc_init(buffer::Array{UInt8, 1}, thread_count::Integer; root_buffer_siz global_arena, GCArenaRecord(first_entry_ptr, C_NULL)) - return GCMasterRecord(global_arena, root_buffer_size, root_buffer_ptr) + return GCMasterRecord(global_arena, root_buffer_capacity, sizebuf_ptr, rootbuf_ptr) end # Collects garbage. This function is designed to be called by @@ -359,7 +370,7 @@ macro cuda_gc(ex...) local host_interrupt_array, device_interrupt_buffer = alloc_shared_array((1,), ready) # Allocate a shared buffer for GC memory. - local gc_memory_size = initial_gc_heap_size + sizeof(ObjectRef) * default_root_buffer_size * $(esc(thread_count)) + local gc_memory_size = initial_gc_heap_size + sizeof(ObjectRef) * default_root_buffer_capacity * $(esc(thread_count)) local host_gc_array, device_gc_buffer = alloc_shared_array((gc_memory_size,), UInt8(0)) local master_record = gc_init(host_gc_array, $(esc(thread_count))) From 33e54b796914411fa1bfe772404bb6ac03287b0f Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 13:17:30 +0100 Subject: [PATCH 028/146] Use 32-bit integers to describe GC root buffer sizes --- src/gc.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 013f29e7..077fd5c2 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -24,7 +24,7 @@ # * When the device runs out of GC memory, it requests an interrupt # to mark and sweep. -export @cuda_gc, gc_malloc +export @cuda_gc, gc_malloc, gc_collect # A data structure that precedes every chunk of memory that has been # allocated or put into the free list. @@ -75,11 +75,11 @@ struct GCMasterRecord # The maximum size of a GC root buffer, i.e., the maximum number # of roots per thread. - root_buffer_capacity::Csize_t + root_buffer_capacity::UInt32 # A pointer to a buffer that describes the number of elements # currently in each root buffer. 
- root_buffer_sizes::Ptr{Csize_t} + root_buffer_sizes::Ptr{UInt32} # A pointer to a list of buffers that can be used to store GC roots in. # These root buffers are partitioned into GC frames later on. @@ -280,7 +280,7 @@ function gc_init(buffer::Array{UInt8, 1}, thread_count::Integer; root_buffer_cap gc_memory_end_ptr = pointer(buffer, length(buffer)) # Set up root buffers. - sizebuf_bytesize = sizeof(Csize_t) * thread_count + sizebuf_bytesize = sizeof(Int32) * thread_count sizebuf_ptr = gc_memory_start_ptr rootbuf_bytesize = sizeof(ObjectRef) * default_root_buffer_capacity * thread_count rootbuf_ptr = Base.unsafe_convert(Ptr{ObjectRef}, sizebuf_ptr + sizebuf_bytesize) From 71aa78f0d55e90c59ab0a3cd440f747f2a9574bc Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 13:41:17 +0100 Subject: [PATCH 029/146] Define GC frame management functions --- src/gc.jl | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 89 insertions(+), 4 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 077fd5c2..0a1fcf78 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -91,9 +91,94 @@ end return ReaderWriterLock(@cuda_global_ptr("gc_interrupt_lock", ReaderWriterLockState)) end -# Gets a pointer to the GC master record. -@inline function get_gc_master_record()::Ptr{GCMasterRecord} - return @cuda_global_ptr("gc_master_record", GCMasterRecord) +# Runs a function in such a way that no collection phases will +# run as long as the function is executing. Use with care: this +# function acquires the GC interrupt lock in reader mode, so careless +# use may cause deadlocks. +@inline function nocollect(func::Function) + return reader_locked(func, get_interrupt_lock()) +end + +# Gets the GC master record. +@inline function get_gc_master_record()::GCMasterRecord + return unsafe_load(@cuda_global_ptr("gc_master_record", GCMasterRecord)) +end + +# Gets the thread ID of the current thread. 
+@inline function get_thread_id() + return threadIdx().x +end + +# Gets a pointer to the first element in the root buffer for this thread. +@inline function get_root_buffer_start()::Ptr{ObjectRef} + master_record = get_gc_master_record() + offset = master_record.root_buffer_capacity * get_thread_id() + return master_record.root_buffers + offset * sizeof(ObjectRef) +end + +""" + new_gc_frame(size::UInt32)::Ptr{ObjectRef} + +Allocates a new GC frame. +""" +function new_gc_frame(size::UInt32)::Ptr{ObjectRef} + nocollect() do + master_record = get_gc_master_record() + + # Get the current size of the root buffer. + current_size = unsafe_load( + master_record.root_buffer_sizes, + get_thread_id()) + + # The size of a root buffer should never exceed its capacity. + @cuassert(current_size + size <= master_record.root_buffer_capacity) + + return get_root_buffer_start() + current_size * sizeof(ObjectRef) + end +end + +""" + push_gc_frame(size::UInt32) + +Registers a GC frame with the garbage collector. +""" +function push_gc_frame(size::UInt32) + nocollect() do + master_record = get_gc_master_record() + + # Get the current size of the root buffer. + current_size = unsafe_load( + master_record.root_buffer_sizes, + get_thread_id()) + + # Add the new size to the current root buffer size. + unsafe_store!( + master_record.root_buffer_sizes, + current_size + size, + get_thread_id()) + end +end + +""" + pop_gc_frame(size::UInt32) + +Deregisters a GC frame. +""" +function pop_gc_frame(size::UInt32) + nocollect() do + master_record = get_gc_master_record() + + # Get the current size of the root buffer. + current_size = unsafe_load( + master_record.root_buffer_sizes, + get_thread_id()) + + # Subtract the size from the current root buffer size. + unsafe_store!( + master_record.root_buffer_sizes, + current_size - size, + get_thread_id()) + end end const gc_align = Csize_t(16) @@ -230,7 +315,7 @@ Allocates a blob of memory that is managed by the garbage collector. 
This function is designed to be called by the device. """ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} - master_record = unsafe_load(get_gc_master_record()) + master_record = get_gc_master_record() # Try to malloc the object without host intervention. ptr = gc_malloc_local(master_record.global_arena, bytesize) From da046af042903d6d9151086b83ce6a58391f9905 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 17:42:59 +0100 Subject: [PATCH 030/146] Make globals created by 'get_global_pointer' 'linkonce_odr' --- src/interrupts.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/interrupts.jl b/src/interrupts.jl index 333545bf..d70c2773 100644 --- a/src/interrupts.jl +++ b/src/interrupts.jl @@ -63,15 +63,16 @@ end mod = LLVM.parent(llvm_f) # Figure out if the global has been defined already. - globalSet = LLVM.globals(mod) + global_set = LLVM.globals(mod) global_name_string = String(global_name) - if haskey(globalSet, global_name_string) - global_var = globalSet[global_name_string] + if haskey(global_set, global_name_string) + global_var = global_set[global_name_string] else # If the global hasn't been defined already, then we'll define # it in the global address space, i.e., address space one. global_var = GlobalVariable(mod, T_global, global_name_string, 1) - LLVM.initializer!(global_var, LLVM.null(T_global)) + linkage!(global_var, LLVM.API.LLVMLinkOnceODRLinkage) + initializer!(global_var, LLVM.null(T_global)) end # Generate IR that computes the global's address. 
From 159acd384e46d561634de5c6fb0e087c200ce0ed Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 17:43:19 +0100 Subject: [PATCH 031/146] Protect newly allocated objects from collection --- src/gc.jl | 72 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 25 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 0a1fcf78..9c425b09 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -56,6 +56,9 @@ end # a section of the heap that is managed by the GC. Every arena # has its own free list and allocation list. struct GCArenaRecord + # The allocation lock for the arena. + lock_state::ReaderWriterLockState + # The head of the free list. free_list_head::Ptr{GCAllocationRecord} @@ -93,10 +96,16 @@ end # Runs a function in such a way that no collection phases will # run as long as the function is executing. Use with care: this -# function acquires the GC interrupt lock in reader mode, so careless +# macro acquires the GC interrupt lock in reader mode, so careless # use may cause deadlocks. -@inline function nocollect(func::Function) - return reader_locked(func, get_interrupt_lock()) +macro nocollect(func) + quote + local @inline function lock_callback() + $(esc(func)) + end + + reader_locked(lock_callback, get_interrupt_lock()) + end end # Gets the GC master record. @@ -116,25 +125,25 @@ end return master_record.root_buffers + offset * sizeof(ObjectRef) end +# Same as 'new_gc_frame_impl', but does not disable collections. +function new_gc_frame_impl(size::UInt32)::Ptr{ObjectRef} + master_record = get_gc_master_record() + + # Get the current size of the root buffer. + current_size = unsafe_load( + master_record.root_buffer_sizes, + get_thread_id()) + + return get_root_buffer_start() + current_size * sizeof(ObjectRef) +end + """ new_gc_frame(size::UInt32)::Ptr{ObjectRef} Allocates a new GC frame. 
""" function new_gc_frame(size::UInt32)::Ptr{ObjectRef} - nocollect() do - master_record = get_gc_master_record() - - # Get the current size of the root buffer. - current_size = unsafe_load( - master_record.root_buffer_sizes, - get_thread_id()) - - # The size of a root buffer should never exceed its capacity. - @cuassert(current_size + size <= master_record.root_buffer_capacity) - - return get_root_buffer_start() + current_size * sizeof(ObjectRef) - end + @nocollect new_gc_frame_impl(size) end """ @@ -143,7 +152,7 @@ end Registers a GC frame with the garbage collector. """ function push_gc_frame(size::UInt32) - nocollect() do + @nocollect begin master_record = get_gc_master_record() # Get the current size of the root buffer. @@ -165,7 +174,7 @@ end Deregisters a GC frame. """ function pop_gc_frame(size::UInt32) - nocollect() do + @nocollect begin master_record = get_gc_master_record() # Get the current size of the root buffer. @@ -299,12 +308,25 @@ end # Returns a null pointer if no sufficiently large chunk of # memory can be found. function gc_malloc_local(arena::Ptr{GCArenaRecord}, bytesize::Csize_t)::Ptr{UInt8} - # TODO: reader-lock on the interrupt lock and writer-lock on the GC's - # lock. - writer_locked(get_interrupt_lock()) do - free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{GCAllocationRecord}} - allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{GCAllocationRecord}} - return gc_malloc_from_free_list(free_list_ptr, allocation_list_ptr, bytesize) + # Disable collections and acquire the arena's lock. + @nocollect begin + arena_lock = ReaderWriterLock(@get_field_pointer(arena, :lock_state)) + result_ptr = writer_locked(arena_lock) do + # Allocate a suitable region of memory. 
+ free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{GCAllocationRecord}} + allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{GCAllocationRecord}} + gc_malloc_from_free_list(free_list_ptr, allocation_list_ptr, bytesize) + end + + # If the resulting pointer is non-null, then we'll write it to a temporary GC frame. + # Our reasoning for doing this is that doing so ensures that the allocated memory + # won't get collected by the GC before the caller has a chance to add it to its + # own GC frame. + if result_ptr != Base.unsafe_convert(Ptr{UInt8}, C_NULL) + gc_frame = new_gc_frame_impl(UInt32(1)) + unsafe_store!(gc_frame, Base.unsafe_convert(ObjectRef, result_ptr)) + end + return result_ptr end end @@ -384,7 +406,7 @@ function gc_init(buffer::Array{UInt8, 1}, thread_count::Integer; root_buffer_cap global_arena = Base.unsafe_convert(Ptr{GCArenaRecord}, heap_start_ptr) unsafe_store!( global_arena, - GCArenaRecord(first_entry_ptr, C_NULL)) + GCArenaRecord(0, first_entry_ptr, C_NULL)) return GCMasterRecord(global_arena, root_buffer_capacity, sizebuf_ptr, rootbuf_ptr) end From 2b772287fa1785f5e12648bfe5a3a31269283541 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 17:54:17 +0100 Subject: [PATCH 032/146] Introduce a separate GPU GC lowering pass --- src/compiler/common.jl | 10 +++- src/compiler/optim.jl | 123 +++++++++++++++++++++++++++++++++++++++-- src/gc.jl | 2 +- 3 files changed, 128 insertions(+), 7 deletions(-) diff --git a/src/compiler/common.jl b/src/compiler/common.jl index b9160a5f..33232b82 100644 --- a/src/compiler/common.jl +++ b/src/compiler/common.jl @@ -12,11 +12,17 @@ struct CompilerJob maxthreads::Union{Nothing,CuDim} blocks_per_sm::Union{Nothing,Integer} maxregs::Union{Nothing,Integer} + # Indicates whether the GPU GC or the "malloc never free" + # GC intrinsic lowering strategy is to be used. The former + # is used when this field is `true`; the latter when it is + # `false`. 
+ gc::Bool CompilerJob(f, tt, cap, kernel; minthreads=nothing, maxthreads=nothing, - blocks_per_sm=nothing, maxregs=nothing) = - new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs) + blocks_per_sm=nothing, maxregs=nothing, + gc=false) = + new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs, gc) end # global job reference diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 55b16ca6..70903a86 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -67,6 +67,14 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) # PTX-specific optimizations ModulePassManager() do pm initialize!(pm) + # lower intrinsics + if ctx.gc + add!(pm, ModulePass("FinalLowerGPUGC", lower_final_gc_intrinsics_gpugc!)) + else + add!(pm, ModulePass("FinalLowerNoGC", lower_final_gc_intrinsics_nogc!)) + end + aggressive_dce!(pm) # remove dead uses of ptls + add!(pm, ModulePass("LowerPTLS", lower_ptls!)) # NVPTX's target machine info enables runtime unrolling, # but Julia's pass sequence only invokes the simple unroller. @@ -379,10 +387,117 @@ function eager_lower_gc_frame!(fun::LLVM.Function) return changed end -# Lowers the GC intrinsics produced by the LateLowerGCFrame pass. These -# intrinsics are the last point at which we can intervene in the pipeline -# before the passes that deal with them become CPU-specific. -function lower_final_gc_intrinsics!(mod::LLVM.Module) +# Lowers the GC intrinsics produced by the LateLowerGCFrame pass to +# use the "malloc, never free" strategy. These intrinsics are the +# last point at which we can intervene in the pipeline before the +# passes that deal with them become CPU-specific. +function lower_final_gc_intrinsics_nogc!(mod::LLVM.Module) + changed = false + + # We'll start off with 'julia.gc_alloc_bytes'. This intrinsic allocates + # store for an object, including headroom, but does not set the object's + # tag. 
+ visit_calls_to("julia.gc_alloc_bytes", mod) do call, gc_alloc_bytes + gc_alloc_bytes_ft = eltype(llvmtype(gc_alloc_bytes))::LLVM.FunctionType + T_ret = return_type(gc_alloc_bytes_ft)::LLVM.PointerType + T_bitcast = LLVM.PointerType(T_ret, LLVM.addrspace(T_ret)) + + # Decode the call. + ops = collect(operands(call)) + size = ops[2] + + # We need to reserve a single pointer of headroom for the tag. + # (LateLowerGCFrame depends on us doing that.) + headroom = Runtime.tag_size + + # Call the allocation function and bump the resulting pointer + # so the headroom sits just in front of the returned pointer. + let builder = Builder(JuliaContext()) + position!(builder, call) + total_size = add!(builder, size, ConstantInt(Int32(headroom), JuliaContext())) + ptr = call!(builder, Runtime.get(:gc_pool_alloc), [total_size]) + cast_ptr = bitcast!(builder, ptr, T_bitcast) + bumped_ptr = gep!(builder, cast_ptr, [ConstantInt(Int32(1), JuliaContext())]) + replace_uses!(call, bumped_ptr) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + + changed = true + end + + # Next up: 'julia.new_gc_frame'. This intrinsic allocates a new GC frame. + # We'll lower it as an alloca and hope SSA construction and DCE passes + # get rid of the alloca. This is a reasonable thing to hope for because + # all intrinsics that may cause the GC frame to escape will be replaced by + # nops. + visit_calls_to("julia.new_gc_frame", mod) do call, new_gc_frame + new_gc_frame_ft = eltype(llvmtype(new_gc_frame))::LLVM.FunctionType + T_ret = return_type(new_gc_frame_ft)::LLVM.PointerType + T_alloca = eltype(T_ret) + + # Decode the call. + ops = collect(operands(call)) + size = ops[1] + + # Call the allocation function and bump the resulting pointer + # so the headroom sits just in front of the returned pointer. 
+ let builder = Builder(JuliaContext()) + position!(builder, call) + ptr = array_alloca!(builder, T_alloca, size) + replace_uses!(call, ptr) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + + changed = true + end + + # The 'julia.get_gc_frame_slot' is closely related to the previous + # intrinisc. Specifically, 'julia.get_gc_frame_slot' gets the address of + # a slot in the GC frame. We can simply turn this intrinsic into a GEP. + visit_calls_to("julia.get_gc_frame_slot", mod) do call, _ + # Decode the call. + ops = collect(operands(call)) + frame = ops[1] + offset = ops[2] + + # Call the allocation function and bump the resulting pointer + # so the headroom sits just in front of the returned pointer. + let builder = Builder(JuliaContext()) + position!(builder, call) + ptr = gep!(builder, frame, [offset]) + replace_uses!(call, ptr) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + + changed = true + end + + # The 'julia.push_gc_frame' registers a GC frame with the GC. We + # don't have a GC, so we can just delete calls to this intrinsic! + changed |= delete_calls_to!("julia.push_gc_frame", mod) + + # The 'julia.pop_gc_frame' unregisters a GC frame with the GC, so + # we can just delete calls to this intrinsic, too. + changed |= delete_calls_to!("julia.pop_gc_frame", mod) + + # Ditto for 'julia.queue_gc_root'. + changed |= delete_calls_to!("julia.queue_gc_root", mod) + + return changed +end + +""" +lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) + +An LLVM pass that lowers the GC intrinsics produced by the +LateLowerGCFrame pass to use the GPU GC. These intrinsics are the +last point at which we can intervene in the pipeline before the +passes that deal with them become CPU-specific. +""" +function lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) changed = false # We'll start off with 'julia.gc_alloc_bytes'. 
This intrinsic allocates diff --git a/src/gc.jl b/src/gc.jl index 9c425b09..32de5311 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -516,7 +516,7 @@ macro cuda_gc(ex...) # Standard kernel setup logic. local kernel_args = CUDAnative.cudaconvert.(($(var_exprs...),)) local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} - local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) + local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; gc = true, $(map(esc, compiler_kwargs)...)) CUDAnative.prepare_kernel(kernel; init=kernel_init, $(map(esc, env_kwargs)...)) kernel(kernel_args...; $(map(esc, call_kwargs)...)) From 9a3da04a4cd6347510b9b15bf2a459c304e28f5d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 18:28:44 +0100 Subject: [PATCH 033/146] Use 'gc_malloc' instead of regular 'malloc' when in GC mode --- src/compiler/optim.jl | 8 +++++--- src/device/runtime.jl | 11 +++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 70903a86..ef42d155 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -418,7 +418,8 @@ function lower_final_gc_intrinsics_nogc!(mod::LLVM.Module) ptr = call!(builder, Runtime.get(:gc_pool_alloc), [total_size]) cast_ptr = bitcast!(builder, ptr, T_bitcast) bumped_ptr = gep!(builder, cast_ptr, [ConstantInt(Int32(1), JuliaContext())]) - replace_uses!(call, bumped_ptr) + result_ptr = bitcast!(builder, bumped_ptr, T_ret) + replace_uses!(call, result_ptr) unsafe_delete!(LLVM.parent(call), call) dispose(builder) end @@ -521,10 +522,11 @@ function lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) let builder = Builder(JuliaContext()) position!(builder, call) total_size = add!(builder, size, ConstantInt(Int32(headroom), JuliaContext())) - ptr = call!(builder, Runtime.get(:gc_pool_alloc), [total_size]) + ptr = call!(builder, Runtime.get(:gc_malloc_object), [total_size]) cast_ptr = bitcast!(builder, ptr, T_bitcast) bumped_ptr = 
gep!(builder, cast_ptr, [ConstantInt(Int32(1), JuliaContext())]) - replace_uses!(call, bumped_ptr) + result_ptr = bitcast!(builder, bumped_ptr, T_ret) + replace_uses!(call, result_ptr) unsafe_delete!(LLVM.parent(call), call) dispose(builder) end diff --git a/src/device/runtime.jl b/src/device/runtime.jl index 1bf9fa5e..a331c9ee 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -225,5 +225,16 @@ for (T, t) in [Int8 => :int8, Int16 => :int16, Int32 => :int32, Int64 => end end +""" + gc_malloc_object(bytesize::Csize_t) + +Allocates an object that is managed by the garbage collector. +This function is designed to be called by the device. +""" +function gc_malloc_object(bytesize::Csize_t) + return unsafe_pointer_to_objref(gc_malloc(bytesize)) +end + +compile(gc_malloc_object, Any, (Csize_t,), T_prjlvalue) end From 5bd8da4d10496b9057f0cfdeb0973aaa85a63b3f Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 18:43:02 +0100 Subject: [PATCH 034/146] Use pointers instead of integers to keep track of GC frames --- src/gc.jl | 71 ++++++++++++++++++++++--------------------------------- 1 file changed, 28 insertions(+), 43 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 32de5311..660614c3 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -80,9 +80,9 @@ struct GCMasterRecord # of roots per thread. root_buffer_capacity::UInt32 - # A pointer to a buffer that describes the number of elements - # currently in each root buffer. - root_buffer_sizes::Ptr{UInt32} + # A pointer to a list of root buffer pointers that point to the + # end of the root buffer for every thread. + root_buffer_fingers::Ptr{Ptr{ObjectRef}} # A pointer to a list of buffers that can be used to store GC roots in. # These root buffers are partitioned into GC frames later on. @@ -118,23 +118,13 @@ end return threadIdx().x end -# Gets a pointer to the first element in the root buffer for this thread. 
-@inline function get_root_buffer_start()::Ptr{ObjectRef} - master_record = get_gc_master_record() - offset = master_record.root_buffer_capacity * get_thread_id() - return master_record.root_buffers + offset * sizeof(ObjectRef) -end +const GCFrame = Ptr{ObjectRef} # Same as 'new_gc_frame_impl', but does not disable collections. -function new_gc_frame_impl(size::UInt32)::Ptr{ObjectRef} +function new_gc_frame_impl(size::UInt32)::GCFrame master_record = get_gc_master_record() - - # Get the current size of the root buffer. - current_size = unsafe_load( - master_record.root_buffer_sizes, - get_thread_id()) - - return get_root_buffer_start() + current_size * sizeof(ObjectRef) + # Return the root buffer tip: that's where the new GC frame starts. + return unsafe_load(master_record.root_buffer_fingers, get_thread_id()) end """ @@ -142,50 +132,40 @@ end Allocates a new GC frame. """ -function new_gc_frame(size::UInt32)::Ptr{ObjectRef} +function new_gc_frame(size::UInt32)::GCFrame @nocollect new_gc_frame_impl(size) end """ - push_gc_frame(size::UInt32) + push_gc_frame(gc_frame::GCFrame, size::UInt32) Registers a GC frame with the garbage collector. """ -function push_gc_frame(size::UInt32) +function push_gc_frame(gc_frame::GCFrame, size::UInt32) @nocollect begin master_record = get_gc_master_record() - # Get the current size of the root buffer. - current_size = unsafe_load( - master_record.root_buffer_sizes, - get_thread_id()) - - # Add the new size to the current root buffer size. + # Update the root buffer tip. unsafe_store!( - master_record.root_buffer_sizes, - current_size + size, + master_record.root_buffer_fingers, + gc_frame + size * sizeof(ObjectRef), get_thread_id()) end end """ - pop_gc_frame(size::UInt32) + pop_gc_frame(gc_frame::GCFrame) Deregisters a GC frame. """ -function pop_gc_frame(size::UInt32) +function pop_gc_frame(gc_frame::GCFrame) @nocollect begin master_record = get_gc_master_record() - # Get the current size of the root buffer. 
- current_size = unsafe_load( - master_record.root_buffer_sizes, - get_thread_id()) - - # Subtract the size from the current root buffer size. + # Update the root buffer tip. unsafe_store!( - master_record.root_buffer_sizes, - current_size - size, + master_record.root_buffer_fingers, + gc_frame, get_thread_id()) end end @@ -387,10 +367,15 @@ function gc_init(buffer::Array{UInt8, 1}, thread_count::Integer; root_buffer_cap gc_memory_end_ptr = pointer(buffer, length(buffer)) # Set up root buffers. - sizebuf_bytesize = sizeof(Int32) * thread_count - sizebuf_ptr = gc_memory_start_ptr - rootbuf_bytesize = sizeof(ObjectRef) * default_root_buffer_capacity * thread_count - rootbuf_ptr = Base.unsafe_convert(Ptr{ObjectRef}, sizebuf_ptr + sizebuf_bytesize) + fingerbuf_bytesize = sizeof(Ptr{ObjectRef}) * thread_count + fingerbuf_ptr = Base.unsafe_convert(Ptr{Ptr{ObjectRef}}, gc_memory_start_ptr) + rootbuf_bytesize = sizeof(ObjectRef) * root_buffer_capacity * thread_count + rootbuf_ptr = Base.unsafe_convert(Ptr{ObjectRef}, fingerbuf_ptr + fingerbuf_bytesize) + + # Populate the root buffer fingers. + for i in 1:thread_count + unsafe_store!(fingerbuf_ptr, rootbuf_ptr + i * sizeof(ObjectRef) * root_buffer_capacity, i) + end # Compute a pointer to the start of the heap. heap_start_ptr = rootbuf_ptr + rootbuf_bytesize @@ -408,7 +393,7 @@ function gc_init(buffer::Array{UInt8, 1}, thread_count::Integer; root_buffer_cap global_arena, GCArenaRecord(0, first_entry_ptr, C_NULL)) - return GCMasterRecord(global_arena, root_buffer_capacity, sizebuf_ptr, rootbuf_ptr) + return GCMasterRecord(global_arena, root_buffer_capacity, fingerbuf_ptr, rootbuf_ptr) end # Collects garbage. 
This function is designed to be called by From f560be645f6a1ec5bb681c2c3a2c3f3443289ca8 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 20:59:41 +0100 Subject: [PATCH 035/146] Lower GC frame management intrinsics to GPU GC calls --- src/CUDAnative.jl | 11 ++++--- src/compiler/driver.jl | 6 ++-- src/compiler/optim.jl | 71 ++++++++++++++++++------------------------ src/compiler/rtlib.jl | 6 +++- src/device/runtime.jl | 69 ++++++++++++++++++++++++++++++++++++++++ src/gc.jl | 50 +++-------------------------- src/interrupts.jl | 2 +- 7 files changed, 119 insertions(+), 96 deletions(-) diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl index 38d6dd3c..85ea5ef9 100644 --- a/src/CUDAnative.jl +++ b/src/CUDAnative.jl @@ -33,15 +33,18 @@ include(joinpath("device", "llvm.jl")) include(joinpath("device", "runtime.jl")) include(joinpath("device", "libdevice.jl")) include(joinpath("device", "cuda_intrinsics.jl")) -include(joinpath("device", "runtime_intrinsics.jl")) include(joinpath("device", "threading.jl")) -include("compiler.jl") -include("execution.jl") +# The interrupts and GC files need to be loaded _before_ the +# runtime intrinsics file, because some runtime intrinsics +# depend on the GC and the GC depends on interrupts. 
include("interrupts.jl") include("gc.jl") -include("reflection.jl") +include(joinpath("device", "runtime_intrinsics.jl")) +include("compiler.jl") +include("execution.jl") +include("reflection.jl") include("deprecated.jl") include("init.jl") diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index d18bb32f..70ea33ba 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -39,7 +39,7 @@ function compile(target::Symbol, job::CompilerJob; end function codegen(target::Symbol, job::CompilerJob; libraries::Bool=true, - optimize::Bool=true, strip::Bool=false) + optimize::Bool=true, strip::Bool=false, internalize::Bool=true) ## Julia IR @timeit to[] "Julia front-end" begin @@ -86,7 +86,7 @@ function codegen(target::Symbol, job::CompilerJob; libraries::Bool=true, end if optimize - kernel = @timeit to[] "optimization" optimize!(job, ir, kernel) + kernel = @timeit to[] "optimization" optimize!(job, ir, kernel; internalize=internalize) end if libraries @@ -138,7 +138,7 @@ function codegen(target::Symbol, job::CompilerJob; libraries::Bool=true, for dyn_job in keys(worklist) # cached compilation dyn_kernel_fn = get!(kernels, dyn_job) do - dyn_ir, dyn_kernel = codegen(:llvm, dyn_job; optimize=optimize, strip=strip) + dyn_ir, dyn_kernel = codegen(:llvm, dyn_job; optimize=optimize, strip=strip, internalize=internalize) dyn_kernel_fn = LLVM.name(dyn_kernel) dyn_kernel_ft = eltype(llvmtype(dyn_kernel)) link!(ir, dyn_ir) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index ef42d155..4067f518 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -1,6 +1,6 @@ # LLVM IR optimization -function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) +function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; internalize::Bool=true) tm = machine(job.cap, triple(mod)) if job.kernel @@ -10,7 +10,9 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) function initialize!(pm) 
add_library_info!(pm, triple(mod)) add_transform_info!(pm, tm) - internalize!(pm, [LLVM.name(entry)]) + if internalize + internalize!(pm, [LLVM.name(entry)]) + end end global current_job @@ -327,7 +329,6 @@ function delete_calls_to!(name::AbstractString, mod::LLVM.Module)::Bool return changed end - # lower object allocations to to PTX malloc # # this is a PoC implementation that is very simple: allocate, and never free. it also runs @@ -383,7 +384,24 @@ function eager_lower_gc_frame!(fun::LLVM.Function) @compiler_assert isempty(uses(barrier)) job end +end +# Visits all calls to a particular intrinsic in a given LLVM module +# and redirects those calls to a different function. +# Returns a Boolean that tells if any calls were actually redirected. +function redirect_calls_to!(from::AbstractString, to, mod::LLVM.Module)::Bool + changed = false + visit_calls_to(from, mod) do call, _ + args = collect(operands(call))[1:end-1] + let builder = Builder(JuliaContext()) + position!(builder, call) + new_call = call!(builder, to, args) + replace_uses!(call, new_call) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + changed = true + end return changed end @@ -441,8 +459,6 @@ function lower_final_gc_intrinsics_nogc!(mod::LLVM.Module) ops = collect(operands(call)) size = ops[1] - # Call the allocation function and bump the resulting pointer - # so the headroom sits just in front of the returned pointer. let builder = Builder(JuliaContext()) position!(builder, call) ptr = array_alloca!(builder, T_alloca, size) @@ -463,8 +479,6 @@ function lower_final_gc_intrinsics_nogc!(mod::LLVM.Module) frame = ops[1] offset = ops[2] - # Call the allocation function and bump the resulting pointer - # so the headroom sits just in front of the returned pointer. let builder = Builder(JuliaContext()) position!(builder, call) ptr = gep!(builder, frame, [offset]) @@ -535,31 +549,8 @@ function lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) end # Next up: 'julia.new_gc_frame'. 
This intrinsic allocates a new GC frame. - # We'll lower it as an alloca and hope SSA construction and DCE passes - # get rid of the alloca. This is a reasonable thing to hope for because - # all intrinsics that may cause the GC frame to escape will be replaced by - # nops. - visit_calls_to("julia.new_gc_frame", mod) do call, new_gc_frame - new_gc_frame_ft = eltype(llvmtype(new_gc_frame))::LLVM.FunctionType - T_ret = return_type(new_gc_frame_ft)::LLVM.PointerType - T_alloca = eltype(T_ret) - - # Decode the call. - ops = collect(operands(call)) - size = ops[1] - - # Call the allocation function and bump the resulting pointer - # so the headroom sits just in front of the returned pointer. - let builder = Builder(JuliaContext()) - position!(builder, call) - ptr = array_alloca!(builder, T_alloca, size) - replace_uses!(call, ptr) - unsafe_delete!(LLVM.parent(call), call) - dispose(builder) - end - - changed = true - end + # We actually have a call that implements this intrinsic. Let's use that. + changed |= redirect_calls_to!("julia.new_gc_frame", Runtime.get(:new_gc_frame), mod) # The 'julia.get_gc_frame_slot' is closely related to the previous # intrinisc. Specifically, 'julia.get_gc_frame_slot' gets the address of @@ -570,8 +561,6 @@ function lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) frame = ops[1] offset = ops[2] - # Call the allocation function and bump the resulting pointer - # so the headroom sits just in front of the returned pointer. let builder = Builder(JuliaContext()) position!(builder, call) ptr = gep!(builder, frame, [offset]) @@ -583,15 +572,15 @@ function lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) changed = true end - # The 'julia.push_gc_frame' registers a GC frame with the GC. We - # don't have a GC, so we can just delete calls to this intrinsic! - changed |= delete_calls_to!("julia.push_gc_frame", mod) + # The 'julia.push_gc_frame' registers a GC frame with the GC. We will + # call a function that does just this. 
+ changed |= redirect_calls_to!("julia.push_gc_frame", Runtime.get(:push_gc_frame), mod) - # The 'julia.pop_gc_frame' unregisters a GC frame with the GC, so - # we can just delete calls to this intrinsic, too. - changed |= delete_calls_to!("julia.pop_gc_frame", mod) + # The 'julia.pop_gc_frame' unregisters a GC frame with the GC. We again + # have a function in the runtime library. + changed |= redirect_calls_to!("julia.pop_gc_frame", Runtime.get(:pop_gc_frame), mod) - # Ditto for 'julia.queue_gc_root'. + # Delete calls to 'julia.queue_gc_root'. changed |= delete_calls_to!("julia.queue_gc_root", mod) return changed diff --git a/src/compiler/rtlib.jl b/src/compiler/rtlib.jl index 385ac218..ad82f984 100644 --- a/src/compiler/rtlib.jl +++ b/src/compiler/rtlib.jl @@ -124,8 +124,12 @@ end function emit_function!(mod, cap, f, types, name) tt = Base.to_tuple_type(types) + # Optimize the module that defines the function, but don't + # internalize symbols in that function yet: internalizing + # globals may de-alias references to globals in the runtime + # library from equivalent references in the kernel. 
new_mod, entry = codegen(:llvm, CompilerJob(f, tt, cap, #=kernel=# false); - libraries=false) + libraries=false, internalize=false) LLVM.name!(entry, name) link!(mod, new_mod) end diff --git a/src/device/runtime.jl b/src/device/runtime.jl index a331c9ee..a8ff03a6 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -13,6 +13,7 @@ using ..CUDAnative using LLVM using LLVM.Interop +import ..CUDAnative: @nocollect, ObjectRef, GCFrame, get_gc_master_record, get_thread_id, new_gc_frame_impl ## representation of a runtime method instance @@ -225,6 +226,13 @@ for (T, t) in [Int8 => :int8, Int16 => :int16, Int32 => :int32, Int64 => end end +# LLVM type of a pointer to a tracked pointer +function T_pprjlvalue() + T_pjlvalue = convert(LLVMType, Any, true) + LLVM.PointerType( + LLVM.PointerType(eltype(T_pjlvalue), Tracked)) +end + """ gc_malloc_object(bytesize::Csize_t) @@ -237,4 +245,65 @@ end compile(gc_malloc_object, Any, (Csize_t,), T_prjlvalue) +""" + new_gc_frame(size::UInt32)::GCFrame + +Allocates a new GC frame. +""" +function new_gc_frame(size::UInt32)::GCFrame + @nocollect new_gc_frame_impl(size) +end + +compile(new_gc_frame, Any, (Cuint,), T_pprjlvalue) + +""" + push_gc_frame(gc_frame::GCFrame, size::UInt32) + +Registers a GC frame with the garbage collector. +""" +function push_gc_frame(gc_frame::GCFrame, size::UInt32) + @nocollect begin + master_record = get_gc_master_record() + + # Update the root buffer tip. + unsafe_store!( + master_record.root_buffer_fingers, + gc_frame + size * sizeof(ObjectRef), + get_thread_id()) + return + end +end + +compile( + push_gc_frame, + Nothing, + (GCFrame, Cuint), + () -> convert(LLVMType, Cvoid), + () -> [T_pprjlvalue(), convert(LLVMType, UInt32)]) + +""" + pop_gc_frame(gc_frame::GCFrame) + +Deregisters a GC frame. +""" +function pop_gc_frame(gc_frame::GCFrame) + @nocollect begin + master_record = get_gc_master_record() + + # Update the root buffer tip. 
+ unsafe_store!( + master_record.root_buffer_fingers, + gc_frame, + get_thread_id()) + return + end +end + +compile( + pop_gc_frame, + Nothing, + (GCFrame,), + () -> convert(LLVMType, Cvoid), + () -> [T_pprjlvalue()]) + end diff --git a/src/gc.jl b/src/gc.jl index 660614c3..36937b32 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -69,6 +69,9 @@ end # A reference to a Julia object. const ObjectRef = Ptr{Nothing} +# A GC frame is just a pointer to an array of Julia objects. +const GCFrame = Ptr{ObjectRef} + # A data structure that contains global GC info. This data # structure is designed to be immutable: it should not be changed # once the host has set it up. @@ -118,58 +121,13 @@ end return threadIdx().x end -const GCFrame = Ptr{ObjectRef} - -# Same as 'new_gc_frame_impl', but does not disable collections. +# Same as 'new_gc_frame', but does not disable collections. function new_gc_frame_impl(size::UInt32)::GCFrame master_record = get_gc_master_record() # Return the root buffer tip: that's where the new GC frame starts. return unsafe_load(master_record.root_buffer_fingers, get_thread_id()) end -""" - new_gc_frame(size::UInt32)::Ptr{ObjectRef} - -Allocates a new GC frame. -""" -function new_gc_frame(size::UInt32)::GCFrame - @nocollect new_gc_frame_impl(size) -end - -""" - push_gc_frame(gc_frame::GCFrame, size::UInt32) - -Registers a GC frame with the garbage collector. -""" -function push_gc_frame(gc_frame::GCFrame, size::UInt32) - @nocollect begin - master_record = get_gc_master_record() - - # Update the root buffer tip. - unsafe_store!( - master_record.root_buffer_fingers, - gc_frame + size * sizeof(ObjectRef), - get_thread_id()) - end -end - -""" - pop_gc_frame(gc_frame::GCFrame) - -Deregisters a GC frame. -""" -function pop_gc_frame(gc_frame::GCFrame) - @nocollect begin - master_record = get_gc_master_record() - - # Update the root buffer tip. 
- unsafe_store!( - master_record.root_buffer_fingers, - gc_frame, - get_thread_id()) - end -end - const gc_align = Csize_t(16) # Aligns a pointer to an alignment boundary. diff --git a/src/interrupts.jl b/src/interrupts.jl index d70c2773..be60697e 100644 --- a/src/interrupts.jl +++ b/src/interrupts.jl @@ -71,7 +71,7 @@ end # If the global hasn't been defined already, then we'll define # it in the global address space, i.e., address space one. global_var = GlobalVariable(mod, T_global, global_name_string, 1) - linkage!(global_var, LLVM.API.LLVMLinkOnceODRLinkage) + linkage!(global_var, LLVM.API.LLVMLinkOnceAnyLinkage) initializer!(global_var, LLVM.null(T_global)) end From 53db509d379308256b1d0d2ee1e24e441b2fd5d8 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 7 Mar 2019 11:29:41 +0100 Subject: [PATCH 036/146] Allow GC frame management functions to execute concurrently with the GC --- src/compiler/optim.jl | 2 +- src/device/runtime.jl | 38 +++++++++++++++++--------------------- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 4067f518..f798f96e 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -392,7 +392,7 @@ end function redirect_calls_to!(from::AbstractString, to, mod::LLVM.Module)::Bool changed = false visit_calls_to(from, mod) do call, _ - args = collect(operands(call))[1:end-1] + args = collect(operands(call))[1:end - 1] let builder = Builder(JuliaContext()) position!(builder, call) new_call = call!(builder, to, args) diff --git a/src/device/runtime.jl b/src/device/runtime.jl index a8ff03a6..91dab063 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -251,7 +251,7 @@ compile(gc_malloc_object, Any, (Csize_t,), T_prjlvalue) Allocates a new GC frame. 
""" function new_gc_frame(size::UInt32)::GCFrame - @nocollect new_gc_frame_impl(size) + new_gc_frame_impl(size) end compile(new_gc_frame, Any, (Cuint,), T_pprjlvalue) @@ -262,16 +262,14 @@ compile(new_gc_frame, Any, (Cuint,), T_pprjlvalue) Registers a GC frame with the garbage collector. """ function push_gc_frame(gc_frame::GCFrame, size::UInt32) - @nocollect begin - master_record = get_gc_master_record() - - # Update the root buffer tip. - unsafe_store!( - master_record.root_buffer_fingers, - gc_frame + size * sizeof(ObjectRef), - get_thread_id()) - return - end + master_record = get_gc_master_record() + + # Update the root buffer tip. + unsafe_store!( + master_record.root_buffer_fingers, + gc_frame + size * sizeof(ObjectRef), + get_thread_id()) + return end compile( @@ -287,16 +285,14 @@ compile( Deregisters a GC frame. """ function pop_gc_frame(gc_frame::GCFrame) - @nocollect begin - master_record = get_gc_master_record() - - # Update the root buffer tip. - unsafe_store!( - master_record.root_buffer_fingers, - gc_frame, - get_thread_id()) - return - end + master_record = get_gc_master_record() + + # Update the root buffer tip. 
+ unsafe_store!( + master_record.root_buffer_fingers, + gc_frame, + get_thread_id()) + return end compile( From 358ceaea49133265796c28fb6a644234fd480aa3 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 7 Mar 2019 11:46:42 +0100 Subject: [PATCH 037/146] Move GC frame management functions into 'gc.jl' --- src/device/runtime.jl | 51 +++++-------------------------------------- src/gc.jl | 42 ++++++++++++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 49 deletions(-) diff --git a/src/device/runtime.jl b/src/device/runtime.jl index 91dab063..e456c356 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -13,8 +13,7 @@ using ..CUDAnative using LLVM using LLVM.Interop -import ..CUDAnative: @nocollect, ObjectRef, GCFrame, get_gc_master_record, get_thread_id, new_gc_frame_impl - +import ..CUDAnative: GCFrame ## representation of a runtime method instance struct RuntimeMethodInstance @@ -245,58 +244,18 @@ end compile(gc_malloc_object, Any, (Csize_t,), T_prjlvalue) -""" - new_gc_frame(size::UInt32)::GCFrame - -Allocates a new GC frame. -""" -function new_gc_frame(size::UInt32)::GCFrame - new_gc_frame_impl(size) -end - -compile(new_gc_frame, Any, (Cuint,), T_pprjlvalue) - -""" - push_gc_frame(gc_frame::GCFrame, size::UInt32) - -Registers a GC frame with the garbage collector. -""" -function push_gc_frame(gc_frame::GCFrame, size::UInt32) - master_record = get_gc_master_record() - - # Update the root buffer tip. - unsafe_store!( - master_record.root_buffer_fingers, - gc_frame + size * sizeof(ObjectRef), - get_thread_id()) - return -end +# Include GC frame management functions into the runtime. +compile(CUDAnative.new_gc_frame, Any, (Cuint,), T_pprjlvalue) compile( - push_gc_frame, + CUDAnative.push_gc_frame, Nothing, (GCFrame, Cuint), () -> convert(LLVMType, Cvoid), () -> [T_pprjlvalue(), convert(LLVMType, UInt32)]) -""" - pop_gc_frame(gc_frame::GCFrame) - -Deregisters a GC frame. 
-""" -function pop_gc_frame(gc_frame::GCFrame) - master_record = get_gc_master_record() - - # Update the root buffer tip. - unsafe_store!( - master_record.root_buffer_fingers, - gc_frame, - get_thread_id()) - return -end - compile( - pop_gc_frame, + CUDAnative.pop_gc_frame, Nothing, (GCFrame,), () -> convert(LLVMType, Cvoid), diff --git a/src/gc.jl b/src/gc.jl index 36937b32..61b12f86 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -121,13 +121,49 @@ end return threadIdx().x end -# Same as 'new_gc_frame', but does not disable collections. -function new_gc_frame_impl(size::UInt32)::GCFrame +""" + new_gc_frame(size::UInt32)::GCFrame + +Allocates a new GC frame. +""" +function new_gc_frame(size::UInt32)::GCFrame master_record = get_gc_master_record() # Return the root buffer tip: that's where the new GC frame starts. return unsafe_load(master_record.root_buffer_fingers, get_thread_id()) end +""" + push_gc_frame(gc_frame::GCFrame, size::UInt32) + +Registers a GC frame with the garbage collector. +""" +function push_gc_frame(gc_frame::GCFrame, size::UInt32) + master_record = get_gc_master_record() + + # Update the root buffer tip. + unsafe_store!( + master_record.root_buffer_fingers, + gc_frame + size * sizeof(ObjectRef), + get_thread_id()) + return +end + +""" + pop_gc_frame(gc_frame::GCFrame) + +Deregisters a GC frame. +""" +function pop_gc_frame(gc_frame::GCFrame) + master_record = get_gc_master_record() + + # Update the root buffer tip. + unsafe_store!( + master_record.root_buffer_fingers, + gc_frame, + get_thread_id()) + return +end + const gc_align = Csize_t(16) # Aligns a pointer to an alignment boundary. @@ -261,7 +297,7 @@ function gc_malloc_local(arena::Ptr{GCArenaRecord}, bytesize::Csize_t)::Ptr{UInt # won't get collected by the GC before the caller has a chance to add it to its # own GC frame. 
if result_ptr != Base.unsafe_convert(Ptr{UInt8}, C_NULL) - gc_frame = new_gc_frame_impl(UInt32(1)) + gc_frame = new_gc_frame(UInt32(1)) unsafe_store!(gc_frame, Base.unsafe_convert(ObjectRef, result_ptr)) end return result_ptr From ecc601d709feb878c72df688fc09324d43b0ebad Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 7 Mar 2019 11:47:43 +0100 Subject: [PATCH 038/146] Mark GC frame management functions as '@inline' --- src/gc.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 61b12f86..7d7ed8e1 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -126,7 +126,7 @@ end Allocates a new GC frame. """ -function new_gc_frame(size::UInt32)::GCFrame +@inline function new_gc_frame(size::UInt32)::GCFrame master_record = get_gc_master_record() # Return the root buffer tip: that's where the new GC frame starts. return unsafe_load(master_record.root_buffer_fingers, get_thread_id()) @@ -137,7 +137,7 @@ end Registers a GC frame with the garbage collector. """ -function push_gc_frame(gc_frame::GCFrame, size::UInt32) +@inline function push_gc_frame(gc_frame::GCFrame, size::UInt32) master_record = get_gc_master_record() # Update the root buffer tip. @@ -153,7 +153,7 @@ end Deregisters a GC frame. """ -function pop_gc_frame(gc_frame::GCFrame) +@inline function pop_gc_frame(gc_frame::GCFrame) master_record = get_gc_master_record() # Update the root buffer tip. From dcec58d56f9037be94d46c73bd9cc3251ea09ede Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 7 Mar 2019 12:11:37 +0100 Subject: [PATCH 039/146] Update 'get_thread_id' to take blocks into account --- src/gc.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gc.jl b/src/gc.jl index 7d7ed8e1..2aab537f 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -118,7 +118,7 @@ end # Gets the thread ID of the current thread. 
@inline function get_thread_id() - return threadIdx().x + return (blockIdx().x - 1) * blockDim().x + threadIdx().x end """ From f198cf86ec768fa1100a597ccf38f189d7a8daaf Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 7 Mar 2019 13:01:08 +0100 Subject: [PATCH 040/146] Introduce GC heap management data structures --- src/gc.jl | 99 +++++++++++++++++++++++++++++++++++++++++------ src/interrupts.jl | 32 +++++++++------ 2 files changed, 109 insertions(+), 22 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 2aab537f..edd94cd0 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -355,10 +355,36 @@ const initial_gc_heap_size = 16 * (1 << 20) # 256 roots. That's 2 KiB of roots per thread. const default_root_buffer_capacity = 256 -# Initializes GC memory and produces a master record. -function gc_init(buffer::Array{UInt8, 1}, thread_count::Integer; root_buffer_capacity::Integer = default_root_buffer_capacity)::GCMasterRecord - gc_memory_start_ptr = pointer(buffer, 1) - gc_memory_end_ptr = pointer(buffer, length(buffer)) +# A description of a region of memory that has been allocated to the GC heap. +struct GCHeapRegion + # A buffer that contains the GC region's bytes. + buffer::Array{UInt8, 1} + # A pointer to the first element in the region. + start::Ptr{UInt8} + # The region's size in bytes. + size::Csize_t +end + +GCHeapRegion(buffer::Array{UInt8, 1}) = GCHeapRegion(buffer, pointer(buffer, 1), Csize_t(length(buffer))) + +# A description of all memory that has been allocated to the GC heap. +struct GCHeapDescription + # A list of the set of regions that comprise the GC heap. + regions::Array{GCHeapRegion, 1} +end + +GCHeapDescription() = GCHeapDescription([]) + +# Initializes a GC heap and produces a master record. 
+function gc_init!( + heap::GCHeapDescription, + thread_count::Integer; + root_buffer_capacity::Integer = default_root_buffer_capacity)::GCMasterRecord + + master_region = heap.regions[1] + + gc_memory_start_ptr = master_region.start + gc_memory_end_ptr = master_region.start + master_region.size # Set up root buffers. fingerbuf_bytesize = sizeof(Ptr{ObjectRef}) * thread_count @@ -390,9 +416,58 @@ function gc_init(buffer::Array{UInt8, 1}, thread_count::Integer; root_buffer_cap return GCMasterRecord(global_arena, root_buffer_capacity, fingerbuf_ptr, rootbuf_ptr) end +# Tells if a GC heap contains a particular pointer. +function contains(heap::GCHeapDescription, pointer::Ptr{T}) where T + for region in heap.regions + if pointer >= region.start && pointer < region.start + region.size + return true + end + end + return false +end + +# Expands the GC heap by allocating a region of memory and adding it to +# the list of allocated regions. `size` describes the amount of bytes to +# allocate. Returns the allocated region. +function expand!(heap::GCHeapDescription, size::Integer)::GCHeapRegion + buffer = alloc_shared_array((size,), UInt8(0)) + region = GCHeapRegion(buffer) + push!(heap.regions, region) + return region +end + +# Frees all memory allocated by a GC heap. +function free!(heap::GCHeapDescription) + for region in heap.regions + free_shared_array(region.buffer) + end +end + # Collects garbage. This function is designed to be called by # the host, not by the device. -function gc_collect_impl(master_record::GCMasterRecord) +function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) + + # The Julia CPU GC is precise and the information it uses for precise + # garbage collection is stored in memory that we should be able to access. + # However, the way the CPU GC stores field information is incredibly + # complicated and replicating that logic here would be a royal pain to + # implement and maintain. 
Ideally, the CPU GC would expose an interface that + # allows us to point to an object and ask the GC for all GC-tracked pointers + # it contains. Alas, no such luck: the CPU GC doesn't even have an internal + # function that does that. The CPU GC's logic for finding GC-tracked pointer + # fields is instead fused tightly with its 'mark' loop. + # + # To cope with this, we will simply implement a conservative GC: we precisely + # scan the roots for pointers into the GC heap. We then recursively mark blocks + # that are pointed to by such pointers as live and conservatively scan them for + # more pointers. + # + # A conservative GC is fairly simple: we maintain a worklist of pointers that + # are live and may need to be processed, as well as a set of pointers that are + # live and have already been processed. + live_pointers = Set{ObjectRef}() + live_worklist = [] + println("GC collections are not implemented yet.") end @@ -453,12 +528,14 @@ macro cuda_gc(ex...) quote GC.@preserve $(vars...) begin # Define a trivial buffer that contains the interrupt state. - local host_interrupt_array, device_interrupt_buffer = alloc_shared_array((1,), ready) + local host_interrupt_array = alloc_shared_array((1,), ready) + local device_interrupt_buffer = get_shared_device_buffer(host_interrupt_array) # Allocate a shared buffer for GC memory. local gc_memory_size = initial_gc_heap_size + sizeof(ObjectRef) * default_root_buffer_capacity * $(esc(thread_count)) - local host_gc_array, device_gc_buffer = alloc_shared_array((gc_memory_size,), UInt8(0)) - local master_record = gc_init(host_gc_array, $(esc(thread_count))) + local gc_heap = GCHeapDescription() + expand!(gc_heap, gc_memory_size) + local master_record = gc_init!(gc_heap, $(esc(thread_count))) # Define a kernel initialization function. local function kernel_init(kernel) @@ -488,7 +565,7 @@ macro cuda_gc(ex...) 
end local function handle_interrupt() - gc_collect_impl(master_record) + gc_collect_impl(master_record, gc_heap) end try @@ -502,8 +579,8 @@ macro cuda_gc(ex...) # Handle interrupts. handle_interrupts(handle_interrupt, pointer(host_interrupt_array, 1), $(esc(stream))) finally - free_shared_array(device_interrupt_buffer) - free_shared_array(device_gc_buffer) + free_shared_array(host_interrupt_array) + free!(gc_heap) end end end) diff --git a/src/interrupts.jl b/src/interrupts.jl index be60697e..7793b42d 100644 --- a/src/interrupts.jl +++ b/src/interrupts.jl @@ -11,9 +11,9 @@ export @cuda_interruptible, interrupt, interrupt_or_wait, wait_for_interrupt # Allocates an array of host memory that is page-locked and accessible # to the device. Maps the allocation into the CUDA address space. -# Returns a (host array, device buffer) pair. The former can be used by -# the host to access the array, the latter can be used by the device. -function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} +# Returns a host array that can be turned into a device array by calling +# the `get_shared_device_buffer` function. +function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T)::Array{T, N} where {T, N} # Allocate memory that is accessible to both the host and the device. bytesize = prod(dims) * sizeof(T) ptr_ref = Ref{Ptr{Cvoid}}() @@ -22,20 +22,29 @@ function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} (Ptr{Ptr{Cvoid}}, Csize_t), ptr_ref, bytesize) - device_buffer = CUDAdrv.Mem.Buffer(convert(CuPtr{T}, convert(Csize_t, ptr_ref[])), bytesize, CuCurrentContext()) - - # Wrap the memory in an array for the host. + # Wrap the memory in an array. host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(ptr_ref[]), dims; own = false) # Initialize the array's contents. fill!(host_array, init) - return host_array, device_buffer + return host_array +end + +# Gets the device array that corresponds to a shared host array. 
+# NOTE: this function only works for arrays that were allocated by +# `alloc_shared_array`. It has undefined behavior for all other arrays. +function get_shared_device_buffer(shared_array::Array{T, N})::Mem.Buffer where {T, N} + bytesize = length(shared_array) * sizeof(T) + CUDAdrv.Mem.Buffer( + convert(CuPtr{T}, convert(Csize_t, pointer(shared_array, 1))), + bytesize, + CuCurrentContext()) end # Frees an array of host memory. -function free_shared_array(buffer::Mem.Buffer) - ptr = convert(Ptr{Cvoid}, convert(Csize_t, buffer.ptr)) +function free_shared_array(shared_array::Array{T, N}) where {T, N} + ptr = pointer(shared_array, 1) @apicall( :cuMemFreeHost, (Ptr{Cvoid},), @@ -233,7 +242,8 @@ macro cuda_interruptible(handler, ex...) quote GC.@preserve $(vars...) begin # Define a trivial buffer that contains the interrupt state. - local host_array, device_buffer = alloc_shared_array((1,), ready) + local host_array = alloc_shared_array((1,), ready) + local device_buffer = get_shared_device_buffer(host_array) try # Define a kernel initialization function that sets the @@ -261,7 +271,7 @@ macro cuda_interruptible(handler, ex...) # Handle interrupts. handle_interrupts($(esc(handler)), pointer(host_array, 1), $(esc(stream))) finally - free_shared_array(device_buffer) + free_shared_array(host_array) end end end) From d039839360027d71c7ef5302be0d5498bc25bcb5 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 7 Mar 2019 16:46:30 +0100 Subject: [PATCH 041/146] Implement the mark & sweep phases of the GC --- src/gc.jl | 203 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 189 insertions(+), 14 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index edd94cd0..486a705a 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -26,6 +26,8 @@ export @cuda_gc, gc_malloc, gc_collect +import Base: length + # A data structure that precedes every chunk of memory that has been # allocated or put into the free list. 
struct GCAllocationRecord @@ -40,6 +42,11 @@ struct GCAllocationRecord next::Ptr{GCAllocationRecord} end +# Gets a pointer to the first byte of data managed by an allocation record. +function data_pointer(record::Ptr{GCAllocationRecord})::Ptr{UInt8} + Base.unsafe_convert(Ptr{UInt8}, record) + sizeof(GCAllocationRecord) +end + @generated function get_field_pointer_impl(base_pointer::Ptr{TBase}, ::Val{field_name}) where {TBase, field_name} index = Base.fieldindex(TBase, field_name) offset = Base.fieldoffset(TBase, index) @@ -83,6 +90,9 @@ struct GCMasterRecord # of roots per thread. root_buffer_capacity::UInt32 + # The number of threads. + thread_count::UInt32 + # A pointer to a list of root buffer pointers that point to the # end of the root buffer for every thread. root_buffer_fingers::Ptr{Ptr{ObjectRef}} @@ -92,6 +102,11 @@ struct GCMasterRecord root_buffers::Ptr{ObjectRef} end +# Iterates through all arena pointers stored in a GC master record. +@inline function iterate_arenas(fun::Function, master_record::GCMasterRecord) + fun(master_record.global_arena) +end + # Gets the global GC interrupt lock. @inline function get_interrupt_lock()::ReaderWriterLock return ReaderWriterLock(@cuda_global_ptr("gc_interrupt_lock", ReaderWriterLockState)) @@ -195,7 +210,7 @@ function gc_use_free_list_entry( # to create a new entry from any unused memory in the entry. # Compute the address to return. - data_address = Base.unsafe_convert(Ptr{UInt8}, entry) + sizeof(GCAllocationRecord) + data_address = data_pointer(entry) # Compute the end of the free memory chunk. end_address = data_address + entry_data.size @@ -207,11 +222,12 @@ function gc_use_free_list_entry( new_entry_address = new_data_address - sizeof(GCAllocationRecord) if new_entry_address < data_address + bytesize new_entry_address += gc_align + new_data_address += gc_align end # If we can place a new entry just past the allocation, then we should # by all means do so. 
- if new_entry_address + sizeof(GCAllocationRecord) < end_address + if new_data_address < end_address # Create a new free list entry. new_entry_size = Csize_t(end_address) - Csize_t(new_data_address) new_entry_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, new_entry_address) @@ -329,11 +345,39 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} end # Alright, so that was a spectacular failure. Let's just throw an exception. - @cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", bytesize) + @cuprintf("ERROR: Out of GPU GC memory (trying to allocate %i bytes)\n", bytesize) # throw(OutOfMemoryError()) return C_NULL end +# Tries to free a block of memory from a particular arena. `record_ptr` +# must point to a pointer to the GC allocation record to free. It will +# be updated to point to the next allocation. +# +# This function is designed to be called by the host: it does not +# turn off collections. It can be called by the device, but in that +# case it should be prefixed by the `@nocollect` macro followed by +# a write lock acquisition on the arena's lock. +function gc_free_local_impl( + arena::Ptr{GCArenaRecord}, + record_ptr::Ptr{Ptr{GCAllocationRecord}}) + + record = unsafe_load(record_ptr) + next_record_ptr = @get_field_pointer(record, :next) + free_list_head_ptr = @get_field_pointer(arena, :free_list_head) + + # Remove the record from the allocation list. + next_record = unsafe_load(next_record_ptr) + unsafe_store!(record_ptr, next_record) + + println("Freeing $(unsafe_load(record).size) bytes at $(data_pointer(record))") + + # Add the record to the free list and update its `next` pointer + # (but not in that order). + unsafe_store!(next_record_ptr, unsafe_load(free_list_head_ptr)) + unsafe_store!(free_list_head_ptr, record) +end + """ gc_collect() @@ -394,7 +438,7 @@ function gc_init!( # Populate the root buffer fingers. 
     for i in 1:thread_count
-        unsafe_store!(fingerbuf_ptr, rootbuf_ptr + i * sizeof(ObjectRef) * root_buffer_capacity, i)
+        unsafe_store!(fingerbuf_ptr, rootbuf_ptr + (i - 1) * sizeof(ObjectRef) * root_buffer_capacity, i)
     end
 
     # Compute a pointer to the start of the heap.
@@ -413,7 +457,7 @@ function gc_init!(
         global_arena,
         GCArenaRecord(0, first_entry_ptr, C_NULL))
 
-    return GCMasterRecord(global_arena, root_buffer_capacity, fingerbuf_ptr, rootbuf_ptr)
+    return GCMasterRecord(global_arena, root_buffer_capacity, UInt32(thread_count), fingerbuf_ptr, rootbuf_ptr)
 end
 
 # Tells if a GC heap contains a particular pointer.
@@ -443,10 +487,89 @@ function free!(heap::GCHeapDescription)
     end
 end
 
-# Collects garbage. This function is designed to be called by
-# the host, not by the device.
-function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription)
+# A sorted list of all allocation records for allocated blocks.
+# This data structure is primarily useful for rapidly mapping
+# pointers to the allocated blocks that contain them.
+struct SortedAllocationList
+    # An array of pointers to allocation records. The pointers
+    # are all sorted.
+    records::Array{Ptr{GCAllocationRecord}, 1}
+end
+
+length(alloc_list::SortedAllocationList) = length(alloc_list.records)
+
+# Gets a pointer to the allocation record that manages the memory
+# pointed to by `pointer`. Returns a null pointer if there is no
+# such record.
+function get_record(
+    alloc_list::SortedAllocationList,
+    pointer::Ptr{T})::Ptr{GCAllocationRecord} where T
+    cast_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, pointer)
+
+    # Deal with the most common cases quickly.
+ if length(alloc_list) == 0 || + pointer < data_pointer(alloc_list.records[1]) || + pointer > data_pointer(alloc_list.records[end]) + Base.unsafe_load(alloc_list.records[end]).size + + return C_NULL + end + + # To do this lookup quickly, we will do a binary search for the + # biggest allocation record pointer that is smaller than `pointer`. + range_start, range_end = 1, length(alloc_list) + while range_end - range_start > 1 + range_mid = div(range_start + range_end, 2) + mid_val = alloc_list.records[range_mid] + if mid_val > cast_ptr + range_end = range_mid + else + range_start = range_mid + end + end + + record = alloc_list.records[range_end] + if record >= cast_ptr + record = alloc_list.records[range_start] + end + + # Make sure that the pointer actually points to a region of memory + # that is managed by the candidate record we found. + record_data_pointer = data_pointer(record) + if cast_ptr >= record_data_pointer && cast_ptr < record_data_pointer + unsafe_load(record).size + return record + else + return C_NULL + end +end + +# Iterates through a linked list of allocation records and apply a function +# to every node in the linked list. The function is allowed to modify allocation +# records. +@inline function iterate_allocation_records(fun::Function, head::Ptr{GCAllocationRecord}) + while head != C_NULL + fun(head) + head = unsafe_load(head).next + end +end + +# Takes a GC master record and constructs a sorted allocation list +# based on it. +function sort_allocation_list(master_record::GCMasterRecord)::SortedAllocationList + records = [] + iterate_arenas(master_record) do arena + allocation_list_head = unsafe_load(arena).allocation_list_head + iterate_allocation_records(allocation_list_head) do record + push!(records, record) + end + end + sort!(records) + return SortedAllocationList(records) +end + +# Collects garbage. This function is designed to be called by the host, +# not by the device. 
+function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) # The Julia CPU GC is precise and the information it uses for precise # garbage collection is stored in memory that we should be able to access. # However, the way the CPU GC stores field information is incredibly @@ -457,18 +580,70 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) # function that does that. The CPU GC's logic for finding GC-tracked pointer # fields is instead fused tightly with its 'mark' loop. # - # To cope with this, we will simply implement a conservative GC: we precisely + # To cope with this, we will simply implement a semi-conservative GC: we precisely # scan the roots for pointers into the GC heap. We then recursively mark blocks # that are pointed to by such pointers as live and conservatively scan them for # more pointers. # - # A conservative GC is fairly simple: we maintain a worklist of pointers that - # are live and may need to be processed, as well as a set of pointers that are + # Our mark phase is fairly simple: we maintain a worklist of pointers that + # are live and may need to be processed, as well as a set of blocks that are # live and have already been processed. - live_pointers = Set{ObjectRef}() - live_worklist = [] + live_blocks = Set{Ptr{GCAllocationRecord}}() + live_worklist = Ptr{ObjectRef}[] + + # Get a sorted allocation list, which will allow us to classify live pointers quickly. + alloc_list = sort_allocation_list(master_record) - println("GC collections are not implemented yet.") + # Add all roots to the worklist. + for i in 1:(master_record.root_buffer_capacity * master_record.thread_count) + root = unsafe_load(master_record.root_buffers, i) + if root != C_NULL + push!(live_worklist, root) + end + end + + # Now process all live pointers until we reach a fixpoint. + while !isempty(live_worklist) + # Pop a pointer from the worklist. + object_ref = pop!(live_worklist) + # Get the block for that pointer. 
+ record = get_record(alloc_list, object_ref) + # Make sure that we haven't visited the block yet. + if record != C_NULL && !(record in live_blocks) + # Mark the block as live. + push!(live_blocks, record) + # Add all pointer-sized, aligned values to the live pointer worklist. + block_pointer = data_pointer(record) + block_size = unsafe_load(record).size + for i in 0:sizeof(ObjectRef):(block_size - 1) + push!(live_worklist, Base.unsafe_convert(ObjectRef, block_pointer + i)) + end + end + end + + # We're done with the mark phase! Time to proceed to the sweep phase. + # The first thing we'll do is iterate through every arena's allocation list and + # free dead blocks. + iterate_arenas(master_record) do arena + record_ptr = @get_field_pointer(arena, :allocation_list_head) + while true + record = unsafe_load(record_ptr) + if record == C_NULL + # We've reached the end of the list. + break + end + + if record in live_blocks + # We found a live block. Proceed to the next block. + record_ptr = @get_field_pointer(record, :next) + else + # We found a dead block. Release it. Don't proceed to the + # next block because the current block will change in the + # next iteration of this loop. + gc_free_local_impl(arena, record_ptr) + end + end + end end # Examines a keyword argument list and gets either the value From 7358f9cbee01e345a4729220f82fe02d67f22b85 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 7 Mar 2019 19:00:19 +0100 Subject: [PATCH 042/146] Implement a free list compaction and extra memory allocation scheme --- src/gc.jl | 127 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 114 insertions(+), 13 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 486a705a..62e484ba 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -42,11 +42,6 @@ struct GCAllocationRecord next::Ptr{GCAllocationRecord} end -# Gets a pointer to the first byte of data managed by an allocation record. 
-function data_pointer(record::Ptr{GCAllocationRecord})::Ptr{UInt8} - Base.unsafe_convert(Ptr{UInt8}, record) + sizeof(GCAllocationRecord) -end - @generated function get_field_pointer_impl(base_pointer::Ptr{TBase}, ::Val{field_name}) where {TBase, field_name} index = Base.fieldindex(TBase, field_name) offset = Base.fieldoffset(TBase, index) @@ -59,6 +54,16 @@ macro get_field_pointer(base_pointer, field_name) :(get_field_pointer_impl($(esc(base_pointer)), Val($field_name))) end +# Gets a pointer to the first byte of data managed by an allocation record. +function data_pointer(record::Ptr{GCAllocationRecord})::Ptr{UInt8} + Base.unsafe_convert(Ptr{UInt8}, record) + sizeof(GCAllocationRecord) +end + +# Gets a pointer to the first byte of data no longer managed by an allocation record. +function data_end_pointer(record::Ptr{GCAllocationRecord})::Ptr{UInt8} + data_pointer(record) + unsafe_load(@get_field_pointer(record, :size)) +end + # A data structure that describes a single GC "arena", i.e., # a section of the heap that is managed by the GC. Every arena # has its own free list and allocation list. @@ -350,6 +355,16 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} return C_NULL end +# Zero-fills a range of memory. +function zero_fill!(start_ptr::Ptr{UInt8}, size::Csize_t) + ccall(:memset, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), start_ptr, 0, size) +end + +# Zero-fills a range of memory. +function zero_fill!(start_ptr::Ptr{UInt8}, end_ptr::Ptr{UInt8}) + zero_fill!(start_ptr, Csize_t(end_ptr) - Csize_t(start_ptr)) +end + # Tries to free a block of memory from a particular arena. `record_ptr` # must point to a pointer to the GC allocation record to free. It will # be updated to point to the next allocation. 
@@ -370,12 +385,13 @@ function gc_free_local_impl( next_record = unsafe_load(next_record_ptr) unsafe_store!(record_ptr, next_record) - println("Freeing $(unsafe_load(record).size) bytes at $(data_pointer(record))") - # Add the record to the free list and update its `next` pointer # (but not in that order). unsafe_store!(next_record_ptr, unsafe_load(free_list_head_ptr)) unsafe_store!(free_list_head_ptr, record) + + # Zero-fill the newly freed block of memory. + zero_fill!(data_pointer(record), unsafe_load(@get_field_pointer(record, :size))) end """ @@ -399,6 +415,14 @@ const initial_gc_heap_size = 16 * (1 << 20) # 256 roots. That's 2 KiB of roots per thread. const default_root_buffer_capacity = 256 +# The point at which an arena is deemed to be starving, i.e., +# it no longer contains enough memory to perform basic allocations. +# If an arena's free byte count stays below the arena starvation +# size after a collection phase, the collector will allocate additional +# memory to the arena such that it is no longer starving. +# The arena starvation limit is currently set to 4 MiB. +const arena_starvation_limit = 4 * (1 << 20) + # A description of a region of memory that has been allocated to the GC heap. struct GCHeapRegion # A buffer that contains the GC region's bytes. @@ -443,13 +467,11 @@ function gc_init!( # Compute a pointer to the start of the heap. heap_start_ptr = rootbuf_ptr + rootbuf_bytesize - global_arena_size = Csize_t(gc_memory_end_ptr) - Csize_t(heap_start_ptr) - sizeof(GCAllocationRecord) - sizeof(GCArenaRecord) # Create a single free list entry. - first_entry_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, heap_start_ptr + sizeof(GCArenaRecord)) - unsafe_store!( - first_entry_ptr, - GCAllocationRecord(global_arena_size, C_NULL)) + first_entry_ptr = make_gc_block!( + heap_start_ptr + sizeof(GCArenaRecord), + Csize_t(gc_memory_end_ptr) - Csize_t(heap_start_ptr) - sizeof(GCArenaRecord)) # Set up the main GC data structure. 
     global_arena = Base.unsafe_convert(Ptr{GCArenaRecord}, heap_start_ptr)
@@ -460,6 +482,18 @@ function gc_init!(
     return GCMasterRecord(global_arena, root_buffer_capacity, UInt32(thread_count), fingerbuf_ptr, rootbuf_ptr)
 end
 
+# Takes a zero-filled region of memory and turns it into a block
+# managed by the GC, prefixed with an allocation record.
+function make_gc_block!(start_ptr::Ptr{T}, size::Csize_t)::Ptr{GCAllocationRecord} where T
+    entry = Base.unsafe_convert(Ptr{GCAllocationRecord}, start_ptr)
+    unsafe_store!(
+        entry,
+        GCAllocationRecord(
+            Csize_t(start_ptr + size) - Csize_t(data_pointer(entry)),
+            C_NULL))
+    return entry
+end
+
 # Tells if a GC heap contains a particular pointer.
 function contains(heap::GCHeapDescription, pointer::Ptr{T}) where T
     for region in heap.regions
@@ -567,6 +601,56 @@ function sort_allocation_list(master_record::GCMasterRecord)::SortedAllocationLi
     return SortedAllocationList(records)
 end
 
+# Compact a GC arena's free list. This function will
+# 1. merge adjacent free blocks, and
+# 2. reorder free blocks to put small blocks at the front
+#    of the free list,
+# 3. tally the total number of free bytes and return that number.
+function gc_compact_free_list(arena::Ptr{GCArenaRecord})::Csize_t
+    # Let's start by creating a list of all free list records.
+    records = Ptr{GCAllocationRecord}[]
+    free_list_head = unsafe_load(arena).free_list_head
+    iterate_allocation_records(free_list_head) do record
+        push!(records, record)
+    end
+
+    # We now sort those records and loop through the sorted list,
+    # merging free list entries as we go along.
+    sort!(records)
+
+    i = 1
+    while i < length(records)
+        first_record = records[i]
+        second_record = records[i + 1]
+        if data_end_pointer(first_record) == Base.unsafe_convert(Ptr{UInt8}, second_record)
+            # We found two adjacent free list entries. Expand the first
+            # record's size to encompass both entries, zero-fill the second
+            # record's header and delete it from the list of records.
+ new_size = Csize_t(data_end_pointer(second_record)) - Csize_t(data_pointer(first_record)) + zero_fill!(data_end_pointer(first_record), data_pointer(second_record)) + unsafe_store!(@get_field_pointer(first_record, :size), new_size) + deleteat!(records, i + 1) + else + i += 1 + end + end + + # Now sort the records based on size. Put the smallest records first to + # discourage fragmentation. + sort!(records; lt = (x, y) -> unsafe_load(x).size < unsafe_load(y).size) + + # Reconstruct the free list as a linked list. + prev_record_ptr = @get_field_pointer(arena, :free_list_head) + for record in records + unsafe_store!(prev_record_ptr, record) + prev_record_ptr = @get_field_pointer(record, :next) + end + unsafe_store!(prev_record_ptr, C_NULL) + + # Compute the total number of free bytes. + return sum(record -> unsafe_load(record).size, records) +end + # Collects garbage. This function is designed to be called by the host, # not by the device. function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) @@ -623,7 +707,8 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) # We're done with the mark phase! Time to proceed to the sweep phase. # The first thing we'll do is iterate through every arena's allocation list and - # free dead blocks. + # free dead blocks. Next, we will compact and reorder free lists to combat + # fragmentation. iterate_arenas(master_record) do arena record_ptr = @get_field_pointer(arena, :allocation_list_head) while true @@ -643,6 +728,22 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) gc_free_local_impl(arena, record_ptr) end end + + # Compact the free list. + free_memory = gc_compact_free_list(arena) + + # If the amount of free memory in the arena is below the starvation + # limit then we'll expand the GC heap and add the additional memory + # to the arena's free list. 
+ if free_memory < arena_starvation_limit + region = expand!(heap, arena_starvation_limit) + extra_record = make_gc_block!(region.start, region.size) + last_free_list_ptr = @get_field_pointer(arena, :free_list_head) + iterate_allocation_records(unsafe_load(last_free_list_ptr)) do record + last_free_list_ptr = @get_field_pointer(record, :next) + end + unsafe_store!(last_free_list_ptr, extra_record) + end end end From 457006a4c79a9c546f7c25e95d4533d30256f04c Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 8 Mar 2019 11:40:05 +0100 Subject: [PATCH 043/146] Update GC docs --- src/gc.jl | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 62e484ba..73dbfa0c 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -1,9 +1,6 @@ # This file contains a GC implementation for CUDAnative kernels. -# -# CURRENT STATE OF THE GC -# -# Simple memory allocation is underway. Memory allocation currently -# uses a simple free-list. +# The sections below contain some basic info on how the garbage +# collector works. # # MEMORY ALLOCATION # @@ -12,12 +9,18 @@ # the allocator also maintains a list of all allocated blocks, so # the collector knows which blocks it can free. # -# END GOAL +# GARBAGE COLLECTION +# +# The garbage collector itself is a semi-conservative, non-moving, +# mark-and-sweep GC that runs on the host. The device may trigger +# the GC via an interrupt. +# +# The GC is semi-conservative in the sense that its set of roots +# is precise but objects are scanned in an imprecise way. # -# The CUDAnative GC is a precise, non-moving, mark-and-sweep GC that runs -# on the host. The device may trigger the GC via an interrupt. +# MISCELLANEOUS # -# Some GPU-related GC implementation details: +# Some miscellaneous GPU-related GC implementation details: # # * GC memory is shared by the host and device. # * Every thread gets a fixed region of memory for storing GC roots in. 
From 0666d093de7cf4f0a2d296ff972dc138afd17720 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 8 Mar 2019 11:50:42 +0100 Subject: [PATCH 044/146] Modify GC lock acquisition scheme slightly --- src/gc.jl | 55 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 73dbfa0c..bfc54ba8 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -306,26 +306,24 @@ end # Returns a null pointer if no sufficiently large chunk of # memory can be found. function gc_malloc_local(arena::Ptr{GCArenaRecord}, bytesize::Csize_t)::Ptr{UInt8} - # Disable collections and acquire the arena's lock. - @nocollect begin - arena_lock = ReaderWriterLock(@get_field_pointer(arena, :lock_state)) - result_ptr = writer_locked(arena_lock) do - # Allocate a suitable region of memory. - free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{GCAllocationRecord}} - allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{GCAllocationRecord}} - gc_malloc_from_free_list(free_list_ptr, allocation_list_ptr, bytesize) - end + # Acquire the arena's lock. + arena_lock = ReaderWriterLock(@get_field_pointer(arena, :lock_state)) + result_ptr = writer_locked(arena_lock) do + # Allocate a suitable region of memory. + free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{GCAllocationRecord}} + allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{GCAllocationRecord}} + gc_malloc_from_free_list(free_list_ptr, allocation_list_ptr, bytesize) + end - # If the resulting pointer is non-null, then we'll write it to a temporary GC frame. - # Our reasoning for doing this is that doing so ensures that the allocated memory - # won't get collected by the GC before the caller has a chance to add it to its - # own GC frame. 
- if result_ptr != Base.unsafe_convert(Ptr{UInt8}, C_NULL) - gc_frame = new_gc_frame(UInt32(1)) - unsafe_store!(gc_frame, Base.unsafe_convert(ObjectRef, result_ptr)) - end - return result_ptr + # If the resulting pointer is non-null, then we'll write it to a temporary GC frame. + # Our reasoning for doing this is that doing so ensures that the allocated memory + # won't get collected by the GC before the caller has a chance to add it to its + # own GC frame. + if result_ptr != Base.unsafe_convert(Ptr{UInt8}, C_NULL) + gc_frame = new_gc_frame(UInt32(1)) + unsafe_store!(gc_frame, Base.unsafe_convert(ObjectRef, result_ptr)) end + return result_ptr end """ @@ -338,16 +336,18 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} master_record = get_gc_master_record() # Try to malloc the object without host intervention. - ptr = gc_malloc_local(master_record.global_arena, bytesize) + ptr = @nocollect gc_malloc_local(master_record.global_arena, bytesize) if ptr != C_NULL return ptr end # We're out of memory. Ask the host to step in. - gc_collect() + ptr = writer_locked(get_interrupt_lock()) do + gc_collect_impl() - # Try to malloc again. - ptr = gc_malloc_local(master_record.global_arena, bytesize) + # Try to malloc again. + gc_malloc_local(master_record.global_arena, bytesize) + end if ptr != C_NULL return ptr end @@ -397,6 +397,12 @@ function gc_free_local_impl( zero_fill!(data_pointer(record), unsafe_load(@get_field_pointer(record, :size))) end +# Like 'gc_collect', but does not acquire the interrupt lock. +function gc_collect_impl() + interrupt_or_wait() + threadfence_system() +end + """ gc_collect() @@ -404,10 +410,7 @@ Triggers a garbage collection phase. This function is designed to be called by the device rather than by the host. """ function gc_collect() - writer_locked(get_interrupt_lock()) do - interrupt_or_wait() - threadfence_system() - end + writer_locked(gc_collect_impl, get_interrupt_lock()) end # The initial size of the GC heap, currently 16 MiB. 
From 0f1ccc6a0fa777e0a28228f9252881dfed51e701 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 8 Mar 2019 11:56:55 +0100 Subject: [PATCH 045/146] Avoid overly frequent garbage collections --- src/gc.jl | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index bfc54ba8..29726824 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -341,12 +341,25 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} return ptr end - # We're out of memory. Ask the host to step in. + # We're out of memory, which means that we need the garbage collector + # to step in. Acquire the interrupt lock. ptr = writer_locked(get_interrupt_lock()) do - gc_collect_impl() - - # Try to malloc again. - gc_malloc_local(master_record.global_arena, bytesize) + # Try to allocate memory again. This is bound to fail for the + # first thread that acquires the interrupt lock, but it is quite + # likely to succeed if we are *not* in the first thread that + # acquired the garbage collector lock. + ptr2 = gc_malloc_local(master_record.global_arena, bytesize) + + if ptr2 == C_NULL + # We are either the first thread to acquire the interrupt lock + # or the additional memory produced by a previous collection has + # already been exhausted. Trigger the garbage collector. + gc_collect_impl() + + # Try to malloc again. + ptr2 = gc_malloc_local(master_record.global_arena, bytesize) + end + ptr2 end if ptr != C_NULL return ptr From e48677eca8d5181af904e6ff5719fb38a6bba024 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 8 Mar 2019 12:01:44 +0100 Subject: [PATCH 046/146] Document free list compaction --- src/gc.jl | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 29726824..e861d714 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -18,6 +18,15 @@ # The GC is semi-conservative in the sense that its set of roots # is precise but objects are scanned in an imprecise way. 
# +# After every garbage collection, the GC will compact free lists: +# adjacent free list block will be merged and the free list will +# be sorted based on block sizes to combat memory fragmentation. +# +# If a free list is deemed to be "starving" after a collection, i.e., +# its total amount of free bytes has dropped below some threshold, +# then a fresh chunk of GC-managed memory is allocated and added to +# the free list. +# # MISCELLANEOUS # # Some miscellaneous GPU-related GC implementation details: @@ -437,10 +446,10 @@ const default_root_buffer_capacity = 256 # The point at which an arena is deemed to be starving, i.e., # it no longer contains enough memory to perform basic allocations. # If an arena's free byte count stays below the arena starvation -# size after a collection phase, the collector will allocate additional -# memory to the arena such that it is no longer starving. -# The arena starvation limit is currently set to 4 MiB. -const arena_starvation_limit = 4 * (1 << 20) +# threshold after a collection phase, the collector will allocate +# additional memory to the arena such that it is no longer starving. +# The arena starvation threshold is currently set to 4 MiB. +const arena_starvation_threshold = 4 * (1 << 20) # A description of a region of memory that has been allocated to the GC heap. struct GCHeapRegion @@ -754,8 +763,8 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) # If the amount of free memory in the arena is below the starvation # limit then we'll expand the GC heap and add the additional memory # to the arena's free list. 
- if free_memory < arena_starvation_limit - region = expand!(heap, arena_starvation_limit) + if free_memory < arena_starvation_threshold + region = expand!(heap, arena_starvation_threshold) extra_record = make_gc_block!(region.start, region.size) last_free_list_ptr = @get_field_pointer(arena, :free_list_head) iterate_allocation_records(unsafe_load(last_free_list_ptr)) do record From 3ec9f48f22b21e9ba2a0062b4cc596dc41f50afd Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 8 Mar 2019 12:22:45 +0100 Subject: [PATCH 047/146] Reserve a buffer for safepoints --- src/gc.jl | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index e861d714..c6bc3d0c 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -12,8 +12,8 @@ # GARBAGE COLLECTION # # The garbage collector itself is a semi-conservative, non-moving, -# mark-and-sweep GC that runs on the host. The device may trigger -# the GC via an interrupt. +# mark-and-sweep, stop-the-world GC that runs on the host. +# The device may trigger the GC via an interrupt. # # The GC is semi-conservative in the sense that its set of roots # is precise but objects are scanned in an imprecise way. @@ -27,6 +27,13 @@ # then a fresh chunk of GC-managed memory is allocated and added to # the free list. # +# SAFEPOINTS +# +# Every warp gets a flag that tells if that warp is in a safepoint. +# When a collection is triggered, the collector waits for every warp +# to reach a safepoint. The warps indicate that they have reached a +# safepoint by setting the flag. +# # MISCELLANEOUS # # Some miscellaneous GPU-related GC implementation details: @@ -103,12 +110,19 @@ struct GCMasterRecord # A pointer to the global GC arena. global_arena::Ptr{GCArenaRecord} + # The number of warps. + warp_count::UInt32 + + # The number of threads. + thread_count::UInt32 + # The maximum size of a GC root buffer, i.e., the maximum number # of roots per thread. 
root_buffer_capacity::UInt32 - # The number of threads. - thread_count::UInt32 + # A pointer to a list of safepoint flags. Every warp has its + # own flag. + safepoint_flags::Ptr{UInt8} # A pointer to a list of root buffer pointers that point to the # end of the root buffer for every thread. @@ -475,16 +489,25 @@ GCHeapDescription() = GCHeapDescription([]) function gc_init!( heap::GCHeapDescription, thread_count::Integer; + warp_count::Union{Integer, Nothing} = nothing, root_buffer_capacity::Integer = default_root_buffer_capacity)::GCMasterRecord + if warp_count == nothing + warp_count = thread_count / CUDAdrv.warpsize(device()) + end + master_region = heap.regions[1] gc_memory_start_ptr = master_region.start gc_memory_end_ptr = master_region.start + master_region.size + # Set up the safepoint flag buffer. + safepoint_bytesize = sizeof(UInt8) * warp_count + safepoint_ptr = Base.unsafe_convert(Ptr{UInt8}, gc_memory_start_ptr) + # Set up root buffers. fingerbuf_bytesize = sizeof(Ptr{ObjectRef}) * thread_count - fingerbuf_ptr = Base.unsafe_convert(Ptr{Ptr{ObjectRef}}, gc_memory_start_ptr) + fingerbuf_ptr = Base.unsafe_convert(Ptr{Ptr{ObjectRef}}, safepoint_ptr + fingerbuf_bytesize) rootbuf_bytesize = sizeof(ObjectRef) * root_buffer_capacity * thread_count rootbuf_ptr = Base.unsafe_convert(Ptr{ObjectRef}, fingerbuf_ptr + fingerbuf_bytesize) @@ -507,7 +530,14 @@ function gc_init!( global_arena, GCArenaRecord(0, first_entry_ptr, C_NULL)) - return GCMasterRecord(global_arena, root_buffer_capacity, UInt32(thread_count), fingerbuf_ptr, rootbuf_ptr) + return GCMasterRecord( + global_arena, + UInt32(warp_count), + UInt32(thread_count), + root_buffer_capacity, + safepoint_ptr, + fingerbuf_ptr, + rootbuf_ptr) end # Takes a zero-filled region of memory and turns it into a block From ef90bb438ac5c1decb76e5fcd23fddc612a7cb7a Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 8 Mar 2019 12:34:01 +0100 Subject: [PATCH 048/146] Implement a safepoint function --- src/gc.jl | 28 
++++++++++++++++++++++++---- src/interrupts.jl | 17 +++++++++++++++-- 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index c6bc3d0c..bd991a3b 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -43,7 +43,7 @@ # * When the device runs out of GC memory, it requests an interrupt # to mark and sweep. -export @cuda_gc, gc_malloc, gc_collect +export @cuda_gc, gc_malloc, gc_collect, gc_safepoint import Base: length @@ -103,6 +103,9 @@ const ObjectRef = Ptr{Nothing} # A GC frame is just a pointer to an array of Julia objects. const GCFrame = Ptr{ObjectRef} +# The type of a safepoint flag. +const SafepointFlag = UInt32 + # A data structure that contains global GC info. This data # structure is designed to be immutable: it should not be changed # once the host has set it up. @@ -122,7 +125,7 @@ struct GCMasterRecord # A pointer to a list of safepoint flags. Every warp has its # own flag. - safepoint_flags::Ptr{UInt8} + safepoint_flags::Ptr{SafepointFlag} # A pointer to a list of root buffer pointers that point to the # end of the root buffer for every thread. @@ -210,6 +213,23 @@ Deregisters a GC frame. return end +""" + gc_safepoint() + +Signals that this warp has reached a GC safepoint. +""" +@inline function gc_safepoint() + master_record = get_gc_master_record() + warp_id = div(get_thread_id() - 1, master_record.warp_count) + 1 + safepoint_flag_ptr = master_record.safepoint_flags + sizeof(SafepointFlag) * warp_id + + wait_for_interrupt() do + volatile_store!(safepoint_flag_ptr, SafepointFlag(1)) + end + + return +end + const gc_align = Csize_t(16) # Aligns a pointer to an alignment boundary. @@ -502,8 +522,8 @@ function gc_init!( gc_memory_end_ptr = master_region.start + master_region.size # Set up the safepoint flag buffer. 
-    safepoint_bytesize = sizeof(UInt8) * warp_count
-    safepoint_ptr = Base.unsafe_convert(Ptr{UInt8}, gc_memory_start_ptr)
+    safepoint_bytesize = sizeof(SafepointFlag) * warp_count
+    safepoint_ptr = Base.unsafe_convert(Ptr{SafepointFlag}, gc_memory_start_ptr)
 
     # Set up root buffers.
     fingerbuf_bytesize = sizeof(Ptr{ObjectRef}) * thread_count
diff --git a/src/interrupts.jl b/src/interrupts.jl
index 7793b42d..83fe13d5 100644
--- a/src/interrupts.jl
+++ b/src/interrupts.jl
@@ -141,6 +141,20 @@ function interrupt_or_wait()::Bool
     return prev_state == ready
 end
 
+"""
+    wait_for_interrupt(fun::Function)
+
+Waits for the current interrupt to finish, if an interrupt is
+currently running. A function is repeatedly executed until the
+interrupt finishes.
+"""
+function wait_for_interrupt(fun::Function)
+    state_ptr = get_interrupt_pointer()
+    while volatile_load(state_ptr) == processing
+        fun()
+    end
+end
+
 """
     wait_for_interrupt()
 
@@ -148,8 +162,7 @@ Waits for the current interrupt to finish, if an interrupt is
 currently running.
 """
 function wait_for_interrupt()
-    state_ptr = get_interrupt_pointer()
-    while volatile_load(state_ptr) == processing
+    wait_for_interrupt() do
     end
 end
 

From a76a568aebb021d259678352a769e5936cdc9abe Mon Sep 17 00:00:00 2001
From: jonathanvdc
Date: Fri, 8 Mar 2019 12:45:16 +0100
Subject: [PATCH 049/146] Put safepoint flag values in an enum

---
 src/gc.jl | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/gc.jl b/src/gc.jl
index bd991a3b..a523e371 100644
--- a/src/gc.jl
+++ b/src/gc.jl
@@ -104,7 +104,18 @@ const ObjectRef = Ptr{Nothing}
 const GCFrame = Ptr{ObjectRef}
 
 # The type of a safepoint flag.
-const SafepointFlag = UInt32
+@enum SafepointFlag::UInt32 begin
+    # Indicates that a warp is not in a safepoint.
+    not_in_safepoint = 0
+    # Indicates that a warp is in a safepoint. This
+    # flag will be reset to `not_in_safepoint` by the
+    # collector on the next collection.
+ in_safepoint = 1 + # Indicates that a warp is in a perma-safepoint: + # the collector will not try to set this type + # of safepoint back to `not_in_safepoint`. + in_perma_safepoint = 2 +end # A data structure that contains global GC info. This data # structure is designed to be immutable: it should not be changed @@ -224,7 +235,7 @@ Signals that this warp has reached a GC safepoint. safepoint_flag_ptr = master_record.safepoint_flags + sizeof(SafepointFlag) * warp_id wait_for_interrupt() do - volatile_store!(safepoint_flag_ptr, SafepointFlag(1)) + volatile_store!(safepoint_flag_ptr, in_safepoint) end return From 23e128c2e30275ac71a6d871f025f71d362d9f13 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 8 Mar 2019 14:08:12 +0100 Subject: [PATCH 050/146] Implement stop-the-world part of the GC --- src/gc.jl | 68 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 54 insertions(+), 14 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index a523e371..a3e7d4b6 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -103,8 +103,8 @@ const ObjectRef = Ptr{Nothing} # A GC frame is just a pointer to an array of Julia objects. const GCFrame = Ptr{ObjectRef} -# The type of a safepoint flag. -@enum SafepointFlag::UInt32 begin +# The states a safepoint flag can have. +@enum SafepointState::UInt32 begin # Indicates that a warp is not in a safepoint. not_in_safepoint = 0 # Indicates that a warp is in a safepoint. This @@ -136,7 +136,7 @@ struct GCMasterRecord # A pointer to a list of safepoint flags. Every warp has its # own flag. - safepoint_flags::Ptr{SafepointFlag} + safepoint_flags::Ptr{SafepointState} # A pointer to a list of root buffer pointers that point to the # end of the root buffer for every thread. @@ -181,6 +181,11 @@ end return (blockIdx().x - 1) * blockDim().x + threadIdx().x end +# Gets the warp ID of the current thread. 
+@inline function get_warp_id() + return div(get_thread_id() - 1, warpsize()) + 1 +end + """ new_gc_frame(size::UInt32)::GCFrame @@ -229,18 +234,34 @@ end Signals that this warp has reached a GC safepoint. """ -@inline function gc_safepoint() - master_record = get_gc_master_record() - warp_id = div(get_thread_id() - 1, master_record.warp_count) + 1 - safepoint_flag_ptr = master_record.safepoint_flags + sizeof(SafepointFlag) * warp_id - +function gc_safepoint() wait_for_interrupt() do - volatile_store!(safepoint_flag_ptr, in_safepoint) + gc_set_safepoint_flag(in_safepoint) end + return +end +# Sets this warp's safepoint flag to a particular state. +function gc_set_safepoint_flag(value::SafepointState) + master_record = get_gc_master_record() + warp_id = get_warp_id() + safepoint_flag_ptr = master_record.safepoint_flags + sizeof(SafepointState) * (warp_id - 1) + volatile_store!(safepoint_flag_ptr, value) return end +# Marks a region as a perma-safepoint: the entire region +# is a safepoint. Note that perma-safepoints are not allowed +# to include non-perma-safepoints. +macro perma_safepoint(expr) + quote + gc_set_safepoint_flag(in_perma_safepoint) + local result = $(esc(expr)) + gc_set_safepoint_flag(not_in_safepoint) + result + end +end + const gc_align = Csize_t(16) # Aligns a pointer to an alignment boundary. @@ -390,14 +411,14 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} master_record = get_gc_master_record() # Try to malloc the object without host intervention. - ptr = @nocollect gc_malloc_local(master_record.global_arena, bytesize) + ptr = @perma_safepoint @nocollect gc_malloc_local(master_record.global_arena, bytesize) if ptr != C_NULL return ptr end # We're out of memory, which means that we need the garbage collector - # to step in. Acquire the interrupt lock. - ptr = writer_locked(get_interrupt_lock()) do + # to step in. Set a perma-safepoint and acquire the interrupt lock. 
+ ptr = @perma_safepoint writer_locked(get_interrupt_lock()) do # Try to allocate memory again. This is bound to fail for the # first thread that acquires the interrupt lock, but it is quite # likely to succeed if we are *not* in the first thread that @@ -533,8 +554,8 @@ function gc_init!( gc_memory_end_ptr = master_region.start + master_region.size # Set up the safepoint flag buffer. - safepoint_bytesize = sizeof(SafepointFlag) * warp_count - safepoint_ptr = Base.unsafe_convert(Ptr{SafepointFlag}, gc_memory_start_ptr) + safepoint_bytesize = sizeof(SafepointState) * warp_count + safepoint_ptr = Base.unsafe_convert(Ptr{SafepointState}, gc_memory_start_ptr) # Set up root buffers. fingerbuf_bytesize = sizeof(Ptr{ObjectRef}) * thread_count @@ -743,6 +764,25 @@ end # Collects garbage. This function is designed to be called by the host, # not by the device. function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) + # First off, we have to wait for all warps to reach a safepoint. Clear + # safepoint flags and wait for warps to set them again. + for i in 0:(master_record.warp_count - 1) + atomic_compare_exchange!( + master_record.safepoint_flags + i * sizeof(SafepointState), + in_safepoint, + not_in_safepoint) + end + safepoint_count = 0 + while safepoint_count != master_record.warp_count + safepoint_count = 0 + for i in 0:(master_record.warp_count - 1) + state = volatile_load(master_record.safepoint_flags + i * sizeof(SafepointState)) + if state != not_in_safepoint + safepoint_count += 1 + end + end + end + # The Julia CPU GC is precise and the information it uses for precise # garbage collection is stored in memory that we should be able to access. 
# However, the way the CPU GC stores field information is incredibly From 0875425b9113e590cbfc40b28ab06e12feabf630 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 8 Mar 2019 16:00:34 +0100 Subject: [PATCH 051/146] Automatically insert safepoints --- src/compiler/optim.jl | 61 +++++++++++++++++++++++++++++++++++++++++++ src/device/runtime.jl | 3 +++ 2 files changed, 64 insertions(+) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index f798f96e..f29a2a0c 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -71,6 +71,7 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; int initialize!(pm) # lower intrinsics if ctx.gc + add!(pm, FunctionPass("InsertSafepointsGPUGC", insert_safepoints_gpugc!)) add!(pm, ModulePass("FinalLowerGPUGC", lower_final_gc_intrinsics_gpugc!)) else add!(pm, ModulePass("FinalLowerNoGC", lower_final_gc_intrinsics_nogc!)) @@ -586,6 +587,66 @@ function lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) return changed end +# Tells if a function manages a GC frame. +function has_gc_frame(fun::LLVM.Function) + for insn in instructions(entry(fun)) + if isa(insn, LLVM.CallInst) + callee = called_value(insn) + if isa(callee, LLVM.Function) && LLVM.name(callee) == "julia.new_gc_frame" + return true + end + end + end + return false +end + +# Tells if an instruction is a call to a non-intrinsic callee. +function is_non_intrinsic_call(instruction::LLVM.Instruction) + if isa(instruction, LLVM.CallInst) + callee = called_value(instruction) + if isa(callee, LLVM.Function) + callee_name = LLVM.name(callee) + return !startswith(callee_name, "julia.") && !startswith(callee_name, "llvm.") + else + return true + end + else + return false + end +end + +""" + insert_safepoints_gpugc!(fun::LLVM.Function) + +An LLVM pass that inserts GC safepoints in such a way that threads +reach a safepoint after a reasonable amount of time. 
+""" +function insert_safepoints_gpugc!(fun::LLVM.Function) + # Insert a safepoint before every function call, but only for + # functions that manage a GC frame. + # + # TODO: also insert safepoints on loop back-edges? This is what people + # usually do, but it requires nontrivial IR analyses that the LLVM C + # API doesn't expose. + + if has_gc_frame(fun) + let builder = Builder(JuliaContext()) + for block in blocks(fun) + for instruction in instructions(block) + if is_non_intrinsic_call(instruction) + # Insert a safepoint just before the call. + position!(builder, instruction) + debuglocation!(builder, instruction) + call!(builder, Runtime.get(:gc_safepoint), LLVM.Value[]) + end + end + end + dispose(builder) + end + end + return true +end + # lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible. # # this assumes and checks that the TLS is unused, which should be the case for most GPU code diff --git a/src/device/runtime.jl b/src/device/runtime.jl index e456c356..1c492369 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -261,4 +261,7 @@ compile( () -> convert(LLVMType, Cvoid), () -> [T_pprjlvalue()]) +# Also import the safepoint function. 
+compile(CUDAnative.gc_safepoint, Cvoid, ()) + end From 3ad1ee8d07409c9b0cea92e9c40550542b7d0e14 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 8 Mar 2019 16:03:41 +0100 Subject: [PATCH 052/146] Update GC example --- examples/{gc-malloc.jl => gc.jl} | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) rename examples/{gc-malloc.jl => gc.jl} (67%) diff --git a/examples/gc-malloc.jl b/examples/gc.jl similarity index 67% rename from examples/gc-malloc.jl rename to examples/gc.jl index 597ed2ae..211a2fb4 100644 --- a/examples/gc-malloc.jl +++ b/examples/gc.jl @@ -1,18 +1,30 @@ -using CUDAdrv, CUDAnative +using CUDAdrv, CUDAnative, LLVM +using InteractiveUtils using Test +mutable struct TempStruct + data::Float32 +end + +@noinline function escape(val) + Base.pointer_from_objref(val) +end + # Define a kernel that copies values using a temporary buffer. function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32}) i = (blockIdx().x-1) * blockDim().x + threadIdx().x - buffer = Base.unsafe_convert(Ptr{Float32}, gc_malloc(sizeof(Float32) * Csize_t(16))) - unsafe_store!(buffer, unsafe_load(a, i), i % 13) - unsafe_store!(b, unsafe_load(buffer, i % 13), i) + for j in 1:256 + # Allocate a mutable struct and make sure it ends up on the GC heap. + temp = TempStruct(unsafe_load(a, i)) + escape(temp) + unsafe_store!(b, temp.data, i) + end return end -thread_count = 64 +thread_count = 256 # Allocate two arrays. 
source_array = Mem.alloc(Float32, thread_count) From a0fbee87c515ad871d64a5e594e2e7681b8051e1 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 11 Mar 2019 15:18:27 +0100 Subject: [PATCH 053/146] Add a binary search tree example --- examples/binary-tree.jl | 158 ++++++++++++++++++++++++++++++++++++++++ examples/gc.jl | 3 +- 2 files changed, 159 insertions(+), 2 deletions(-) create mode 100644 examples/binary-tree.jl diff --git a/examples/binary-tree.jl b/examples/binary-tree.jl new file mode 100644 index 00000000..5fb0c19a --- /dev/null +++ b/examples/binary-tree.jl @@ -0,0 +1,158 @@ +using CUDAdrv, CUDAnative +using Random, Test +import Base: haskey, insert! + +# This example defines a kernel that constructs a binary search +# tree for a set of numbers and then proceeds to test membership +# in that tree for a sequence of other numbers. +# +# The main point of this example is to demonstrate that even +# naive, pointer-chasing programs can be compiled to GPU kernels. + +"""A binary search tree node.""" +abstract type BinarySearchTreeNode{T} end + +"""An internal node of a binary search tree.""" +mutable struct InternalNode{T} <: BinarySearchTreeNode{T} + value::T + left::BinarySearchTreeNode{T} + right::BinarySearchTreeNode{T} +end + +InternalNode{T}(value::T) where T = InternalNode{T}(value, LeafNode{T}(), LeafNode{T}()) + +"""A leaf node of a binary search tree.""" +mutable struct LeafNode{T} <: BinarySearchTreeNode{T} end + +"""A binary search tree data structure.""" +mutable struct BinarySearchTree{T} + root::BinarySearchTreeNode{T} +end + +"""Creates an empty binary search tree.""" +BinarySearchTree{T}() where T = BinarySearchTree{T}(LeafNode{T}()) + +"""Tells if a binary search tree contains a particular element.""" +function haskey(tree::BinarySearchTree{T}, value::T)::Bool where T + walk = tree.root + while isa(walk, InternalNode{T}) + if walk.value == value + return true + elseif walk.value > value + walk = walk.right + else + walk = walk.left + end + 
end + return false +end + +"""Inserts an element into a binary search tree.""" +function insert!(tree::BinarySearchTree{T}, value::T) where T + if !isa(tree.root, InternalNode{T}) + tree.root = InternalNode{T}(value) + return + end + + walk = tree.root::InternalNode{T} + while true + if walk.value == value + return + elseif walk.value > value + right = walk.right + if isa(right, InternalNode{T}) + walk = right + else + walk.right = InternalNode{T}(value) + return + end + else + left = walk.left + if isa(left, InternalNode{T}) + walk = left + else + walk.left = InternalNode{T}(value) + return + end + end + end +end + +""" +Creates a binary search tree that contains elements copied from a device array. +""" +function BinarySearchTree{T}(elements::CUDAnative.DevicePtr{T}, size::Integer) where T + tree = BinarySearchTree{T}() + for i in 1:size + insert!(tree, unsafe_load(elements, i)) + end + tree +end + +""" +Creates a binary search tree that contains elements copied from an array. +""" +function BinarySearchTree{T}(elements::Array{T}) where T + tree = BinarySearchTree{T}() + for i in 1:length(elements) + insert!(tree, elements[i]) + end + tree +end + +# Gets a sequence of Fibonacci numbers. +function fibonacci(::Type{T}, count::Integer)::Array{T} where T + if count == 0 + return [] + elseif count == 1 + return [one(T)] + end + + results = [one(T), one(T)] + for i in 1:(count - 2) + push!(results, results[length(results) - 1] + results[length(results)]) + end + return results +end + +const number_count = 2000 +const thread_count = 32 +const tests_per_thread = 2000 + +# Define a kernel that copies values using a temporary buffer. 
+function kernel(a::CUDAnative.DevicePtr{Int64}, b::CUDAnative.DevicePtr{Int64}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + tree = BinarySearchTree{Int64}(a, number_count) + + for j in 1:tests_per_thread + offset = (i - 1) * tests_per_thread + index = offset + j + unsafe_store!(b, haskey(tree, unsafe_load(b, index)), index) + end + + return +end + +# Generate a sequence of 64-bit truncated Fibonacci numbers. +number_set = fibonacci(Int64, number_count) +# Randomize the sequence's order. +shuffle!(number_set) + +# Generate numbers for which we will test membership in the sequence. +test_sequence = Array(1:(thread_count * tests_per_thread)) + +# Allocate two arrays. +source_array = Mem.alloc(Int64, length(number_set)) +destination_array = Mem.alloc(Int64, length(test_sequence)) +source_pointer = Base.unsafe_convert(CuPtr{Int64}, source_array) +destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + +# Fill the source and destination arrays. +Mem.upload!(source_array, number_set) +Mem.upload!(destination_array, test_sequence) + +# Run the kernel. 
+@cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) + +@test Mem.download(Int64, destination_array, length(test_sequence)) == ([Int64(x in number_set) for x in test_sequence]) diff --git a/examples/gc.jl b/examples/gc.jl index 211a2fb4..38e2ff7e 100644 --- a/examples/gc.jl +++ b/examples/gc.jl @@ -1,5 +1,4 @@ -using CUDAdrv, CUDAnative, LLVM -using InteractiveUtils +using CUDAdrv, CUDAnative using Test mutable struct TempStruct From 3e2a8ff6c3c4f87e678cfc72004edec30905e62d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 11 Mar 2019 18:03:23 +0100 Subject: [PATCH 054/146] Use local arenas to reduce GC lock contention --- src/gc.jl | 134 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 106 insertions(+), 28 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index a3e7d4b6..e29b4349 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -121,9 +121,6 @@ end # structure is designed to be immutable: it should not be changed # once the host has set it up. struct GCMasterRecord - # A pointer to the global GC arena. - global_arena::Ptr{GCArenaRecord} - # The number of warps. warp_count::UInt32 @@ -134,6 +131,15 @@ struct GCMasterRecord # of roots per thread. root_buffer_capacity::UInt32 + # The number of local arenas. + local_arena_count::UInt32 + + # A pointer to a list of local GC arena pointers. + local_arenas::Ptr{Ptr{GCArenaRecord}} + + # A pointer to the global GC arena. + global_arena::Ptr{GCArenaRecord} + # A pointer to a list of safepoint flags. Every warp has its # own flag. safepoint_flags::Ptr{SafepointState} @@ -149,6 +155,9 @@ end # Iterates through all arena pointers stored in a GC master record. 
@inline function iterate_arenas(fun::Function, master_record::GCMasterRecord) + for i in 1:master_record.local_arena_count + fun(unsafe_load(master_record.local_arenas, i)) + end fun(master_record.global_arena) end @@ -186,6 +195,19 @@ end return div(get_thread_id() - 1, warpsize()) + 1 end +# Gets a pointer to the local arena for this thread. This +# pointer may be null if there are no local arenas. +@inline function get_local_arena()::Ptr{GCArenaRecord} + master_record = get_gc_master_record() + if master_record.local_arena_count == UInt32(0) + return C_NULL + else + return unsafe_load( + master_record.local_arenas, + get_warp_id() % master_record.local_arena_count) + end +end + """ new_gc_frame(size::UInt32)::GCFrame @@ -411,7 +433,23 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} master_record = get_gc_master_record() # Try to malloc the object without host intervention. - ptr = @perma_safepoint @nocollect gc_malloc_local(master_record.global_arena, bytesize) + ptr = @perma_safepoint @nocollect begin + # Try to allocate in the local arena first. If that doesn't + # work, we'll move on to the global arena, which is bigger but + # is shared by all threads. (We want to minimize contention + # on the global arena's lock.) + local_arena = get_local_arena() + local_ptr = Base.unsafe_convert(Ptr{UInt8}, C_NULL) + if local_arena != C_NULL + local_ptr = gc_malloc_local(local_arena, bytesize) + end + + if local_ptr == C_NULL + gc_malloc_local(master_record.global_arena, bytesize) + else + local_ptr + end + end if ptr != C_NULL return ptr end @@ -423,6 +461,10 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} # first thread that acquires the interrupt lock, but it is quite # likely to succeed if we are *not* in the first thread that # acquired the garbage collector lock. + # + # Note: don't try to allocate in the local arena first because + # we have already acquired a device-wide lock. Allocating in + # the local arena first might waste precious time. 
ptr2 = gc_malloc_local(master_record.global_arena, bytesize) if ptr2 == C_NULL @@ -464,7 +506,7 @@ end # turn off collections. It can be called by the device, but in that # case it should be prefixed by the `@nocollect` macro followed by # a write lock acquisition on the arena's lock. -function gc_free_local_impl( +function gc_free_local( arena::Ptr{GCArenaRecord}, record_ptr::Ptr{Ptr{GCAllocationRecord}}) @@ -501,21 +543,32 @@ function gc_collect() writer_locked(gc_collect_impl, get_interrupt_lock()) end +# One megabyte. +const MiB = 1 << 20 + # The initial size of the GC heap, currently 16 MiB. -const initial_gc_heap_size = 16 * (1 << 20) +const initial_gc_heap_size = 16 * MiB # The default capacity of a root buffer, i.e., the max number of # roots that can be stored per thread. Currently set to # 256 roots. That's 2 KiB of roots per thread. const default_root_buffer_capacity = 256 -# The point at which an arena is deemed to be starving, i.e., +# The point at which the global arena is deemed to be starving, i.e., # it no longer contains enough memory to perform basic allocations. -# If an arena's free byte count stays below the arena starvation +# If the global arena's free byte count stays below the arena starvation # threshold after a collection phase, the collector will allocate # additional memory to the arena such that it is no longer starving. # The arena starvation threshold is currently set to 4 MiB. -const arena_starvation_threshold = 4 * (1 << 20) +const global_arena_starvation_threshold = 4 * MiB + +# The point at which a local arena is deemed to be starving, i.e., +# it no longer contains enough memory to perform basic allocations. +# If a local arena's free byte count stays below the arena starvation +# threshold after a collection phase, the collector will allocate +# additional memory to the arena such that it is no longer starving. +# The arena starvation threshold is currently set to 1 MiB. 
+const local_arena_starvation_threshold = 1 * MiB # A description of a region of memory that has been allocated to the GC heap. struct GCHeapRegion @@ -542,7 +595,8 @@ function gc_init!( heap::GCHeapDescription, thread_count::Integer; warp_count::Union{Integer, Nothing} = nothing, - root_buffer_capacity::Integer = default_root_buffer_capacity)::GCMasterRecord + root_buffer_capacity::Integer = default_root_buffer_capacity, + local_arena_count::Integer = 8)::GCMasterRecord if warp_count == nothing warp_count = thread_count / CUDAdrv.warpsize(device()) @@ -553,11 +607,15 @@ function gc_init!( gc_memory_start_ptr = master_region.start gc_memory_end_ptr = master_region.start + master_region.size - # Set up the safepoint flag buffer. + # Allocate a local arena pointer buffer. + local_arenas_bytesize = sizeof(Ptr{GCArenaRecord}) * local_arena_count + local_arenas_ptr = Base.unsafe_convert(Ptr{Ptr{GCArenaRecord}}, gc_memory_start_ptr) + + # Allocate the safepoint flag buffer. safepoint_bytesize = sizeof(SafepointState) * warp_count - safepoint_ptr = Base.unsafe_convert(Ptr{SafepointState}, gc_memory_start_ptr) + safepoint_ptr = Base.unsafe_convert(Ptr{SafepointState}, local_arenas_ptr + local_arenas_bytesize) - # Set up root buffers. + # Allocate root buffers. fingerbuf_bytesize = sizeof(Ptr{ObjectRef}) * thread_count fingerbuf_ptr = Base.unsafe_convert(Ptr{Ptr{ObjectRef}}, safepoint_ptr + fingerbuf_bytesize) rootbuf_bytesize = sizeof(ObjectRef) * root_buffer_capacity * thread_count @@ -568,25 +626,26 @@ function gc_init!( unsafe_store!(fingerbuf_ptr, rootbuf_ptr + (i - 1) * sizeof(ObjectRef) * root_buffer_capacity, i) end - # Compute a pointer to the start of the heap. - heap_start_ptr = rootbuf_ptr + rootbuf_bytesize + # Compute a pointer to the start of the first arena. + arena_start_ptr = rootbuf_ptr + rootbuf_bytesize - # Create a single free list entry. 
- first_entry_ptr = make_gc_block!( - heap_start_ptr + sizeof(GCArenaRecord), - Csize_t(gc_memory_end_ptr) - Csize_t(heap_start_ptr) - sizeof(GCArenaRecord)) + # Set up local arenas. + for i in 1:local_arena_count + local_arena = make_gc_arena!(arena_start_ptr, Csize_t(local_arena_starvation_threshold)) + unsafe_store!(local_arenas_ptr, local_arena, i) + arena_start_ptr += local_arena_starvation_threshold + end - # Set up the main GC data structure. - global_arena = Base.unsafe_convert(Ptr{GCArenaRecord}, heap_start_ptr) - unsafe_store!( - global_arena, - GCArenaRecord(0, first_entry_ptr, C_NULL)) + # Set up the global arena. + global_arena = make_gc_arena!(arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr)) return GCMasterRecord( - global_arena, UInt32(warp_count), UInt32(thread_count), root_buffer_capacity, + UInt32(local_arena_count), + local_arenas_ptr, + global_arena, safepoint_ptr, fingerbuf_ptr, rootbuf_ptr) @@ -604,6 +663,19 @@ function make_gc_block!(start_ptr::Ptr{T}, size::Csize_t)::Ptr{GCAllocationRecor return entry end +# Takes a zero-filled region of memory and turns it into an arena +# managed by the GC, prefixed with an arena record. +function make_gc_arena!(start_ptr::Ptr{T}, size::Csize_t)::Ptr{GCArenaRecord} where T + # Create a single free list entry. + first_entry_ptr = make_gc_block!(start_ptr + sizeof(GCArenaRecord), size - sizeof(GCArenaRecord)) + + # Set up the arena record. + arena = Base.unsafe_convert(Ptr{GCArenaRecord}, start_ptr) + unsafe_store!( + arena, + GCArenaRecord(0, first_entry_ptr, C_NULL)) +end + # Tells if a GC heap contains a particular pointer. function contains(heap::GCHeapDescription, pointer::Ptr{T}) where T for region in heap.regions @@ -854,7 +926,7 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) # We found a dead block. Release it. Don't proceed to the # next block because the current block will change in the # next iteration of this loop. 
- gc_free_local_impl(arena, record_ptr) + gc_free_local(arena, record_ptr) end end @@ -864,8 +936,14 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) # If the amount of free memory in the arena is below the starvation # limit then we'll expand the GC heap and add the additional memory # to the arena's free list. - if free_memory < arena_starvation_threshold - region = expand!(heap, arena_starvation_threshold) + threshold = if arena == master_record.global_arena + global_arena_starvation_threshold + else + local_arena_starvation_threshold + end + + if free_memory < threshold + region = expand!(heap, threshold) extra_record = make_gc_block!(region.start, region.size) last_free_list_ptr = @get_field_pointer(arena, :free_list_head) iterate_allocation_records(unsafe_load(last_free_list_ptr)) do record From 48eb3f539b555e5974c28c4eeff4e833714ba5b7 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 11 Mar 2019 18:42:00 +0100 Subject: [PATCH 055/146] Automatically insert perma-safepoints --- src/compiler/optim.jl | 69 ++++++++++++++++++++++++++++++++++++++++--- src/device/runtime.jl | 3 +- src/gc.jl | 27 ++++++++++++++--- 3 files changed, 90 insertions(+), 9 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index f29a2a0c..cdd127de 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -11,7 +11,24 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; int add_library_info!(pm, triple(mod)) add_transform_info!(pm, tm) if internalize - internalize!(pm, [LLVM.name(entry)]) + # We want to internalize functions so we can optimize + # them, but we don't really want to internalize globals + # because doing so may cause multiple copies of the same + # globals to appear after linking together modules. + # + # For example, the runtime library includes GC-related globals. 
+ # It is imperative that these globals are shared by all modules, + # but if they are internalized before they are linked then + # they will actually not be internalized. + # + # Also, don't internalize the entry point, for obvious reasons. + non_internalizable_names = [LLVM.name(entry)] + for val in globals(mod) + if isa(val, LLVM.GlobalVariable) + push!(non_internalizable_names, LLVM.name(val)) + end + end + internalize!(pm, non_internalizable_names) end end @@ -71,7 +88,7 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; int initialize!(pm) # lower intrinsics if ctx.gc - add!(pm, FunctionPass("InsertSafepointsGPUGC", insert_safepoints_gpugc!)) + add!(pm, FunctionPass("InsertSafepointsGPUGC", fun -> insert_safepoints_gpugc!(fun, entry))) add!(pm, ModulePass("FinalLowerGPUGC", lower_final_gc_intrinsics_gpugc!)) else add!(pm, ModulePass("FinalLowerNoGC", lower_final_gc_intrinsics_nogc!)) @@ -616,12 +633,16 @@ function is_non_intrinsic_call(instruction::LLVM.Instruction) end """ - insert_safepoints_gpugc!(fun::LLVM.Function) + insert_safepoints_gpugc!(fun::LLVM.Function, entry::LLVM.Function) An LLVM pass that inserts GC safepoints in such a way that threads reach a safepoint after a reasonable amount of time. + +Moreover, this pass also inserts perma-safepoints after entry point returns. +Perma-safepoints inform the GC that it doesn't need to wait for a warp to +reach a safepoint; inserting them stops the GC from deadlocking. """ -function insert_safepoints_gpugc!(fun::LLVM.Function) +function insert_safepoints_gpugc!(fun::LLVM.Function, entry::LLVM.Function) # Insert a safepoint before every function call, but only for # functions that manage a GC frame. # @@ -644,6 +665,46 @@ function insert_safepoints_gpugc!(fun::LLVM.Function) dispose(builder) end end + + # Insert perma-safepoints if necessary. + if fun == entry + # Looks like we're going to have to insert perma-safepoints. 
+        # We need to keep in mind that perma-safepoints are per-warp,
+        # so we absolutely cannot allow warps to be in a divergent
+        # state when a perma-safepoint is set---all bets are off if
+        # that happens anyway.
+        #
+        # To make sure that we don't end up in that situation,
+        # we will create a dedicated return block and replace all 'ret'
+        # instructions by jumps to that return block.
+
+        # Create the dedicated return block.
+        return_block = BasicBlock(fun, "kernel_exit")
+        let builder = Builder(JuliaContext())
+            position!(builder, return_block)
+            call!(builder, Runtime.get(:gc_perma_safepoint), LLVM.Value[])
+            ret!(builder)
+            dispose(builder)
+        end
+
+        # Rewrite return instructions as branches to the return block.
+        for block in blocks(fun)
+            if block == return_block
+                # We need to be careful not to trick ourselves into
+                # turning the return block's 'ret' into an infinite loop.
+                continue
+            end
+            term = terminator(block)
+            if isa(term, LLVM.RetInst)
+                unsafe_delete!(block, term)
+                let builder = Builder(JuliaContext())
+                    position!(builder, block)
+                    br!(builder, return_block)
+                    dispose(builder)
+                end
+            end
+        end
+    end
     return true
 end
 
diff --git a/src/device/runtime.jl b/src/device/runtime.jl
index 1c492369..27589633 100644
--- a/src/device/runtime.jl
+++ b/src/device/runtime.jl
@@ -261,7 +261,8 @@ compile(
     () -> convert(LLVMType, Cvoid),
     () -> [T_pprjlvalue()])
 
-# Also import the safepoint function.
+# Also import the safepoint and perma-safepoint functions.
 compile(CUDAnative.gc_safepoint, Cvoid, ())
+compile(CUDAnative.gc_perma_safepoint, Cvoid, ())
 
 end
diff --git a/src/gc.jl b/src/gc.jl
index e29b4349..60b375a9 100644
--- a/src/gc.jl
+++ b/src/gc.jl
@@ -263,6 +263,24 @@ function gc_safepoint()
     return
 end
 
+"""
+    gc_perma_safepoint()
+
+Signals that this warp has reached a GC perma-safepoint:
+the GC doesn't need to wait for this warp to reach a safepoint
+before starting collections. Instead, the GC may assume that
+the warp is already in a safepoint.
+ +Be careful with this function: all bets are off when this +function is used improperly. For a more controlled (but still +super dangerous) way to use perma-safepoints, see the +`@perma_safepoint` macro. +""" +function gc_perma_safepoint() + gc_set_safepoint_flag(in_perma_safepoint) + return +end + # Sets this warp's safepoint flag to a particular state. function gc_set_safepoint_flag(value::SafepointState) master_record = get_gc_master_record() @@ -277,7 +295,7 @@ end # to include non-perma-safepoints. macro perma_safepoint(expr) quote - gc_set_safepoint_flag(in_perma_safepoint) + gc_perma_safepoint() local result = $(esc(expr)) gc_set_safepoint_flag(not_in_safepoint) result @@ -972,9 +990,10 @@ end High-level interface for executing code on a GPU with GC support. The `@cuda_gc` macro should prefix a call, with `func` a callable function or object that should return nothing. It will be compiled to a CUDA function upon first -use, and to a certain extent arguments will be converted and anaged automatically using -`cudaconvert`. Finally, a call to `CUDAdrv.cudacall` is performed, scheduling a kernel -launch on the current CUDA context. +use, and to a certain extent arguments will be converted and managed automatically using +`cudaconvert`. Next, a call to `CUDAdrv.cudacall` is performed, scheduling a kernel +launch on the current CUDA context. Finally, `@cuda_gc` waits for the kernel to finish, +performing garbage collection in the meantime if necessary. Several keyword arguments are supported that influence kernel compilation and execution. 
For more information, refer to the documentation of respectively [`cufunction`](@ref) and From 4a634e40ed5ea957c12e291ff28234d66357ac5a Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 11 Mar 2019 18:57:46 +0100 Subject: [PATCH 056/146] Add a comprehensive GC test --- test/device/gc.jl | 70 +++++++++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 1 + 2 files changed, 71 insertions(+) create mode 100644 test/device/gc.jl diff --git a/test/device/gc.jl b/test/device/gc.jl new file mode 100644 index 00000000..1ec9b0fc --- /dev/null +++ b/test/device/gc.jl @@ -0,0 +1,70 @@ +@testset "gc" begin + +############################################################################################ + +dummy() = return + +dummy_handler(kernel) = return + +@testset "@cuda_gc" begin + +@testset "allocate and collect" begin + # This test allocates many very small and very large objects. Both the small + # and large objects become garbage eventually, but small objects need to + # outlive the large objects (and not be collected erroneously) for the test + # to pass. So essentially this test tackles three things: + # + # 1. Allocation works. + # 2. Collection works. + # 3. Collection isn't gung-ho to the point of incorrectness. + # + + mutable struct TempStruct + data::Float32 + end + + @noinline function escape(val) + Base.pointer_from_objref(val) + end + + # Define a kernel that copies values using a temporary struct. + function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + for j in 1:2 + # Allocate a mutable struct and make sure it ends up on the GC heap. + temp = TempStruct(unsafe_load(a, i)) + escape(temp) + + # Allocate a large garbage buffer to force collections. + gc_malloc(Csize_t(256 * 1024)) + + # Use the mutable struct. If its memory has been reclaimed (by accident) + # then we expect the test at the end of this file to fail. 
+ unsafe_store!(b, temp.data, i) + end + + return + end + + thread_count = 64 + + # Allocate two arrays. + source_array = Mem.alloc(Float32, thread_count) + destination_array = Mem.alloc(Float32, thread_count) + source_pointer = Base.unsafe_convert(CuPtr{Float32}, source_array) + destination_pointer = Base.unsafe_convert(CuPtr{Float32}, destination_array) + + # Fill the source and destination arrays. + Mem.upload!(source_array, fill(42.f0, thread_count)) + Mem.upload!(destination_array, zeros(Float32, thread_count)) + + # Run the kernel. + @cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) + + @test Mem.download(Float32, destination_array, thread_count) == fill(42.f0, thread_count) +end + +end + +end diff --git a/test/runtests.jl b/test/runtests.jl index 05e1687f..6cac0eb5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -71,6 +71,7 @@ if CUDAnative.configured include("device/cuda.jl") include("device/intrinsics.jl") include("device/threading.jl") + include("device/gc.jl") #include("examples.jl") end From d1ce8c7f57e10b559ac5ebee953e0594b12927df Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 15 Mar 2019 15:03:06 +0100 Subject: [PATCH 057/146] Do not serialize warps for reader locks --- src/device/threading.jl | 38 ++++++++++++++++++-------------------- src/gc.jl | 2 +- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/src/device/threading.jl b/src/device/threading.jl index 8bbeadf9..8723ebe4 100644 --- a/src/device/threading.jl +++ b/src/device/threading.jl @@ -107,28 +107,26 @@ Acquires a reader-writer lock in reader mode, runs `func` while the lock is acquired and releases the lock again. """ function reader_locked(func::Function, lock::ReaderWriterLock) - warp_serialized() do - while true - # Increment the reader count. If the lock is in write-acquired mode, - # then the lock will stay in that mode (unless the reader count is - # exceeded, but that is virtually impossible). 
Otherwise, the lock - # will end up in read-acquired mode. - previous_state = atomic_add!(lock.state_ptr, 1) - - # If the lock was in the idle or read-acquired state, then - # it is now in read-acquired mode. - if previous_state >= 0 - # Run the function. - result = func() - # Decrement the reader count to release the reader lock. - atomic_add!(lock.state_ptr, -1) - # We're done here. - return result - end - - # Decrement the reader count and try again. + while true + # Increment the reader count. If the lock is in write-acquired mode, + # then the lock will stay in that mode (unless the reader count is + # exceeded, but that is virtually impossible). Otherwise, the lock + # will end up in read-acquired mode. + previous_state = atomic_add!(lock.state_ptr, 1) + + # If the lock was in the idle or read-acquired state, then + # it is now in read-acquired mode. + if previous_state >= 0 + # Run the function. + result = func() + # Decrement the reader count to release the reader lock. atomic_add!(lock.state_ptr, -1) + # We're done here. + return result end + + # Decrement the reader count and try again. 
+ atomic_add!(lock.state_ptr, -1) end end diff --git a/src/gc.jl b/src/gc.jl index 60b375a9..820bab11 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -204,7 +204,7 @@ end else return unsafe_load( master_record.local_arenas, - get_warp_id() % master_record.local_arena_count) + get_thread_id() % master_record.local_arena_count) end end From 513168118b3903c755b1a651431ef5bef1ffd124 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 15 Mar 2019 18:25:19 +0100 Subject: [PATCH 058/146] Define a GPU Mutex type --- examples/gc.jl | 12 +++- examples/lock.jl | 25 ++++++--- src/device/threading.jl | 119 +++++++++++++++++++++++++++++++++++++--- src/gc.jl | 10 ---- 4 files changed, 137 insertions(+), 29 deletions(-) diff --git a/examples/gc.jl b/examples/gc.jl index 38e2ff7e..51fe758e 100644 --- a/examples/gc.jl +++ b/examples/gc.jl @@ -9,14 +9,20 @@ end Base.pointer_from_objref(val) end -# Define a kernel that copies values using a temporary buffer. +# Define a kernel that copies values using a temporary struct. function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32}) - i = (blockIdx().x-1) * blockDim().x + threadIdx().x + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - for j in 1:256 + for j in 1:2 # Allocate a mutable struct and make sure it ends up on the GC heap. temp = TempStruct(unsafe_load(a, i)) escape(temp) + + # Allocate a large garbage buffer to force collections. + gc_malloc(Csize_t(256 * 1024)) + + # Use the mutable struct. If its memory has been reclaimed (by accident) + # then we expect the test at the end of this file to fail. unsafe_store!(b, temp.data, i) end diff --git a/examples/lock.jl b/examples/lock.jl index b4269a7b..1e06efdb 100644 --- a/examples/lock.jl +++ b/examples/lock.jl @@ -1,13 +1,20 @@ using CUDAdrv, CUDAnative using Test -thread_count = 128 +const thread_count = Int32(128) +const total_count = Int32(1024) # Define a kernel that atomically increments a counter using a lock. 
-function increment_counter(counter::CUDAnative.DevicePtr{Int32}, lock_state::CUDAnative.DevicePtr{CUDAnative.ReaderWriterLockState}) - lock = ReaderWriterLock(lock_state) - writer_locked(lock) do - unsafe_store!(counter, unsafe_load(counter) + 1) +function increment_counter(counter::CUDAnative.DevicePtr{Int32}, lock_state::CUDAnative.DevicePtr{CUDAnative.MutexState}) + lock = Mutex(lock_state) + done = false + while !done && try_lock(lock) + new_count = unsafe_load(counter) + 1 + unsafe_store!(counter, new_count) + if new_count == total_count + done = true + end + CUDAnative.unlock(lock) end return end @@ -17,9 +24,9 @@ counter_buf = Mem.alloc(sizeof(Int32)) Mem.upload!(counter_buf, [Int32(0)]) counter_pointer = Base.unsafe_convert(CuPtr{Int32}, counter_buf) -lock_buf = Mem.alloc(sizeof(CUDAnative.ReaderWriterLockState)) -Mem.upload!(lock_buf, [CUDAnative.ReaderWriterLockState(0)]) -lock_pointer = Base.unsafe_convert(CuPtr{CUDAnative.ReaderWriterLockState}, lock_buf) +lock_buf = Mem.alloc(sizeof(CUDAnative.MutexState)) +Mem.upload!(lock_buf, [CUDAnative.MutexState(0)]) +lock_pointer = Base.unsafe_convert(CuPtr{CUDAnative.MutexState}, lock_buf) # @device_code_warntype increment_counter(counter_pointer, lock_pointer) @@ -28,4 +35,4 @@ lock_pointer = Base.unsafe_convert(CuPtr{CUDAnative.ReaderWriterLockState}, lock # Check that the counter's final value equals the number # of threads. -@test Mem.download(Int32, counter_buf) == [Int32(thread_count)] +@test Mem.download(Int32, counter_buf) == [Int32(total_count)] diff --git a/src/device/threading.jl b/src/device/threading.jl index 8723ebe4..951c20e8 100644 --- a/src/device/threading.jl +++ b/src/device/threading.jl @@ -1,6 +1,6 @@ # This file implements threading primitives that work for CUDAnative kernels. -export ReaderWriterLock, reader_locked, writer_locked +export ReaderWriterLock, reader_locked, writer_locked, Mutex, try_lock, unlock # Gets a pointer to a global with a particular name. 
If the global # does not exist yet, then it is declared in the global memory address @@ -17,19 +17,35 @@ export ReaderWriterLock, reader_locked, writer_locked :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T, $T}, ptr, cmp, new)) end -# Atomically adds a value to a variable pointed to by a pointer. -# Returns the previous value stored in that value. -@generated function atomic_add!(lhs::Ptr{T}, rhs::T)::T where T +@generated function atomic_rmw!(::Val{op}, lhs::Ptr{T}, rhs::T)::T where {op, T} ptr_type = convert(LLVMType, Ptr{T}) lt = string(convert(LLVMType, T)) ir = """ %ptr = inttoptr $ptr_type %0 to $lt* - %rv = atomicrmw volatile add $lt* %ptr, $lt %1 seq_cst + %rv = atomicrmw volatile $(String(op)) $lt* %ptr, $lt %1 seq_cst ret $lt %rv """ :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T}, lhs, rhs)) end +# Atomically adds a value to a variable pointed to by a pointer. +# Returns the previous value stored in that variable. +function atomic_add!(lhs::Ptr{T}, rhs::T)::T where T + atomic_rmw!(Val(:add), lhs, rhs) +end + +# Atomically computes the logical or of a value and a variable pointed +# to by a pointer. Returns the previous value stored in that variable. +function atomic_or!(lhs::Ptr{T}, rhs::T)::T where T + atomic_rmw!(Val(:or), lhs, rhs) +end + +# Atomically assigns a new value to a variable pointed to by a pointer. +# Returns the previous value stored in that variable. +function atomic_exchange!(lhs::Ptr{T}, rhs::T)::T where T + atomic_rmw!(Val(:xchg), lhs, rhs) +end + # Loads a value from a pointer. 
@generated function volatile_load(ptr::Ptr{T})::T where T ptr_type = string(convert(LLVMType, Ptr{T})) @@ -54,6 +70,10 @@ end :(Core.Intrinsics.llvmcall($ir, Cvoid, Tuple{$(Ptr{T}), $T}, ptr, value)) end +function unwrap_device_ptr(ptr::DevicePtr{T, A})::Ptr{T} where {T, A} + convert(Ptr{T}, convert(Csize_t, ptr)) +end + const ReaderWriterLockState = Int64 """ @@ -75,8 +95,8 @@ struct ReaderWriterLock state_ptr::Ptr{ReaderWriterLockState} end -ReaderWriterLock(state_ptr::DevicePtr{ReaderWriterLockState}) = ReaderWriterLock( - convert(Ptr{ReaderWriterLockState}, convert(Csize_t, state_ptr))) +ReaderWriterLock(state_ptr::DevicePtr{ReaderWriterLockState}) = + ReaderWriterLock(unwrap_device_ptr(state_ptr)) const max_rw_lock_readers = (1 << (sizeof(ReaderWriterLockState) * 8 - 1)) @@ -155,3 +175,88 @@ function writer_locked(func::Function, lock::ReaderWriterLock) return result end end + +# Gets the thread ID of the current thread. +@inline function get_thread_id() + return (blockIdx().x - 1) * blockDim().x + threadIdx().x +end + +# Gets the warp ID of the current thread. +@inline function get_warp_id() + return div(get_thread_id() - 1, warpsize()) + 1 +end + +const MutexState = UInt32 + +""" +A mutex: a lock that guarantees mutual exclusion. +""" +struct Mutex + # This GPU mutex implementation is based on + # Lock-based Synchronization for GPU Architectures + # by Yunlong Xu et al. + state_ptr::Ptr{MutexState} +end + +Mutex(state_ptr::DevicePtr{MutexState}) = + Mutex(unwrap_device_ptr(state_ptr)) + +""" + unlock(mutex::Mutex) + +Unlocks a mutex. +""" +function unlock(mutex::Mutex) + threadfence() + tid = get_thread_id() + atomic_compare_exchange!(mutex.state_ptr, UInt32((tid << 1) + 1), UInt32(0)) + return +end + +""" + try_lock(mutex::Mutex)::Bool + +Tries to acquire a lock on a mutex. Returns `true` +if a lock was acquired successfully; otherwise, `false`. 
+""" +function try_lock(mutex::Mutex)::Bool + tid = UInt32(get_thread_id()) + wsize = warpsize() + threadbit = UInt32(1) << (tid % wsize) + + mask = vote_ballot(true) + + bitset = @cuStaticSharedMem(UInt32, 128) + bitset_ptr = unwrap_device_ptr(pointer(bitset)) + sizeof(UInt32) * div(threadIdx().x - 1, wsize) + unsafe_store!(bitset_ptr, UInt32(0)) + + lock = atomic_or!(mutex.state_ptr, UInt32(1)) + if lock & UInt32(1) == UInt32(0) + # The lock is free. + atomic_exchange!(mutex.state_ptr, UInt32((tid << 1) + 1)) + else + pre_owner = lock >> 1 + if pre_owner != tid + if div(lock, wsize << 1) == div(tid, wsize) && pre_owner > tid && (((mask >> (pre_owner % wsize)) & UInt32(1)) == UInt32(1)) + atomic_or!(bitset_ptr, UInt32(1 << (pre_owner % wsize))) + atomic_exchange!(mutex.state_ptr, UInt32((tid << 1) + 1)) + if (atomic_or!(mutex.state_ptr, UInt32(0)) >> 1) != tid + # Stealing failed. + atomic_or!(bitset_ptr, threadbit) + end + else + # Cannot steal. + atomic_or!(bitset_ptr, threadbit) + end + end + end + + if (unsafe_load(bitset_ptr) & threadbit) == UInt32(0) + threadfence() + return true + else + atomic_compare_exchange!(mutex.state_ptr, (tid << 1) + UInt32(1), UInt32(0)) + threadfence() + return false + end +end diff --git a/src/gc.jl b/src/gc.jl index 820bab11..193d6111 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -185,16 +185,6 @@ end return unsafe_load(@cuda_global_ptr("gc_master_record", GCMasterRecord)) end -# Gets the thread ID of the current thread. -@inline function get_thread_id() - return (blockIdx().x - 1) * blockDim().x + threadIdx().x -end - -# Gets the warp ID of the current thread. -@inline function get_warp_id() - return div(get_thread_id() - 1, warpsize()) + 1 -end - # Gets a pointer to the local arena for this thread. This # pointer may be null if there are no local arenas. 
@inline function get_local_arena()::Ptr{GCArenaRecord} From ce75ce85063bc3176b53449fe02b0644e07fe8ac Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Sun, 17 Mar 2019 18:23:59 +0100 Subject: [PATCH 059/146] Collect GC statistics --- src/gc.jl | 255 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 150 insertions(+), 105 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 193d6111..b3f01c10 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -45,7 +45,8 @@ export @cuda_gc, gc_malloc, gc_collect, gc_safepoint -import Base: length +import Base: length, show +import Printf: @sprintf # A data structure that precedes every chunk of memory that has been # allocated or put into the free list. @@ -607,7 +608,7 @@ function gc_init!( local_arena_count::Integer = 8)::GCMasterRecord if warp_count == nothing - warp_count = thread_count / CUDAdrv.warpsize(device()) + warp_count = Base.ceil(UInt32, thread_count / CUDAdrv.warpsize(device())) end master_region = heap.regions[1] @@ -648,7 +649,7 @@ function gc_init!( global_arena = make_gc_arena!(arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr)) return GCMasterRecord( - UInt32(warp_count), + warp_count, UInt32(thread_count), root_buffer_capacity, UInt32(local_arena_count), @@ -841,125 +842,165 @@ function gc_compact_free_list(arena::Ptr{GCArenaRecord})::Csize_t return sum(record -> unsafe_load(record).size, records) end +"""A report of the GC's actions.""" +mutable struct GCReport + """The total wall-clock time of a kernel execution.""" + elapsed_time::Float64 + + """The number of collections that were performed.""" + collection_count::Int + + """The total wall-clock time of all collections.""" + collection_time::Float64 + + """The total amount of additional memory allocated to local pools.""" + extra_local_memory::Csize_t + + """The total amount of additional memory allocated to the global pool.""" + extra_global_memory::Csize_t + + GCReport() = new(0.0, 0, 0.0, Csize_t(0), Csize_t(0)) +end + 
+function show(io::IO, report::GCReport) + print(io, "[wall-clock time: $(@sprintf("%.4f", report.elapsed_time)) s; ") + print(io, "collections: $(report.collection_count); ") + collection_percentage = 100 * report.collection_time / report.elapsed_time + print(io, "total collection time: $(@sprintf("%.4f", report.collection_time)) s ($(@sprintf("%.2f", collection_percentage))%); ") + print(io, "extra local memory: $(div(report.extra_local_memory, MiB)) MiB; ") + print(io, "extra global memory: $(div(report.extra_global_memory, MiB)) MiB]") +end + # Collects garbage. This function is designed to be called by the host, # not by the device. -function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) - # First off, we have to wait for all warps to reach a safepoint. Clear - # safepoint flags and wait for warps to set them again. - for i in 0:(master_record.warp_count - 1) - atomic_compare_exchange!( - master_record.safepoint_flags + i * sizeof(SafepointState), - in_safepoint, - not_in_safepoint) - end - safepoint_count = 0 - while safepoint_count != master_record.warp_count - safepoint_count = 0 +function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, report::GCReport) + collection_time = Base.@elapsed begin + # First off, we have to wait for all warps to reach a safepoint. Clear + # safepoint flags and wait for warps to set them again. 
for i in 0:(master_record.warp_count - 1) - state = volatile_load(master_record.safepoint_flags + i * sizeof(SafepointState)) - if state != not_in_safepoint - safepoint_count += 1 + atomic_compare_exchange!( + master_record.safepoint_flags + i * sizeof(SafepointState), + in_safepoint, + not_in_safepoint) + end + safepoint_count = 0 + while safepoint_count != master_record.warp_count + safepoint_count = 0 + for i in 0:(master_record.warp_count - 1) + state = volatile_load(master_record.safepoint_flags + i * sizeof(SafepointState)) + if state != not_in_safepoint + safepoint_count += 1 + end end end - end - # The Julia CPU GC is precise and the information it uses for precise - # garbage collection is stored in memory that we should be able to access. - # However, the way the CPU GC stores field information is incredibly - # complicated and replicating that logic here would be a royal pain to - # implement and maintain. Ideally, the CPU GC would expose an interface that - # allows us to point to an object and ask the GC for all GC-tracked pointers - # it contains. Alas, no such luck: the CPU GC doesn't even have an internal - # function that does that. The CPU GC's logic for finding GC-tracked pointer - # fields is instead fused tightly with its 'mark' loop. - # - # To cope with this, we will simply implement a semi-conservative GC: we precisely - # scan the roots for pointers into the GC heap. We then recursively mark blocks - # that are pointed to by such pointers as live and conservatively scan them for - # more pointers. - # - # Our mark phase is fairly simple: we maintain a worklist of pointers that - # are live and may need to be processed, as well as a set of blocks that are - # live and have already been processed. - live_blocks = Set{Ptr{GCAllocationRecord}}() - live_worklist = Ptr{ObjectRef}[] - - # Get a sorted allocation list, which will allow us to classify live pointers quickly. 
- alloc_list = sort_allocation_list(master_record) - - # Add all roots to the worklist. - for i in 1:(master_record.root_buffer_capacity * master_record.thread_count) - root = unsafe_load(master_record.root_buffers, i) - if root != C_NULL - push!(live_worklist, root) + # The Julia CPU GC is precise and the information it uses for precise + # garbage collection is stored in memory that we should be able to access. + # However, the way the CPU GC stores field information is incredibly + # complicated and replicating that logic here would be a royal pain to + # implement and maintain. Ideally, the CPU GC would expose an interface that + # allows us to point to an object and ask the GC for all GC-tracked pointers + # it contains. Alas, no such luck: the CPU GC doesn't even have an internal + # function that does that. The CPU GC's logic for finding GC-tracked pointer + # fields is instead fused tightly with its 'mark' loop. + # + # To cope with this, we will simply implement a semi-conservative GC: we precisely + # scan the roots for pointers into the GC heap. We then recursively mark blocks + # that are pointed to by such pointers as live and conservatively scan them for + # more pointers. + # + # Our mark phase is fairly simple: we maintain a worklist of pointers that + # are live and may need to be processed, as well as a set of blocks that are + # live and have already been processed. + live_blocks = Set{Ptr{GCAllocationRecord}}() + live_worklist = Ptr{ObjectRef}[] + + # Get a sorted allocation list, which will allow us to classify live pointers quickly. + alloc_list = sort_allocation_list(master_record) + + # Add all roots to the worklist. + for i in 1:(master_record.root_buffer_capacity * master_record.thread_count) + root = unsafe_load(master_record.root_buffers, i) + if root != C_NULL + push!(live_worklist, root) + end end - end - # Now process all live pointers until we reach a fixpoint. - while !isempty(live_worklist) - # Pop a pointer from the worklist. 
- object_ref = pop!(live_worklist) - # Get the block for that pointer. - record = get_record(alloc_list, object_ref) - # Make sure that we haven't visited the block yet. - if record != C_NULL && !(record in live_blocks) - # Mark the block as live. - push!(live_blocks, record) - # Add all pointer-sized, aligned values to the live pointer worklist. - block_pointer = data_pointer(record) - block_size = unsafe_load(record).size - for i in 0:sizeof(ObjectRef):(block_size - 1) - push!(live_worklist, Base.unsafe_convert(ObjectRef, block_pointer + i)) + # Now process all live pointers until we reach a fixpoint. + while !isempty(live_worklist) + # Pop a pointer from the worklist. + object_ref = pop!(live_worklist) + # Get the block for that pointer. + record = get_record(alloc_list, object_ref) + # Make sure that we haven't visited the block yet. + if record != C_NULL && !(record in live_blocks) + # Mark the block as live. + push!(live_blocks, record) + # Add all pointer-sized, aligned values to the live pointer worklist. + block_pointer = data_pointer(record) + block_size = unsafe_load(record).size + for i in 0:sizeof(ObjectRef):(block_size - 1) + push!(live_worklist, Base.unsafe_convert(ObjectRef, block_pointer + i)) + end end end - end - # We're done with the mark phase! Time to proceed to the sweep phase. - # The first thing we'll do is iterate through every arena's allocation list and - # free dead blocks. Next, we will compact and reorder free lists to combat - # fragmentation. - iterate_arenas(master_record) do arena - record_ptr = @get_field_pointer(arena, :allocation_list_head) - while true - record = unsafe_load(record_ptr) - if record == C_NULL - # We've reached the end of the list. - break + # We're done with the mark phase! Time to proceed to the sweep phase. + # The first thing we'll do is iterate through every arena's allocation list and + # free dead blocks. Next, we will compact and reorder free lists to combat + # fragmentation. 
+ iterate_arenas(master_record) do arena + record_ptr = @get_field_pointer(arena, :allocation_list_head) + while true + record = unsafe_load(record_ptr) + if record == C_NULL + # We've reached the end of the list. + break + end + + if record in live_blocks + # We found a live block. Proceed to the next block. + record_ptr = @get_field_pointer(record, :next) + else + # We found a dead block. Release it. Don't proceed to the + # next block because the current block will change in the + # next iteration of this loop. + gc_free_local(arena, record_ptr) + end end - if record in live_blocks - # We found a live block. Proceed to the next block. - record_ptr = @get_field_pointer(record, :next) + # Compact the free list. + free_memory = gc_compact_free_list(arena) + + # If the amount of free memory in the arena is below the starvation + # limit then we'll expand the GC heap and add the additional memory + # to the arena's free list. + threshold = if arena == master_record.global_arena + global_arena_starvation_threshold else - # We found a dead block. Release it. Don't proceed to the - # next block because the current block will change in the - # next iteration of this loop. - gc_free_local(arena, record_ptr) + local_arena_starvation_threshold end - end - # Compact the free list. - free_memory = gc_compact_free_list(arena) + if free_memory < threshold + region = expand!(heap, threshold) - # If the amount of free memory in the arena is below the starvation - # limit then we'll expand the GC heap and add the additional memory - # to the arena's free list. 
- threshold = if arena == master_record.global_arena - global_arena_starvation_threshold - else - local_arena_starvation_threshold - end + if arena == master_record.global_arena + report.extra_global_memory += Csize_t(threshold) + else + report.extra_local_memory += Csize_t(threshold) + end - if free_memory < threshold - region = expand!(heap, threshold) - extra_record = make_gc_block!(region.start, region.size) - last_free_list_ptr = @get_field_pointer(arena, :free_list_head) - iterate_allocation_records(unsafe_load(last_free_list_ptr)) do record - last_free_list_ptr = @get_field_pointer(record, :next) + extra_record = make_gc_block!(region.start, region.size) + last_free_list_ptr = @get_field_pointer(arena, :free_list_head) + iterate_allocation_records(unsafe_load(last_free_list_ptr)) do record + last_free_list_ptr = @get_field_pointer(record, :next) + end + unsafe_store!(last_free_list_ptr, extra_record) end - unsafe_store!(last_free_list_ptr, extra_record) end end + report.collection_count += 1 + report.collection_time += collection_time end # Examines a keyword argument list and gets either the value @@ -1056,8 +1097,9 @@ macro cuda_gc(ex...) end end + local gc_report = GCReport() local function handle_interrupt() - gc_collect_impl(master_record, gc_heap) + gc_collect_impl(master_record, gc_heap, gc_report) end try @@ -1066,14 +1108,17 @@ macro cuda_gc(ex...) local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; gc = true, $(map(esc, compiler_kwargs)...)) CUDAnative.prepare_kernel(kernel; init=kernel_init, $(map(esc, env_kwargs)...)) - kernel(kernel_args...; $(map(esc, call_kwargs)...)) + gc_report.elapsed_time = Base.@elapsed begin + kernel(kernel_args...; $(map(esc, call_kwargs)...)) - # Handle interrupts. - handle_interrupts(handle_interrupt, pointer(host_interrupt_array, 1), $(esc(stream))) + # Handle interrupts. 
+ handle_interrupts(handle_interrupt, pointer(host_interrupt_array, 1), $(esc(stream))) + end finally free_shared_array(host_interrupt_array) free!(gc_heap) end + gc_report end end) return code From 8745e96ee4ebd2e0c96a052e44a35757da1218cb Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Sun, 17 Mar 2019 18:24:52 +0100 Subject: [PATCH 060/146] Add a matrix example --- examples/matrix.jl | 134 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 examples/matrix.jl diff --git a/examples/matrix.jl b/examples/matrix.jl new file mode 100644 index 00000000..a787171f --- /dev/null +++ b/examples/matrix.jl @@ -0,0 +1,134 @@ +# This example has kernels allocate dense symmetric matrices, fill them with Fibonacci numbers +# and compute their squares. The example is designed to stress the garbage allocator, specifically +# testing its ability to deal with many large objects. Furthermore, the example requires multiple +# collections to run to completion, so it also tests the performance of those collections. + +using StaticArrays, CUDAnative, CUDAdrv +import Base: getindex, setindex!, pointer, unsafe_convert, zeros +using InteractiveUtils + +const use_gc = true + +"""A fixed-size, heap-allocated array type for CUDAnative kernels.""" +struct FixedArray{T} + # The number of elements in the array. + size::Int + + # A pointer to the first element in the array. + # + # TODO: maybe protect this pointer from the GC somehow? + # At the moment, this pointer is protected automatically + # because the GC is conservative rather than precise. + ptr::Ptr{T} +end + +"""Allocates a heap-allocated array type and fills it with zeros.""" +function zeros(::Type{FixedArray{T}}, size::Int) where T + # Note: GC memory is always zero-initialized, so we don't + # actually have to fill the array with zeros. + bytesize = Csize_t(sizeof(T) * size) + buf = use_gc ? 
gc_malloc(bytesize) : CUDAnative.malloc(bytesize) + FixedArray{T}(size, unsafe_convert(Ptr{T}, buf)) +end + +"""Gets a pointer to the first element of a fixed-size array.""" +function pointer(array::FixedArray{T})::Ptr{T} where T + array.ptr +end + +function getindex(array::FixedArray{T}, i::Integer)::T where T + # TODO: bounds checking. + unsafe_load(pointer(array), i) +end + +function setindex!(array::FixedArray{T}, value::T, i::Integer) where T + # TODO: bounds checking. + unsafe_store!(pointer(array), value, i) +end + +"""A heap-allocated matrix type, suitable for CUDAnative kernels.""" +struct Matrix{Width, Height, T} + data::FixedArray{T} +end + +Matrix{Width, Height, T}() where {Width, Height, T} = + Matrix{Width, Height, T}(zeros(FixedArray{T}, Width * Height)) + +function pointer(matrix::Matrix{Width, Height, T})::Ptr{T} where {Width, Height, T} + pointer(matrix.data) +end + +function getindex(matrix::Matrix{Width, Height, T}, row::Int, column::Int) where {Width, Height, T} + getindex(matrix.data, (row - 1) * Width + column) +end + +function setindex!(matrix::Matrix{Width, Height, T}, value::T, row::Int, column::Int) where {Width, Height, T} + setindex!(matrix.data, value, (row - 1) * Width + column) +end + +const matrix_dim = 50 +const iterations = 20 +const thread_count = 256 + +function kernel(result::CUDAnative.DevicePtr{Int64}) + thread_id = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + accumulator = 0 + + for _ in 1:iterations + # Allocate a matrix. + matrix = Matrix{matrix_dim, matrix_dim, Int64}() + + # Fill it with Fibonacci numbers. + penultimate = 0 + ultimate = 1 + for i in 1:matrix_dim + for j in 1:matrix_dim + matrix[i, j] = ultimate + tmp = ultimate + ultimate = ultimate + penultimate + penultimate = tmp + end + end + + # Create a new element that contains the square of + # every element in `matrix`. 
+ square = Matrix{matrix_dim, matrix_dim, Int64}() + for i in 1:matrix_dim + for j in 1:matrix_dim + square[i, j] = matrix[i, j] ^ 2 + end + end + + # Compute the sum of the squares. + square_sum = 0 + for i in 1:matrix_dim + for j in 1:matrix_dim + square_sum += square[i, j] + end + end + + # Add that sum to an accumulator. + accumulator += square_sum + end + + # Write the accumulator to the result array. + unsafe_store!(result, accumulator, thread_id) + + return +end + +destination_array = Mem.alloc(Int64, thread_count) +destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + +if use_gc + time = @cuda_gc threads=thread_count kernel(destination_pointer) + println(time) + time = @cuda_gc threads=thread_count kernel(destination_pointer) + println(time) +else + time = CUDAdrv.@elapsed @cuda threads=thread_count kernel(destination_pointer) + println(time) + time = CUDAdrv.@elapsed @cuda threads=thread_count kernel(destination_pointer) + println(time) +end From cc11f577b527bd0e7e032776eac3decc6d5487e3 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Sun, 17 Mar 2019 18:28:29 +0100 Subject: [PATCH 061/146] Amend binary tree example with a no-gc mode --- examples/binary-tree.jl | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/examples/binary-tree.jl b/examples/binary-tree.jl index 5fb0c19a..46db7d38 100644 --- a/examples/binary-tree.jl +++ b/examples/binary-tree.jl @@ -9,6 +9,8 @@ import Base: haskey, insert! # The main point of this example is to demonstrate that even # naive, pointer-chasing programs can be compiled to GPU kernels. 
+const use_gc = true + """A binary search tree node.""" abstract type BinarySearchTreeNode{T} end @@ -115,8 +117,8 @@ function fibonacci(::Type{T}, count::Integer)::Array{T} where T return results end -const number_count = 2000 -const thread_count = 32 +const number_count = 200 +const thread_count = 64 const tests_per_thread = 2000 # Define a kernel that copies values using a temporary buffer. @@ -152,7 +154,21 @@ destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) Mem.upload!(source_array, number_set) Mem.upload!(destination_array, test_sequence) -# Run the kernel. -@cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) +if use_gc + # Run the kernel. + @cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) + + # Run it again. + Mem.upload!(destination_array, test_sequence) + stats = @cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) +else + # Run the kernel. + @cuda threads=thread_count kernel(source_pointer, destination_pointer) + + # Run it again and time it this time. 
+ Mem.upload!(destination_array, test_sequence) + stats = CUDAdrv.@elapsed @cuda threads=thread_count kernel(source_pointer, destination_pointer) +end +println(stats) @test Mem.download(Int64, destination_array, length(test_sequence)) == ([Int64(x in number_set) for x in test_sequence]) From 2f340884c5c965b1c8006452eda4baff5db6e79f Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 18 Mar 2019 12:38:42 +0100 Subject: [PATCH 062/146] Measure GC polling times --- src/device/runtime.jl | 13 ++----------- src/gc.jl | 25 ++++++++++++++++++++++--- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/src/device/runtime.jl b/src/device/runtime.jl index 27589633..6b7c792e 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -232,17 +232,8 @@ function T_pprjlvalue() LLVM.PointerType(eltype(T_pjlvalue), Tracked)) end -""" - gc_malloc_object(bytesize::Csize_t) - -Allocates an object that is managed by the garbage collector. -This function is designed to be called by the device. -""" -function gc_malloc_object(bytesize::Csize_t) - return unsafe_pointer_to_objref(gc_malloc(bytesize)) -end - -compile(gc_malloc_object, Any, (Csize_t,), T_prjlvalue) +# Include the GC memory allocation function into the runtime. +compile(CUDAnative.gc_malloc_object, Any, (Csize_t,), T_prjlvalue) # Include GC frame management functions into the runtime. compile(CUDAnative.new_gc_frame, Any, (Cuint,), T_pprjlvalue) diff --git a/src/gc.jl b/src/gc.jl index b3f01c10..fb2008f6 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -43,7 +43,7 @@ # * When the device runs out of GC memory, it requests an interrupt # to mark and sweep. 
-export @cuda_gc, gc_malloc, gc_collect, gc_safepoint +export @cuda_gc, gc_malloc, gc_malloc_object, gc_collect, gc_safepoint import Base: length, show import Printf: @sprintf @@ -497,6 +497,16 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} return C_NULL end +""" + gc_malloc_object(bytesize::Csize_t) + +Allocates an object that is managed by the garbage collector. +This function is designed to be called by the device. +""" +function gc_malloc_object(bytesize::Csize_t) + unsafe_pointer_to_objref(gc_malloc(bytesize)) +end + # Zero-fills a range of memory. function zero_fill!(start_ptr::Ptr{UInt8}, size::Csize_t) ccall(:memset, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), start_ptr, 0, size) @@ -850,6 +860,9 @@ mutable struct GCReport """The number of collections that were performed.""" collection_count::Int + """The total wall-clock time of all collection polls.""" + collection_poll_time::Float64 + """The total wall-clock time of all collections.""" collection_time::Float64 @@ -859,12 +872,14 @@ mutable struct GCReport """The total amount of additional memory allocated to the global pool.""" extra_global_memory::Csize_t - GCReport() = new(0.0, 0, 0.0, Csize_t(0), Csize_t(0)) + GCReport() = new(0.0, 0, 0.0, 0.0, Csize_t(0), Csize_t(0)) end function show(io::IO, report::GCReport) print(io, "[wall-clock time: $(@sprintf("%.4f", report.elapsed_time)) s; ") print(io, "collections: $(report.collection_count); ") + poll_percentage = 100 * report.collection_poll_time / report.elapsed_time + print(io, "total poll time: $(@sprintf("%.4f", report.collection_poll_time)) s ($(@sprintf("%.2f", poll_percentage))%); ") collection_percentage = 100 * report.collection_time / report.elapsed_time print(io, "total collection time: $(@sprintf("%.4f", report.collection_time)) s ($(@sprintf("%.2f", collection_percentage))%); ") print(io, "extra local memory: $(div(report.extra_local_memory, MiB)) MiB; ") @@ -874,7 +889,7 @@ end # Collects garbage. 
This function is designed to be called by the host, # not by the device. function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, report::GCReport) - collection_time = Base.@elapsed begin + poll_time = Base.@elapsed begin # First off, we have to wait for all warps to reach a safepoint. Clear # safepoint flags and wait for warps to set them again. for i in 0:(master_record.warp_count - 1) @@ -893,6 +908,9 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, end end end + end + + collection_time = Base.@elapsed begin # The Julia CPU GC is precise and the information it uses for precise # garbage collection is stored in memory that we should be able to access. @@ -1001,6 +1019,7 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, end report.collection_count += 1 report.collection_time += collection_time + report.collection_poll_time += poll_time end # Examines a keyword argument list and gets either the value From cec2dcc54340175af0893ca4d3b1646bab8b0a67 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 18 Mar 2019 13:15:12 +0100 Subject: [PATCH 063/146] Rename GC free list data structures --- src/gc.jl | 84 +++++++++++++++++++++++++++---------------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index fb2008f6..2893cd7e 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -50,7 +50,7 @@ import Printf: @sprintf # A data structure that precedes every chunk of memory that has been # allocated or put into the free list. -struct GCAllocationRecord +struct FreeListRecord # The size of the memory region this allocation record precedes. # This size does not include the allocation record itself. size::Csize_t @@ -59,7 +59,7 @@ struct GCAllocationRecord # allocation record is part of the free list, then this pointer # points to the next free list entry; otherwise, it points to the # next entry in the list of allocated blocks. 
- next::Ptr{GCAllocationRecord} + next::Ptr{FreeListRecord} end @generated function get_field_pointer_impl(base_pointer::Ptr{TBase}, ::Val{field_name}) where {TBase, field_name} @@ -75,27 +75,27 @@ macro get_field_pointer(base_pointer, field_name) end # Gets a pointer to the first byte of data managed by an allocation record. -function data_pointer(record::Ptr{GCAllocationRecord})::Ptr{UInt8} - Base.unsafe_convert(Ptr{UInt8}, record) + sizeof(GCAllocationRecord) +function data_pointer(record::Ptr{FreeListRecord})::Ptr{UInt8} + Base.unsafe_convert(Ptr{UInt8}, record) + sizeof(FreeListRecord) end # Gets a pointer to the first byte of data no longer managed by an allocation record. -function data_end_pointer(record::Ptr{GCAllocationRecord})::Ptr{UInt8} +function data_end_pointer(record::Ptr{FreeListRecord})::Ptr{UInt8} data_pointer(record) + unsafe_load(@get_field_pointer(record, :size)) end # A data structure that describes a single GC "arena", i.e., # a section of the heap that is managed by the GC. Every arena # has its own free list and allocation list. -struct GCArenaRecord +struct FreeListArena # The allocation lock for the arena. lock_state::ReaderWriterLockState # The head of the free list. - free_list_head::Ptr{GCAllocationRecord} + free_list_head::Ptr{FreeListRecord} # The head of the allocation list. - allocation_list_head::Ptr{GCAllocationRecord} + allocation_list_head::Ptr{FreeListRecord} end # A reference to a Julia object. @@ -136,10 +136,10 @@ struct GCMasterRecord local_arena_count::UInt32 # A pointer to a list of local GC arena pointers. - local_arenas::Ptr{Ptr{GCArenaRecord}} + local_arenas::Ptr{Ptr{FreeListArena}} # A pointer to the global GC arena. - global_arena::Ptr{GCArenaRecord} + global_arena::Ptr{FreeListArena} # A pointer to a list of safepoint flags. Every warp has its # own flag. @@ -188,7 +188,7 @@ end # Gets a pointer to the local arena for this thread. This # pointer may be null if there are no local arenas. 
-@inline function get_local_arena()::Ptr{GCArenaRecord} +@inline function get_local_arena()::Ptr{FreeListArena} master_record = get_gc_master_record() if master_record.local_arena_count == UInt32(0) return C_NULL @@ -309,9 +309,9 @@ end # Tries to use a free-list entry to allocate a chunk of data of size `bytesize`. # Updates the free list if the allocation succeeds. Returns a null pointer otherwise. function gc_use_free_list_entry( - entry_ptr::Ptr{Ptr{GCAllocationRecord}}, - allocation_list_ptr::Ptr{Ptr{GCAllocationRecord}}, - entry::Ptr{GCAllocationRecord}, + entry_ptr::Ptr{Ptr{FreeListRecord}}, + allocation_list_ptr::Ptr{Ptr{FreeListRecord}}, + entry::Ptr{FreeListRecord}, bytesize::Csize_t,)::Ptr{UInt8} entry_data = unsafe_load(entry) @@ -333,7 +333,7 @@ function gc_use_free_list_entry( # prefixed by the block needs to be aligned to a 16-byte boundary, # but the block itself doesn't. new_data_address = align_to_boundary(data_address + bytesize) - new_entry_address = new_data_address - sizeof(GCAllocationRecord) + new_entry_address = new_data_address - sizeof(FreeListRecord) if new_entry_address < data_address + bytesize new_entry_address += gc_align new_data_address += gc_align @@ -344,10 +344,10 @@ function gc_use_free_list_entry( if new_data_address < end_address # Create a new free list entry. new_entry_size = Csize_t(end_address) - Csize_t(new_data_address) - new_entry_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, new_entry_address) + new_entry_ptr = Base.unsafe_convert(Ptr{FreeListRecord}, new_entry_address) unsafe_store!( new_entry_ptr, - GCAllocationRecord(new_entry_size, entry_data.next)) + FreeListRecord(new_entry_size, entry_data.next)) # Update this entry's `size` field to reflect the new entry's space # requirements. @@ -369,7 +369,7 @@ function gc_use_free_list_entry( # Set the `next` pointer to the value stored at the allocation list pointer. 
unsafe_store!( - @get_field_pointer(entry, :next)::Ptr{Ptr{GCAllocationRecord}}, + @get_field_pointer(entry, :next)::Ptr{Ptr{FreeListRecord}}, unsafe_load(allocation_list_ptr)) # Update the allocation list pointer to point to the entry. @@ -387,8 +387,8 @@ end # # This function is not thread-safe. function gc_malloc_from_free_list( - free_list_ptr::Ptr{Ptr{GCAllocationRecord}}, - allocation_list_ptr::Ptr{Ptr{GCAllocationRecord}}, + free_list_ptr::Ptr{Ptr{FreeListRecord}}, + allocation_list_ptr::Ptr{Ptr{FreeListRecord}}, bytesize::Csize_t)::Ptr{UInt8} # To allocate memory, we will walk the free list until we find a suitable candidate. while free_list_ptr != C_NULL @@ -403,7 +403,7 @@ function gc_malloc_from_free_list( return result end - free_list_ptr = @get_field_pointer(free_list_item, :next)::Ptr{Ptr{GCAllocationRecord}} + free_list_ptr = @get_field_pointer(free_list_item, :next)::Ptr{Ptr{FreeListRecord}} end return C_NULL end @@ -411,13 +411,13 @@ end # Tries to allocate a chunk of memory in a particular GC arena. # Returns a null pointer if no sufficiently large chunk of # memory can be found. -function gc_malloc_local(arena::Ptr{GCArenaRecord}, bytesize::Csize_t)::Ptr{UInt8} +function gc_malloc_local(arena::Ptr{FreeListArena}, bytesize::Csize_t)::Ptr{UInt8} # Acquire the arena's lock. arena_lock = ReaderWriterLock(@get_field_pointer(arena, :lock_state)) result_ptr = writer_locked(arena_lock) do # Allocate a suitable region of memory. 
- free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{GCAllocationRecord}} - allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{GCAllocationRecord}} + free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{FreeListRecord}} + allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{FreeListRecord}} gc_malloc_from_free_list(free_list_ptr, allocation_list_ptr, bytesize) end @@ -526,8 +526,8 @@ end # case it should be prefixed by the `@nocollect` macro followed by # a write lock acquisition on the arena's lock. function gc_free_local( - arena::Ptr{GCArenaRecord}, - record_ptr::Ptr{Ptr{GCAllocationRecord}}) + arena::Ptr{FreeListArena}, + record_ptr::Ptr{Ptr{FreeListRecord}}) record = unsafe_load(record_ptr) next_record_ptr = @get_field_pointer(record, :next) @@ -627,8 +627,8 @@ function gc_init!( gc_memory_end_ptr = master_region.start + master_region.size # Allocate a local arena pointer buffer. - local_arenas_bytesize = sizeof(Ptr{GCArenaRecord}) * local_arena_count - local_arenas_ptr = Base.unsafe_convert(Ptr{Ptr{GCArenaRecord}}, gc_memory_start_ptr) + local_arenas_bytesize = sizeof(Ptr{FreeListArena}) * local_arena_count + local_arenas_ptr = Base.unsafe_convert(Ptr{Ptr{FreeListArena}}, gc_memory_start_ptr) # Allocate the safepoint flag buffer. safepoint_bytesize = sizeof(SafepointState) * warp_count @@ -672,11 +672,11 @@ end # Takes a zero-filled region of memory and turns it into a block # managed by the GC, prefixed with an allocation record. 
-function make_gc_block!(start_ptr::Ptr{T}, size::Csize_t)::Ptr{GCAllocationRecord} where T - entry = Base.unsafe_convert(Ptr{GCAllocationRecord}, start_ptr) +function make_gc_block!(start_ptr::Ptr{T}, size::Csize_t)::Ptr{FreeListRecord} where T + entry = Base.unsafe_convert(Ptr{FreeListRecord}, start_ptr) unsafe_store!( entry, - GCAllocationRecord( + FreeListRecord( Csize_t(start_ptr + size) - Csize_t(data_pointer(entry)), C_NULL)) return entry @@ -684,15 +684,15 @@ end # Takes a zero-filled region of memory and turns it into an arena # managed by the GC, prefixed with an arena record. -function make_gc_arena!(start_ptr::Ptr{T}, size::Csize_t)::Ptr{GCArenaRecord} where T +function make_gc_arena!(start_ptr::Ptr{T}, size::Csize_t)::Ptr{FreeListArena} where T # Create a single free list entry. - first_entry_ptr = make_gc_block!(start_ptr + sizeof(GCArenaRecord), size - sizeof(GCArenaRecord)) + first_entry_ptr = make_gc_block!(start_ptr + sizeof(FreeListArena), size - sizeof(FreeListArena)) # Set up the arena record. - arena = Base.unsafe_convert(Ptr{GCArenaRecord}, start_ptr) + arena = Base.unsafe_convert(Ptr{FreeListArena}, start_ptr) unsafe_store!( arena, - GCArenaRecord(0, first_entry_ptr, C_NULL)) + FreeListArena(0, first_entry_ptr, C_NULL)) end # Tells if a GC heap contains a particular pointer. @@ -728,7 +728,7 @@ end struct SortedAllocationList # An array of pointers to allocation records. The pointers # are all sorted. - records::Array{Ptr{GCAllocationRecord}, 1} + records::Array{Ptr{FreeListRecord}, 1} end length(alloc_list::SortedAllocationList) = length(alloc_list.records) @@ -738,9 +738,9 @@ length(alloc_list::SortedAllocationList) = length(alloc_list.records) # such record. 
function get_record( alloc_list::SortedAllocationList, - pointer::Ptr{T})::Ptr{GCAllocationRecord} where T + pointer::Ptr{T})::Ptr{FreeListRecord} where T - cast_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, pointer) + cast_ptr = Base.unsafe_convert(Ptr{FreeListRecord}, pointer) # Deal with the most common cases quickly. if length(alloc_list) == 0 || @@ -781,7 +781,7 @@ end # Iterates through a linked list of allocation records and apply a function # to every node in the linked list. The function is allowed to modify allocation # records. -@inline function iterate_allocation_records(fun::Function, head::Ptr{GCAllocationRecord}) +@inline function iterate_allocation_records(fun::Function, head::Ptr{FreeListRecord}) while head != C_NULL fun(head) head = unsafe_load(head).next @@ -807,9 +807,9 @@ end # 2. reorder free blocks to put small blocks at the front # of the free list, # 3. tally the total number of free bytes and return that number. -function gc_compact_free_list(arena::Ptr{GCArenaRecord})::Csize_t +function gc_compact_free_list(arena::Ptr{FreeListArena})::Csize_t # Let's start by creating a list of all free list records. - records = Ptr{GCAllocationRecord}[] + records = Ptr{FreeListRecord}[] free_list_head = unsafe_load(arena).free_list_head iterate_allocation_records(free_list_head) do record push!(records, record) @@ -930,7 +930,7 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, # Our mark phase is fairly simple: we maintain a worklist of pointers that # are live and may need to be processed, as well as a set of blocks that are # live and have already been processed. - live_blocks = Set{Ptr{GCAllocationRecord}}() + live_blocks = Set{Ptr{FreeListRecord}}() live_worklist = Ptr{ObjectRef}[] # Get a sorted allocation list, which will allow us to classify live pointers quickly. 
From f4cdf0b934d324faeb04a06ae9d242d1bea09980 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 18 Mar 2019 18:48:54 +0100 Subject: [PATCH 064/146] Implement a ScatterAlloc-based allocator --- src/compiler/optim.jl | 7 +- src/device/threading.jl | 6 + src/gc.jl | 411 +++++++++++++++++++++++++++++++++++++--- 3 files changed, 395 insertions(+), 29 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index cdd127de..5ccf964a 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -651,14 +651,19 @@ function insert_safepoints_gpugc!(fun::LLVM.Function, entry::LLVM.Function) # API doesn't expose. if has_gc_frame(fun) + safepoint_function = Runtime.get(:gc_safepoint) let builder = Builder(JuliaContext()) for block in blocks(fun) for instruction in instructions(block) if is_non_intrinsic_call(instruction) + if called_value(instruction) == safepoint_function + continue + end + # Insert a safepoint just before the call. position!(builder, instruction) debuglocation!(builder, instruction) - call!(builder, Runtime.get(:gc_safepoint), LLVM.Value[]) + call!(builder, safepoint_function, LLVM.Value[]) end end end diff --git a/src/device/threading.jl b/src/device/threading.jl index 951c20e8..a7de7645 100644 --- a/src/device/threading.jl +++ b/src/device/threading.jl @@ -34,6 +34,12 @@ function atomic_add!(lhs::Ptr{T}, rhs::T)::T where T atomic_rmw!(Val(:add), lhs, rhs) end +# Atomically subtracts a value from a variable pointed to by a pointer. +# Returns the previous value stored in that variable. +function atomic_subtract!(lhs::Ptr{T}, rhs::T)::T where T + atomic_rmw!(Val(:sub), lhs, rhs) +end + # Atomically computes the logical or of a value and a variable pointed # to by a pointer. Returns the previous value stored in that variable. 
function atomic_or!(lhs::Ptr{T}, rhs::T)::T where T diff --git a/src/gc.jl b/src/gc.jl index 2893cd7e..5830c7f3 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -98,6 +98,125 @@ struct FreeListArena allocation_list_head::Ptr{FreeListRecord} end +# A data structure that describes a ScatterAlloc superblock. Every +# superblock is prefixed by one of these. +struct ScatterAllocSuperblock + # The number of regions in the superblock. + region_count::UInt32 + + # The number of pages in a region managed by this superblock. + pages_per_region::UInt32 + + # The size of a page in the superblock, in bytes. This size + # does not include the page's header. + page_size::UInt32 + + # A pointer to the next superblock. + next::Ptr{ScatterAllocSuperblock} +end + +# A region in a ScatterAlloc superblock. +struct ScatterAllocRegion + # The number of pages in this region that are full. + full_page_count::Int64 +end + +# A page in a ScatterAlloc region. +struct ScatterAllocPage + # The size of a chunk in this page. + chunk_size::Int64 + + # The number of allocated blocks in this page. + allocated_chunk_count::Int64 + + # A bitmask that describes which chunks have been allocated + # and which chunks are still free. + occupancy::Int64 +end + +const gc_align = Csize_t(16) + +# Aligns a pointer to an alignment boundary. +function align_downward(address::Ptr{T}, alignment::Csize_t = gc_align)::Ptr{T} where T + address_int = Base.convert(Csize_t, address) + remainder = address_int % alignment + if remainder == Csize_t(0) + return address + else + return address + alignment - remainder + end +end + +# Aligns a pointer to an alignment boundary. +function align_upward(address::Ptr{T}, alignment::Csize_t = gc_align)::Ptr{T} where T + result = align_downward(address, alignment) + if result < address + result += alignment + end + result +end + +# Aligns a pointer to an alignment boundary. 
+function align_upward(offset::T, alignment::Csize_t = gc_align)::T where T <: Integer + convert(T, Csize_t(align_upward(convert(Ptr{UInt8}, Csize_t(offset)), alignment))) +end + +# Gets the page size in a superblock. This size does not include +# the page header. +function page_size(superblock::Ptr{ScatterAllocSuperblock}) + unsafe_load(@get_field_pointer(superblock, :page_size)) +end + +# Gets the number of pages per region in a superblock. +function pages_per_region(superblock::Ptr{ScatterAllocSuperblock}) + unsafe_load(@get_field_pointer(superblock, :pages_per_region)) +end + +# Gets the size of an aligned header, including padding to satisfy +# alignment requirements. +@generated function header_size(::Type{T}, ::Val{alignment} = Val(gc_align))::UInt32 where {T, alignment} + result = align_upward(UInt32(sizeof(T)), alignment) + :($result) +end + +# Gets the total number of chunks in a particular page. +function chunk_count(page::Ptr{ScatterAllocPage}, superblock::Ptr{ScatterAllocSuperblock}) + chunk_size = unsafe_load(@get_field_pointer(page, :chunk_size)) + div(page_size(superblock), chunk_size) +end + +# Gets the address of a particular chunk in a page. `index` is zero-based. +function chunk_address(page::Ptr{ScatterAllocPage}, index::Integer)::Ptr{UInt8} + chunk_size = unsafe_load(@get_field_pointer(page, :chunk_size)) + Base.unsafe_convert(Ptr{UInt8}, page + header_size(ScatterAllocPage) + chunk_size * index) +end + +# Gets the address of a particular page in a region. `index` is zero-based. +function page_address(region::Ptr{ScatterAllocRegion}, superblock::Ptr{ScatterAllocSuperblock}, index::Integer)::Ptr{ScatterAllocPage} + Base.unsafe_convert( + Ptr{ScatterAllocPage}, + region + header_size(ScatterAllocRegion) + index * (header_size(ScatterAllocPage) + page_size(superblock))) +end + +# Gets the total size in bytes of a region, including overhead. 
+function region_bytesize(pages_per_region::Integer, page_size::Integer) + region_data_size = pages_per_region * (header_size(ScatterAllocPage) + page_size) + header_size(ScatterAllocRegion) + region_data_size +end + +# Gets the address of a particular region in a superblock. `index` is zero-based. +function region_address(superblock::Ptr{ScatterAllocSuperblock}, index::Integer)::Ptr{ScatterAllocRegion} + Base.unsafe_convert( + Ptr{ScatterAllocPage}, + superblock + header_size(ScatterAllocSuperblock) + index * region_bytesize(pages_per_region(superblock), page_size(superblock))) +end + +# A GC arena that uses the ScatterAlloc algorithm for allocations. +struct ScatterAllocArena + # A pointer to the first superblock managed by this arena. + first_superblock::Ptr{ScatterAllocSuperblock} +end + # A reference to a Julia object. const ObjectRef = Ptr{Nothing} @@ -135,6 +254,10 @@ struct GCMasterRecord # The number of local arenas. local_arena_count::UInt32 + # A pointer to the tiny arena, which uses the ScatterAlloc + # algorithm to provision space for small objects. + tiny_arena::Ptr{ScatterAllocArena} + # A pointer to a list of local GC arena pointers. local_arenas::Ptr{Ptr{FreeListArena}} @@ -293,19 +416,6 @@ macro perma_safepoint(expr) end end -const gc_align = Csize_t(16) - -# Aligns a pointer to an alignment boundary. -function align_to_boundary(address::Ptr{T}, alignment::Csize_t = gc_align)::Ptr{T} where T - address_int = Base.convert(Csize_t, address) - remainder = address_int % alignment - if remainder == Csize_t(0) - return address - else - return address + alignment - remainder - end -end - # Tries to use a free-list entry to allocate a chunk of data of size `bytesize`. # Updates the free list if the allocation succeeds. Returns a null pointer otherwise. function gc_use_free_list_entry( @@ -332,7 +442,7 @@ function gc_use_free_list_entry( # Compute the start address of the new free list entry. 
The data # prefixed by the block needs to be aligned to a 16-byte boundary, # but the block itself doesn't. - new_data_address = align_to_boundary(data_address + bytesize) + new_data_address = align_downward(data_address + bytesize) new_entry_address = new_data_address - sizeof(FreeListRecord) if new_entry_address < data_address + bytesize new_entry_address += gc_align @@ -378,6 +488,181 @@ function gc_use_free_list_entry( return data_address end +# Tries to allocate a chunk of memory from a ScatterAlloc page. +# Returns a null pointer if no chunk of memory can be found. +function gc_scatter_alloc_use_page( + page::Ptr{ScatterAllocPage}, + region::Ptr{ScatterAllocRegion}, + superblock::Ptr{ScatterAllocSuperblock})::Ptr{UInt8} + + alloc_chunk_ptr = @get_field_pointer(page, :allocated_chunk_count) + fill_level = atomic_add!(alloc_chunk_ptr, 1) + spots = chunk_count(page, superblock) + if fill_level < spots + if fill_level + 1 == spots + # The page is full now. Increment the region's counter. + full_page_ptr = @get_field_pointer(region, :full_page_count) + atomic_add!(full_page_ptr, 1) + end + + lane_id = (get_thread_id() - 1) % warpsize() + spot = lane_id % spots + occupancy_ptr = @get_field_pointer(page, :occupancy) + while true + # Check if our preferred spot is available. + mask = 1 << spot + old = atomic_or!(occupancy_ptr, mask) + + actual_fill = 0 + for i in 1:64 + if old & (1 << (i - 1)) != 0 + actual_fill += 1 + end + end + + # If the spot is available, then use it. + if old & mask == 0 + break + end + + # Otherwise, find a new spot. + spot = (spot + 1) % spots + end + return chunk_address(page, spot) + end + + # The page is full. 
+ atomic_subtract!(alloc_chunk_ptr, 1) + return C_NULL +end + +function scatter_alloc_hash( + superblock::Ptr{ScatterAllocSuperblock}, + bytesize::Int64)::Int64 + + sb = unsafe_load(superblock) + page_count = sb.region_count * sb.pages_per_region + warp_id = get_warp_id() - 1 + + k_S = 38183 + k_mp = 17497 + + (bytesize * k_S + warp_id * k_mp) % page_count +end + +# Tries to allocate a chunk of memory from a ScatterAlloc superblock. +# Returns a null pointer if no sufficiently large chunk of +# memory can be found. +function gc_scatter_alloc_use_superblock( + superblock::Ptr{ScatterAllocSuperblock}, + bytesize::Csize_t)::Ptr{UInt8} + + if bytesize > page_size(superblock) + # This isn't going to work. The superblock's page size is just too small. + return C_NULL + end + + # Choose the allocation size in such a way that we never end up with more than + # 64 chunks. This is necessary because the chunk occupancy bitfield is only + # 64 bits wide. + alloc_size = Int64(div(page_size(superblock), 64)) + if alloc_size < Int64(bytesize) + alloc_size = Int64(bytesize) + end + + # Align the allocation size. + alloc_size = align_upward(alloc_size) + + # We are looking for a chunk that is `bytesize` bytes in size, + # but we're willing to accept a chunk that is twice as large. + waste_factor = 2 + max_size = alloc_size * waste_factor + + pages_per_region = unsafe_load(@get_field_pointer(superblock, :pages_per_region)) + region_count = unsafe_load(@get_field_pointer(superblock, :region_count)) + + # Guess a global page index. + global_page_id = scatter_alloc_hash(superblock, alloc_size) + + # Decompose that global page index into a region index and a + # local page index. + region_id = global_page_id % pages_per_region + page_id = div(global_page_id, pages_per_region) + + # Remember the initial values of the region and page ids. + init_region_id = region_id + init_page_id = page_id + + # Find the region and page corresponding to the current page ID. 
+ region = region_address(superblock, region_id) + while true + page = page_address(region, superblock, page_id) + + # Skip regions until we find a region that is sufficiently empty. + while true + region_fill_level = unsafe_load(region).full_page_count / pages_per_region + if region_fill_level > 0.9 + region_id += 1 + if region_id >= region_count + region_id = 0 + end + region = region_address(superblock, region_id) + page_id = 0 + else + break + end + end + + # Try to set the chunk size to our preferred chunk size. + chunk_size_ptr = @get_field_pointer(page, :chunk_size) + chunk_size = atomic_compare_exchange!(chunk_size_ptr, 0, alloc_size) + if chunk_size == 0 || (chunk_size >= alloc_size && chunk_size <= max_size) + # If we managed to set the page's chunk size, then the page is definitely + # suitable for our purposes. Otherwise, the page might still be suitable + # if its chunk size is sufficiently large to accommodate the requested + # size yet small enough to not waste too much space. + result = gc_scatter_alloc_use_page(page, region, superblock) + if result != C_NULL + return result + end + end + + # Try the next page. + page_id += 1 + + if page_id >= pages_per_region + region_id += 1 + if region_id >= region_count + region_id = 0 + end + region = region_address(superblock, region_id) + page_id = 0 + end + + # We tried every page in the entire superblock and found nothing. + if region_id == init_region_id && page_id == init_page_id + return C_NULL + end + end +end + +# Tries to allocate a chunk of memory in a particular GC arena. +# Returns a null pointer if no sufficiently large chunk of +# memory can be found. +function gc_malloc_local(arena::Ptr{ScatterAllocArena}, bytesize::Csize_t)::Ptr{UInt8} + # Walk the list of superblocks until we find a valid candidate. 
+ superblock = unsafe_load(arena).first_superblock + while superblock != C_NULL + result = gc_scatter_alloc_use_superblock(superblock, bytesize) + if result != C_NULL + return result + end + superblock = unsafe_load(@get_field_pointer(superblock, :next)) + end + + return C_NULL +end + # Tries to allocate a chunk of memory from a free list. # Returns a null pointer if no sufficiently large chunk of # memory can be found. @@ -441,24 +726,35 @@ This function is designed to be called by the device. function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} master_record = get_gc_master_record() - # Try to malloc the object without host intervention. - ptr = @perma_safepoint @nocollect begin - # Try to allocate in the local arena first. If that doesn't + function allocate() + # Try to allocate in the tiny arena first. The ScatterAlloc + # algorithm used by that arena is lock-free and works well + # for small objects. + if master_record.tiny_arena != C_NULL + local_ptr = gc_malloc_local(master_record.tiny_arena, bytesize) + if local_ptr != C_NULL + return local_ptr + end + end + + # Try to allocate in the local arena second. If that doesn't # work, we'll move on to the global arena, which is bigger but # is shared by all threads. (We want to minimize contention # on the global arena's lock.) local_arena = get_local_arena() - local_ptr = Base.unsafe_convert(Ptr{UInt8}, C_NULL) if local_arena != C_NULL local_ptr = gc_malloc_local(local_arena, bytesize) + if local_ptr != C_NULL + return local_ptr + end end - if local_ptr == C_NULL - gc_malloc_local(master_record.global_arena, bytesize) - else - local_ptr - end + # Try to use the global arena if all else fails. + gc_malloc_local(master_record.global_arena, bytesize) end + + # Try to malloc the object without host intervention. + ptr = @perma_safepoint @nocollect allocate() if ptr != C_NULL return ptr end @@ -565,7 +861,7 @@ end # One megabyte. const MiB = 1 << 20 -# The initial size of the GC heap, currently 16 MiB. 
+# The initial size of the GC heap, currently 20 MiB.
 const initial_gc_heap_size = 16 * MiB
 
 # The default capacity of a root buffer, i.e., the max number of
 # roots that can be stored per thread. Currently set to
@@ -589,6 +885,14 @@ const global_arena_starvation_threshold = 4 * MiB
 # The arena starvation threshold is currently set to 1 MiB.
 const local_arena_starvation_threshold = 1 * MiB
 
+# The point at which a tiny arena is deemed to be starving, i.e.,
+# it no longer contains enough memory to perform basic allocations.
+# If a tiny arena's free byte count stays below the arena starvation
+# threshold after a collection phase, the collector will allocate
+# additional memory to the arena such that it is no longer starving.
+# This threshold is currently disabled (set to 0; the intended value is 2 MiB).
+const tiny_arena_starvation_threshold = 0 # 2 * MiB
+
 # A description of a region of memory that has been allocated to the GC heap.
 struct GCHeapRegion
     # A buffer that contains the GC region's bytes.
@@ -645,24 +949,33 @@ function gc_init!(
         unsafe_store!(fingerbuf_ptr, rootbuf_ptr + (i - 1) * sizeof(ObjectRef) * root_buffer_capacity, i)
     end
 
-    # Compute a pointer to the start of the first arena.
+    # Compute a pointer to the start of the tiny arena.
     arena_start_ptr = rootbuf_ptr + rootbuf_bytesize
 
+    # Set up the tiny object arena.
+    if tiny_arena_starvation_threshold > 0
+        arena_for_ants = make_gc_arena!(ScatterAllocArena, arena_start_ptr, Csize_t(tiny_arena_starvation_threshold))
+        arena_start_ptr += tiny_arena_starvation_threshold
+    else
+        arena_for_ants = Base.unsafe_convert(Ptr{ScatterAllocArena}, C_NULL)
+    end
+
     # Set up local arenas.
     for i in 1:local_arena_count
-        local_arena = make_gc_arena!(arena_start_ptr, Csize_t(local_arena_starvation_threshold))
+        local_arena = make_gc_arena!(FreeListArena, arena_start_ptr, Csize_t(local_arena_starvation_threshold))
         unsafe_store!(local_arenas_ptr, local_arena, i)
         arena_start_ptr += local_arena_starvation_threshold
     end
 
     # Set up the global arena.
- global_arena = make_gc_arena!(arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr)) + global_arena = make_gc_arena!(FreeListArena, arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr)) return GCMasterRecord( warp_count, UInt32(thread_count), root_buffer_capacity, UInt32(local_arena_count), + arena_for_ants, local_arenas_ptr, global_arena, safepoint_ptr, @@ -684,7 +997,7 @@ end # Takes a zero-filled region of memory and turns it into an arena # managed by the GC, prefixed with an arena record. -function make_gc_arena!(start_ptr::Ptr{T}, size::Csize_t)::Ptr{FreeListArena} where T +function make_gc_arena!(::Type{FreeListArena}, start_ptr::Ptr{T}, size::Csize_t)::Ptr{FreeListArena} where T # Create a single free list entry. first_entry_ptr = make_gc_block!(start_ptr + sizeof(FreeListArena), size - sizeof(FreeListArena)) @@ -693,6 +1006,48 @@ function make_gc_arena!(start_ptr::Ptr{T}, size::Csize_t)::Ptr{FreeListArena} wh unsafe_store!( arena, FreeListArena(0, first_entry_ptr, C_NULL)) + + arena +end + +# Takes a zero-filled region of memory and turns it into a ScatterAlloc +# superblock. +function make_gc_superblock!( + start_ptr::Ptr{T}, + size::Csize_t; + page_size::UInt32 = UInt32(2048), + pages_per_region::UInt32 = UInt32(16))::Ptr{ScatterAllocSuperblock} where T + + region_size = region_bytesize(pages_per_region, page_size) + + # Figure out how many regions we can allocate. + region_count = div(size - header_size(ScatterAllocSuperblock), region_size) + + # At this point, we'd normally allocate regions and pages. + # However, region and page headers are zero-initialized by default. + # So we don't actually need to do anything to set up the regions + # and pages. + + # Allocate the superblock header. 
+ superblock = Base.unsafe_convert(Ptr{ScatterAllocSuperblock}, align_upward(start_ptr)) + unsafe_store!( + superblock, + ScatterAllocSuperblock(region_count, pages_per_region, page_size, C_NULL)) + + superblock +end + +# Takes a zero-filled region of memory and turns it into an arena +# managed by the GC, prefixed with an arena record. +function make_gc_arena!(::Type{ScatterAllocArena}, start_ptr::Ptr{T}, size::Csize_t)::Ptr{ScatterAllocArena} where T + superblock_ptr = align_upward(start_ptr + sizeof(ScatterAllocArena)) + superblock = make_gc_superblock!(superblock_ptr, Csize_t(start_ptr) + size - Csize_t(superblock_ptr)) + arena = Base.unsafe_convert(Ptr{ScatterAllocArena}, start_ptr) + unsafe_store!( + arena, + ScatterAllocArena(superblock)) + + arena end # Tells if a GC heap contains a particular pointer. From 6e14a2d5cb94a66931ec9eaa230e3b7c185b598c Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 19 Mar 2019 16:18:07 +0100 Subject: [PATCH 065/146] Make the allocator smarter --- src/gc.jl | 400 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 348 insertions(+), 52 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 5830c7f3..07f5004d 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -98,6 +98,9 @@ struct FreeListArena allocation_list_head::Ptr{FreeListRecord} end +# Gets a free list arena's lock. +get_lock(arena::Ptr{FreeListArena}) = ReaderWriterLock(@get_field_pointer(arena, :lock_state)) + # A data structure that describes a ScatterAlloc superblock. Every # superblock is prefixed by one of these. struct ScatterAllocSuperblock @@ -217,6 +220,66 @@ struct ScatterAllocArena first_superblock::Ptr{ScatterAllocSuperblock} end +# A "shelf" in a bodega arena. See `BodegaArena` for more info on +# how shelves work. +struct BodegaShelf + # The size of the chunks on this shelf. + chunk_size::Csize_t + + # The maximal number of chunks on this shelf. + capacity::Int64 + + # An index into the shelf that points to the first free + # chunk. 
This is a zero-based index. + chunk_finger::Int64 + + # A pointer to an array of pointers to chunks of memory. + # Every chunk in this array has a chunk size that is + # at least as large as `chunk_size`. + chunks::Ptr{Ptr{UInt8}} +end + +# A GC arena that uses a custom ("bodega") allocation algorithm for allocations. +# Essentially, this type of arena has a list of "shelves" that contain small, +# preallocated chunks of memory that threads can claim in a fast and lock-free +# manner. When the shelves run out of memory, threads may re-stock them from free +# list, amortizing the cost of lock acquisition across many different allocations. +struct BodegaArena + # The number of shelves in the arena. + shelf_count::Int + + # A pointer to an array of shelves. + shelves::Ptr{BodegaShelf} + + # A Boolean that tells if it is sensible to try and restock shelves in this + # arena. Restocking shelves becomes futile once the free list's capacity is + # exhausted. + can_restock::Bool + + # The free list this bodega uses for large allocations and for re-stocking + # the shelves. + free_list::FreeListArena +end + +# Gets a pointer to a bodega arena's free list. +function get_free_list(arena::Ptr{BodegaArena})::Ptr{FreeListArena} + @get_field_pointer(arena, :free_list) +end + +# Gets the first shelf containing chunks that are at least `bytesize` bytes +# in size. Returns null if there is no such shelf. +function get_shelf(arena::Ptr{BodegaArena}, bytesize::Csize_t)::Ptr{BodegaShelf} + bodega = unsafe_load(arena) + for i in 1:bodega.shelf_count + shelf = bodega.shelves + (i - 1) * sizeof(BodegaShelf) + chunk_size = unsafe_load(@get_field_pointer(shelf, :chunk_size)) + if chunk_size >= bytesize + return shelf + end + end + return C_NULL +end + # A reference to a Julia object. const ObjectRef = Ptr{Nothing} @@ -259,10 +322,10 @@ struct GCMasterRecord tiny_arena::Ptr{ScatterAllocArena} # A pointer to a list of local GC arena pointers. 
- local_arenas::Ptr{Ptr{FreeListArena}} + local_arenas::Ptr{Ptr{BodegaArena}} # A pointer to the global GC arena. - global_arena::Ptr{FreeListArena} + global_arena::Ptr{BodegaArena} # A pointer to a list of safepoint flags. Every warp has its # own flag. @@ -311,14 +374,14 @@ end # Gets a pointer to the local arena for this thread. This # pointer may be null if there are no local arenas. -@inline function get_local_arena()::Ptr{FreeListArena} +@inline function get_local_arena()::Ptr{BodegaArena} master_record = get_gc_master_record() if master_record.local_arena_count == UInt32(0) return C_NULL else return unsafe_load( master_record.local_arenas, - get_thread_id() % master_record.local_arena_count) + get_warp_id() % master_record.local_arena_count) end end @@ -422,7 +485,7 @@ function gc_use_free_list_entry( entry_ptr::Ptr{Ptr{FreeListRecord}}, allocation_list_ptr::Ptr{Ptr{FreeListRecord}}, entry::Ptr{FreeListRecord}, - bytesize::Csize_t,)::Ptr{UInt8} + bytesize::Csize_t)::Ptr{UInt8} entry_data = unsafe_load(entry) if entry_data.size < bytesize @@ -693,27 +756,137 @@ function gc_malloc_from_free_list( return C_NULL end +# Tries to allocate a chunk of memory from a free list. +# Returns a null pointer if no sufficiently large chunk of +# memory can be found. +# +# This function is not thread-safe. +function gc_malloc_from_free_list(arena::Ptr{FreeListArena}, bytesize::Csize_t)::Ptr{UInt8} + free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{FreeListRecord}} + allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{FreeListRecord}} + gc_malloc_from_free_list(free_list_ptr, allocation_list_ptr, bytesize) +end + +# Writes a pointer to a temporary GC frame. This will keep the pointer +# from getting collected until the caller has a chance to add it to its +# own GC frame. 
+function gc_protect(pointer::Ptr{UInt8}) + if pointer != Base.unsafe_convert(Ptr{UInt8}, C_NULL) + gc_frame = new_gc_frame(UInt32(1)) + unsafe_store!(gc_frame, Base.unsafe_convert(ObjectRef, pointer)) + end +end + # Tries to allocate a chunk of memory in a particular GC arena. # Returns a null pointer if no sufficiently large chunk of # memory can be found. function gc_malloc_local(arena::Ptr{FreeListArena}, bytesize::Csize_t)::Ptr{UInt8} # Acquire the arena's lock. - arena_lock = ReaderWriterLock(@get_field_pointer(arena, :lock_state)) - result_ptr = writer_locked(arena_lock) do + result_ptr = writer_locked(get_lock(arena)) do # Allocate a suitable region of memory. - free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{FreeListRecord}} - allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{FreeListRecord}} - gc_malloc_from_free_list(free_list_ptr, allocation_list_ptr, bytesize) + gc_malloc_from_free_list(arena, bytesize) end # If the resulting pointer is non-null, then we'll write it to a temporary GC frame. # Our reasoning for doing this is that doing so ensures that the allocated memory # won't get collected by the GC before the caller has a chance to add it to its # own GC frame. - if result_ptr != Base.unsafe_convert(Ptr{UInt8}, C_NULL) - gc_frame = new_gc_frame(UInt32(1)) - unsafe_store!(gc_frame, Base.unsafe_convert(ObjectRef, result_ptr)) + gc_protect(result_ptr) + return result_ptr +end + +# Atomically takes a chunk from a shelf. Returns null if the shelf +# is empty. +function gc_malloc_from_shelf(shelf::Ptr{BodegaShelf})::Ptr{UInt8} + capacity = unsafe_load(@get_field_pointer(shelf, :capacity)) + + # Atomically increment the chunk finger. + finger_ptr = @get_field_pointer(shelf, :chunk_finger) + finger = atomic_add!(finger_ptr, 1) + + if finger < capacity + # If the chunk finger was less than the capacity, then we actually + # managed to take a chunk from the shelf. We only need to retrieve + # its address. 
+ chunk_array = unsafe_load(@get_field_pointer(shelf, :chunks)) + return unsafe_load(chunk_array, finger + 1) + else + # Otherwise, we've got nothing. Return null. + return C_NULL + end +end + +# Re-stocks a shelf. +function restock_shelf(arena::Ptr{BodegaArena}, shelf::Ptr{BodegaShelf}) + shelf_size = unsafe_load(@get_field_pointer(shelf, :chunk_size)) + capacity = unsafe_load(@get_field_pointer(shelf, :capacity)) + finger_ptr = @get_field_pointer(shelf, :chunk_finger) + finger = unsafe_load(finger_ptr) + + # The finger may exceed the capacity. This is harmless. Just + # reset the finger to the capacity. + if finger > capacity + finger = capacity + end + + # Actually re-stock the shelf. + free_list = get_free_list(arena) + chunk_array = unsafe_load(@get_field_pointer(shelf, :chunks)) + while finger > 0 + chunk = gc_malloc_from_free_list(free_list, shelf_size) + if chunk == C_NULL + # We exhausted the free list. Better break now. Also set + # the arena's `can_restock` flag to false so there will be + # no future attempts to re-stock shelves. + unsafe_store!(@get_field_pointer(arena, :can_restock), false) + break + end + + # Update the chunk array. + unsafe_store!(chunk_array, chunk, finger) + finger -= 1 end + + # Update the finger. + unsafe_store!(finger_ptr, finger) +end + +# Tries to allocate a chunk of memory in a particular GC arena. +# Returns a null pointer if no sufficiently large chunk of +# memory can be found. +function gc_malloc_local(arena::Ptr{BodegaArena}, bytesize::Csize_t)::Ptr{UInt8} + # The bodega arena might be empty (or approximately empty). If so, then we'll + # just return null early. There's no need to scrape the bottom of the barrel. + if !unsafe_load(@get_field_pointer(arena, :can_restock)) + return C_NULL + end + + # Find the right shelf for this allocation. + shelf = get_shelf(arena, bytesize) + free_list = get_free_list(arena) + if shelf == C_NULL + # The shelves' chunk sizes are all too small to accommodate this + # allocation. 
Use the free list directly. + return gc_malloc_local(free_list, bytesize) + end + + # Acquire a reader lock on the arena and try to take a chunk + # from the shelf. + lock = get_lock(free_list) + result_ptr = reader_locked(lock) do + gc_malloc_from_shelf(shelf) + end + + if result_ptr == C_NULL + # Looks like we need to re-stock the shelf. While we're at it, + # we might as well grab a chunk of memory for ourselves. + result_ptr = writer_locked(lock) do + restock_shelf(arena, shelf) + gc_malloc_from_free_list(free_list, bytesize) + end + end + + gc_protect(result_ptr) return result_ptr end @@ -931,8 +1104,8 @@ function gc_init!( gc_memory_end_ptr = master_region.start + master_region.size # Allocate a local arena pointer buffer. - local_arenas_bytesize = sizeof(Ptr{FreeListArena}) * local_arena_count - local_arenas_ptr = Base.unsafe_convert(Ptr{Ptr{FreeListArena}}, gc_memory_start_ptr) + local_arenas_bytesize = sizeof(Ptr{BodegaArena}) * local_arena_count + local_arenas_ptr = Base.unsafe_convert(Ptr{Ptr{BodegaArena}}, gc_memory_start_ptr) # Allocate the safepoint flag buffer. safepoint_bytesize = sizeof(SafepointState) * warp_count @@ -962,13 +1135,13 @@ function gc_init!( # Set up local arenas. for i in 1:local_arena_count - local_arena = make_gc_arena!(FreeListArena, arena_start_ptr, Csize_t(local_arena_starvation_threshold)) + local_arena = make_gc_arena!(BodegaArena, arena_start_ptr, Csize_t(local_arena_starvation_threshold)) unsafe_store!(local_arenas_ptr, local_arena, i) arena_start_ptr += local_arena_starvation_threshold end # Set up the global arena. 
- global_arena = make_gc_arena!(FreeListArena, arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr)) + global_arena = make_gc_arena!(BodegaArena, arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr)) return GCMasterRecord( warp_count, @@ -1010,6 +1183,49 @@ function make_gc_arena!(::Type{FreeListArena}, start_ptr::Ptr{T}, size::Csize_t) arena end +# Takes a zero-filled region of memory and turns it into an arena +# managed by the GC, prefixed with an arena record. +function make_gc_arena!(::Type{BodegaArena}, start_ptr::Ptr{T}, size::Csize_t)::Ptr{BodegaArena} where T + current_ptr = start_ptr + sizeof(BodegaArena) + + # Set up some shelf chunk arrays + shelf_records = [] + for chunk_size in [32, 64] + capacity = 2048 + shelf_chunk_array = Base.unsafe_convert(Ptr{Ptr{UInt8}}, current_ptr) + current_ptr += capacity * sizeof(Ptr{UInt8}) + push!(shelf_records, BodegaShelf(Csize_t(chunk_size), capacity, capacity, shelf_chunk_array)) + end + + # Set up the shelves. + shelf_array = Base.unsafe_convert(Ptr{BodegaShelf}, current_ptr) + for record in shelf_records + shelf = Base.unsafe_convert(Ptr{BodegaShelf}, current_ptr) + current_ptr += sizeof(BodegaShelf) + unsafe_store!(shelf, record) + end + + # Set up a free list entry. + first_entry_ptr = make_gc_block!(current_ptr, Csize_t(start_ptr + size) - Csize_t(current_ptr)) + + # Set up the arena record. + arena = Base.unsafe_convert(Ptr{BodegaArena}, start_ptr) + unsafe_store!( + arena, + BodegaArena( + length(shelf_records), + shelf_array, + true, + FreeListArena(0, first_entry_ptr, C_NULL))) + + # Stock the shelves. + for record in shelf_records + restock_shelf(arena, get_shelf(arena, record.chunk_size)) + end + + arena +end + # Takes a zero-filled region of memory and turns it into a ScatterAlloc # superblock. 
function make_gc_superblock!( @@ -1134,22 +1350,54 @@ function get_record( end # Iterates through a linked list of allocation records and apply a function -# to every node in the linked list. The function is allowed to modify allocation -# records. -@inline function iterate_allocation_records(fun::Function, head::Ptr{FreeListRecord}) +# to every node in the linked list. +function iterate_allocation_records(fun::Function, head::Ptr{FreeListRecord}) while head != C_NULL fun(head) head = unsafe_load(head).next end end +# Iterates through all active allocation records in a GC arena. +function iterate_allocated(fun::Function, arena::Ptr{FreeListArena}) + allocation_list_head = unsafe_load(arena).allocation_list_head + iterate_allocation_records(fun, allocation_list_head) +end + +# Iterates through all active allocation records in a GC arena. +function iterate_allocated(fun::Function, arena::Ptr{BodegaArena}) + # Compose a set that contains all data addresses of chunks that + # are on the shelves. + arena_data = unsafe_load(arena) + chunks_on_shelves = Set{Ptr{UInt8}}() + for i in 1:arena_data.shelf_count + shelf = unsafe_load(arena_data.shelves, i) + for j in shelf.chunk_finger:(shelf.capacity - 1) + push!(chunks_on_shelves, unsafe_load(shelf.chunks, j)) + end + end + + # Now iterate through the allocation list, ignoring records that have + # been placed on the shelves. + iterate_allocated(get_free_list(arena)) do record + if !(data_pointer(record) in chunks_on_shelves) + fun(record) + end + end +end + +# Iterates through all free allocation records in a GC arena. +function iterate_free(fun::Function, arena::Ptr{FreeListArena}) + free_list_head = unsafe_load(arena).free_list_head + iterate_allocation_records(fun, free_list_head) +end + # Takes a GC master record and constructs a sorted allocation list # based on it. 
function sort_allocation_list(master_record::GCMasterRecord)::SortedAllocationList records = [] iterate_arenas(master_record) do arena - allocation_list_head = unsafe_load(arena).allocation_list_head - iterate_allocation_records(allocation_list_head) do record + iterate_allocated(arena) do record push!(records, record) end end @@ -1157,16 +1405,46 @@ function sort_allocation_list(master_record::GCMasterRecord)::SortedAllocationLi return SortedAllocationList(records) end +# Frees all dead blocks in an arena. +function gc_free_garbage(arena::Ptr{FreeListArena}, live_blocks::Set{Ptr{FreeListRecord}}) + record_ptr = @get_field_pointer(arena, :allocation_list_head) + while true + record = unsafe_load(record_ptr) + if record == C_NULL + # We've reached the end of the list. + break + end + + if record in live_blocks + # We found a live block. Proceed to the next block. + record_ptr = @get_field_pointer(record, :next) + else + # We found a dead block. Release it. Don't proceed to the + # next block because the current block will change in the + # next iteration of this loop. + gc_free_local(arena, record_ptr) + end + end +end + +# Frees all dead blocks in an arena. +function gc_free_garbage(arena::Ptr{BodegaArena}, live_blocks::Set{Ptr{FreeListRecord}}) + # Free garbage in the free list sub-arena. + gc_free_garbage(get_free_list(arena), live_blocks) + + # Mark the arena as ready for restocking. + unsafe_store!(@get_field_pointer(arena, :can_restock), true) +end + # Compact a GC arena's free list. This function will # 1. merge adjancent free blocks, and # 2. reorder free blocks to put small blocks at the front # of the free list, # 3. tally the total number of free bytes and return that number. -function gc_compact_free_list(arena::Ptr{FreeListArena})::Csize_t +function gc_compact(arena::Ptr{FreeListArena})::Csize_t # Let's start by creating a list of all free list records. 
     records = Ptr{FreeListRecord}[]
-    free_list_head = unsafe_load(arena).free_list_head
-    iterate_allocation_records(free_list_head) do record
+    iterate_free(arena) do record
         push!(records, record)
     end
@@ -1207,6 +1485,46 @@ function gc_compact_free_list(arena::Ptr{FreeListArena})::Csize_t
     return sum(record -> unsafe_load(record).size, records)
 end
 
+# Compact a GC arena's free list. This function will
+# 1. merge adjacent free blocks, and
+# 2. reorder free blocks to put small blocks at the front
+# of the free list,
+# 3. tally the total number of free bytes and return that number.
+function gc_compact(arena::Ptr{BodegaArena})::Csize_t
+    # Compact the free list.
+    tally = gc_compact(get_free_list(arena))
+
+    # Add the size of the chunks on shelves to the tally.
+    shelf_count = unsafe_load(@get_field_pointer(arena, :shelf_count))
+    for i in 1:shelf_count
+        shelf_array = unsafe_load(@get_field_pointer(arena, :shelves))
+        shelf_data = unsafe_load(shelf_array, i)
+
+        finger = shelf_data.chunk_finger
+        if finger > shelf_data.capacity
+            finger = shelf_data.capacity
+        end
+        tally += shelf_data.chunk_size * (shelf_data.capacity - finger)
+    end
+
+    tally
+end
+
+# Expands a GC arena by assigning it an additional heap region.
+function gc_expand(arena::Ptr{FreeListArena}, region::GCHeapRegion)
+    extra_record = make_gc_block!(region.start, region.size)
+    last_free_list_ptr = @get_field_pointer(arena, :free_list_head)
+    iterate_free(arena) do record
+        last_free_list_ptr = @get_field_pointer(record, :next)
+    end
+    unsafe_store!(last_free_list_ptr, extra_record)
+end
+
+# Expands a GC arena by assigning it an additional heap region.
+function gc_expand(arena::Ptr{BodegaArena}, region::GCHeapRegion) + gc_expand(get_free_list(arena), region) +end + """A report of the GC's actions.""" mutable struct GCReport """The total wall-clock time of a kernel execution.""" @@ -1323,27 +1641,11 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, # free dead blocks. Next, we will compact and reorder free lists to combat # fragmentation. iterate_arenas(master_record) do arena - record_ptr = @get_field_pointer(arena, :allocation_list_head) - while true - record = unsafe_load(record_ptr) - if record == C_NULL - # We've reached the end of the list. - break - end + # Free garbage blocks. + gc_free_garbage(arena, live_blocks) - if record in live_blocks - # We found a live block. Proceed to the next block. - record_ptr = @get_field_pointer(record, :next) - else - # We found a dead block. Release it. Don't proceed to the - # next block because the current block will change in the - # next iteration of this loop. - gc_free_local(arena, record_ptr) - end - end - - # Compact the free list. - free_memory = gc_compact_free_list(arena) + # Compact the arena. 
+ free_memory = gc_compact(arena) # If the amount of free memory in the arena is below the starvation # limit then we'll expand the GC heap and add the additional memory @@ -1356,19 +1658,13 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, if free_memory < threshold region = expand!(heap, threshold) + gc_expand(arena, region) if arena == master_record.global_arena report.extra_global_memory += Csize_t(threshold) else report.extra_local_memory += Csize_t(threshold) end - - extra_record = make_gc_block!(region.start, region.size) - last_free_list_ptr = @get_field_pointer(arena, :free_list_head) - iterate_allocation_records(unsafe_load(last_free_list_ptr)) do record - last_free_list_ptr = @get_field_pointer(record, :next) - end - unsafe_store!(last_free_list_ptr, extra_record) end end end From 699fcea7304509f35c3e3f1e44035c592624fd64 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 20 Mar 2019 10:49:22 +0100 Subject: [PATCH 066/146] Tweak GC memory hierarchy --- src/gc.jl | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 07f5004d..742559d0 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -325,7 +325,7 @@ struct GCMasterRecord local_arenas::Ptr{Ptr{BodegaArena}} # A pointer to the global GC arena. - global_arena::Ptr{BodegaArena} + global_arena::Ptr{FreeListArena} # A pointer to a list of safepoint flags. Every warp has its # own flag. @@ -435,7 +435,7 @@ Signals that this warp has reached a GC safepoint. """ function gc_safepoint() wait_for_interrupt() do - gc_set_safepoint_flag(in_safepoint) + gc_set_safepoint_flag(in_safepoint; overwrite = false) end return end @@ -459,11 +459,15 @@ function gc_perma_safepoint() end # Sets this warp's safepoint flag to a particular state. 
-function gc_set_safepoint_flag(value::SafepointState) +function gc_set_safepoint_flag(value::SafepointState; overwrite::Bool = true) master_record = get_gc_master_record() warp_id = get_warp_id() safepoint_flag_ptr = master_record.safepoint_flags + sizeof(SafepointState) * (warp_id - 1) - volatile_store!(safepoint_flag_ptr, value) + if overwrite + volatile_store!(safepoint_flag_ptr, value) + else + atomic_compare_exchange!(safepoint_flag_ptr, not_in_safepoint, value) + end return end @@ -922,8 +926,16 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} end end - # Try to use the global arena if all else fails. - gc_malloc_local(master_record.global_arena, bytesize) + # Try to use the global arena if all else fails, but only if the chunk + # of memory we want to allocate is sufficiently large. Allocating lots of + # small chunks in the global arena will result in undue contention and slow + # down kernels dramatically. + if bytesize >= 1024 + local_ptr = gc_malloc_local(master_record.global_arena, bytesize) + else + local_ptr = C_NULL + end + return local_ptr end # Try to malloc the object without host intervention. @@ -939,11 +951,7 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} # first thread that acquires the interrupt lock, but it is quite # likely to succeed if we are *not* in the first thread that # acquired the garbage collector lock. - # - # Note: don't try to allocate in the local arena first because - # we have already acquired a device-wide lock. Allocating in - # the local arena first might waste precious time. - ptr2 = gc_malloc_local(master_record.global_arena, bytesize) + ptr2 = allocate() if ptr2 == C_NULL # We are either the first thread to acquire the interrupt lock @@ -1035,7 +1043,7 @@ end const MiB = 1 << 20 # The initial size of the GC heap, currently 20 MiB. 
-const initial_gc_heap_size = 16 * MiB +const initial_gc_heap_size = 20 * MiB # The default capacity of a root buffer, i.e., the max number of # roots that can be stored per thread. Currently set to @@ -1055,8 +1063,8 @@ const global_arena_starvation_threshold = 4 * MiB # If a local arena's free byte count stays below the arena starvation # threshold after a collection phase, the collector will allocate # additional memory to the arena such that it is no longer starving. -# The arena starvation threshold is currently set to 1 MiB. -const local_arena_starvation_threshold = 1 * MiB +# The arena starvation threshold is currently set to 2 MiB. +const local_arena_starvation_threshold = 2 * MiB # The point at which a tiny arena is deemed to be starving, i.e., # it no longer contains enough memory to perform basic allocations. @@ -1141,7 +1149,7 @@ function gc_init!( end # Set up the global arena. - global_arena = make_gc_arena!(BodegaArena, arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr)) + global_arena = make_gc_arena!(FreeListArena, arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr)) return GCMasterRecord( warp_count, From bdd6c0b3a6a4210aff9a7fa31ad15b1501adc28f Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 20 Mar 2019 10:49:31 +0100 Subject: [PATCH 067/146] Create a linked list example --- examples/linked-list.jl | 78 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 examples/linked-list.jl diff --git a/examples/linked-list.jl b/examples/linked-list.jl new file mode 100644 index 00000000..b0eb958a --- /dev/null +++ b/examples/linked-list.jl @@ -0,0 +1,78 @@ +using CUDAnative, CUDAdrv +using Test +import Base: foldl, reduce, sum + +# This test constructs a linked list in a GPU kernel. 
+ +use_gc = true + +abstract type List{T} +end + +mutable struct Nil{T} <: List{T} +end + +mutable struct Cons{T} <: List{T} + value::T + next::List{T} +end + +Cons{T}(value::T) where T = Cons{T}(value, Nil{T}()) + +function List{T}(pointer, count::Integer) where T + result = Nil{T}() + for i in count:-1:1 + result = Cons{T}(unsafe_load(pointer, i), result) + end + result +end + +function foldl(op, list::List{T}; init) where T + node = list + accumulator = init + while isa(node, Cons{T}) + accumulator = op(accumulator, node.value) + node = node.next + end + accumulator +end + +function reduce(op, list::List{T}; init) where T + foldl(op, list; init=init) +end + +function sum(list::List{T}) where T + reduce(+, list; init=zero(T)) +end + +const element_count = 200 +const thread_count = 256 + +function kernel(elements::CUDAnative.DevicePtr{Int64}, results::CUDAnative.DevicePtr{Int64}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + l = List{Int64}(elements, element_count) + unsafe_store!(results, sum(l), i) + return +end + +# Allocate two arrays. +source_array = Mem.alloc(Int64, element_count) +destination_array = Mem.alloc(Int64, thread_count) +source_pointer = Base.unsafe_convert(CuPtr{Int64}, source_array) +destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + +# Fill the source and destination arrays. +Mem.upload!(source_array, Array(1:element_count)) +Mem.upload!(destination_array, zeros(Int64, thread_count)) + +# Run the kernel. 
+if use_gc + @cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) + stats = @cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) +else + @cuda threads=thread_count kernel(source_pointer, destination_pointer) + stats = CUDAdrv.@elapsed @cuda threads=thread_count kernel(source_pointer, destination_pointer) +end +println(stats) + +@test Mem.download(Int64, destination_array, thread_count) == repeat([sum(1:element_count)], thread_count) From 1b93aba6b406e3eaf87df3d34df4de490f0e6480 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 21 Mar 2019 12:55:09 +0100 Subject: [PATCH 068/146] Fix imperfect rebase --- src/CUDAnative.jl | 4 +--- src/compiler/optim.jl | 16 ++++++---------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl index 85ea5ef9..8006ac8e 100644 --- a/src/CUDAnative.jl +++ b/src/CUDAnative.jl @@ -31,8 +31,6 @@ include(joinpath("device", "array.jl")) include(joinpath("device", "cuda.jl")) include(joinpath("device", "llvm.jl")) include(joinpath("device", "runtime.jl")) -include(joinpath("device", "libdevice.jl")) -include(joinpath("device", "cuda_intrinsics.jl")) include(joinpath("device", "threading.jl")) # The interrupts and GC files need to be loaded _before_ the @@ -40,7 +38,7 @@ include(joinpath("device", "threading.jl")) # depend on the GC and the GC depends on interrupts. 
include("interrupts.jl") include("gc.jl") -include(joinpath("device", "runtime_intrinsics.jl")) +include(joinpath("device", "runtime.jl")) include("compiler.jl") include("execution.jl") diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 5ccf964a..cac90195 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -70,7 +70,12 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; int ModulePassManager() do pm initialize!(pm) - add!(pm, ModulePass("FinalLowerGCGPU", lower_final_gc_intrinsics!)) + if ctx.gc + add!(pm, FunctionPass("InsertSafepointsGPUGC", fun -> insert_safepoints_gpugc!(fun, entry))) + add!(pm, ModulePass("FinalLowerGPUGC", lower_final_gc_intrinsics_gpugc!)) + else + add!(pm, ModulePass("FinalLowerNoGC", lower_final_gc_intrinsics_nogc!)) + end aggressive_dce!(pm) # remove dead uses of ptls add!(pm, ModulePass("LowerPTLS", lower_ptls!)) @@ -86,15 +91,6 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; int # PTX-specific optimizations ModulePassManager() do pm initialize!(pm) - # lower intrinsics - if ctx.gc - add!(pm, FunctionPass("InsertSafepointsGPUGC", fun -> insert_safepoints_gpugc!(fun, entry))) - add!(pm, ModulePass("FinalLowerGPUGC", lower_final_gc_intrinsics_gpugc!)) - else - add!(pm, ModulePass("FinalLowerNoGC", lower_final_gc_intrinsics_nogc!)) - end - aggressive_dce!(pm) # remove dead uses of ptls - add!(pm, ModulePass("LowerPTLS", lower_ptls!)) # NVPTX's target machine info enables runtime unrolling, # but Julia's pass sequence only invokes the simple unroller. 
From bb03af28f9dfc9e33016a44672feff0092b0b97a Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 21 Mar 2019 16:06:57 +0100 Subject: [PATCH 069/146] Add a StaticArrays-based GC example --- examples/matrix-static-arrays.jl | 40 ++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 examples/matrix-static-arrays.jl diff --git a/examples/matrix-static-arrays.jl b/examples/matrix-static-arrays.jl new file mode 100644 index 00000000..12e54e98 --- /dev/null +++ b/examples/matrix-static-arrays.jl @@ -0,0 +1,40 @@ +using StaticArrays, CUDAnative, CUDAdrv + +use_gc = false + +const matrix_dim = 40 +const iterations = 20 +const thread_count = 256 + +function fill() + m = zeros(MMatrix{matrix_dim, matrix_dim, Int64}) + + for i in 1:matrix_dim + for j in 1:matrix_dim + m[i, j] = i * j + end + end + + return m +end + +function kernel(result::CUDAnative.DevicePtr{Int64}) + thread_id = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + # Write the accumulator to the result array. 
+ unsafe_store!(result, fill()[20, 30], thread_id) + + return +end + +destination_array = Mem.alloc(Int64, thread_count) +destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + +if use_gc + @cuda_gc threads=thread_count kernel(destination_pointer) + stats = @cuda_gc threads=thread_count kernel(destination_pointer) +else + @cuda threads=thread_count kernel(destination_pointer) + stats = CUDAdrv.@elapsed @cuda threads=thread_count kernel(destination_pointer) +end +println(stats) From 46614f3854f0b4811e7a49688f8092c823872004 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 13:10:58 +0200 Subject: [PATCH 070/146] Teach allocator to transfer memory block ownership --- src/device/threading.jl | 16 +++-- src/gc.jl | 141 ++++++++++++++++++++++++++++++---------- 2 files changed, 118 insertions(+), 39 deletions(-) diff --git a/src/device/threading.jl b/src/device/threading.jl index a7de7645..846db990 100644 --- a/src/device/threading.jl +++ b/src/device/threading.jl @@ -127,12 +127,16 @@ function warp_serialized(func::Function) end """ - reader_locked(func::Function, lock::ReaderWriterLock) + reader_locked(func::Function, lock::ReaderWriterLock; acquire_lock=true) Acquires a reader-writer lock in reader mode, runs `func` while the lock is acquired and releases the lock again. """ -function reader_locked(func::Function, lock::ReaderWriterLock) +function reader_locked(func::Function, lock::ReaderWriterLock; acquire_lock=true) + if !acquire_lock + return func() + end + while true # Increment the reader count. 
If the lock is in write-acquired mode, # then the lock will stay in that mode (unless the reader count is @@ -157,12 +161,16 @@ function reader_locked(func::Function, lock::ReaderWriterLock) end """ - writer_locked(func::Function, lock::ReaderWriterLock) + writer_locked(func::Function, lock::ReaderWriterLock; acquire_lock=true) Acquires a reader-writer lock in writer mode, runs `func` while the lock is acquired and releases the lock again. """ -function writer_locked(func::Function, lock::ReaderWriterLock) +function writer_locked(func::Function, lock::ReaderWriterLock; acquire_lock=true) + if !acquire_lock + return func() + end + warp_serialized() do # Try to move the lock from 'idle' to 'write-acquired'. while atomic_compare_exchange!(lock.state_ptr, 0, -max_rw_lock_readers) != 0 diff --git a/src/gc.jl b/src/gc.jl index 742559d0..4df493a6 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -266,6 +266,9 @@ function get_free_list(arena::Ptr{BodegaArena})::Ptr{FreeListArena} @get_field_pointer(arena, :free_list) end +# Gets a bodega arena's lock. +get_lock(arena::Ptr{BodegaArena}) = get_lock(get_free_list(arena)) + # Gets the first shelf containing chunks that are at least `bytesize` bytes # in size. Returns null if there is no such shelf. function get_shelf(arena::Ptr{BodegaArena}, bytesize::Csize_t)::Ptr{BodegaShelf} @@ -377,11 +380,11 @@ end @inline function get_local_arena()::Ptr{BodegaArena} master_record = get_gc_master_record() if master_record.local_arena_count == UInt32(0) - return C_NULL + return Base.unsafe_convert(Ptr{BodegaArena}, C_NULL) else return unsafe_load( master_record.local_arenas, - get_warp_id() % master_record.local_arena_count) + ((get_warp_id() - 1) % master_record.local_arena_count) + 1) end end @@ -483,13 +486,13 @@ macro perma_safepoint(expr) end end -# Tries to use a free-list entry to allocate a chunk of data of size `bytesize`. -# Updates the free list if the allocation succeeds. Returns a null pointer otherwise. 
-function gc_use_free_list_entry( +# Tries to use a free-list entry to allocate a chunk of data of size `bytesize`, +# producing an appropriately-sized free list entry that prefixes the data. This +# entry is removed from the free list but not yet added to the allocation list. +function gc_take_list_entry( entry_ptr::Ptr{Ptr{FreeListRecord}}, - allocation_list_ptr::Ptr{Ptr{FreeListRecord}}, entry::Ptr{FreeListRecord}, - bytesize::Csize_t)::Ptr{UInt8} + bytesize::Csize_t)::Ptr{FreeListRecord} entry_data = unsafe_load(entry) if entry_data.size < bytesize @@ -540,19 +543,21 @@ function gc_use_free_list_entry( unsafe_store!(entry_ptr, entry_data.next) end - # At this point, all we need to do is update the allocation record to - # reflect the fact that it now represents an allocated block instead of - # a free block. + return entry +end + +# Prepends a free list record to a free list. +function gc_add_to_free_list( + entry::Ptr{FreeListRecord}, + list_ptr::Ptr{Ptr{FreeListRecord}}) # Set the `next` pointer to the value stored at the allocation list pointer. unsafe_store!( @get_field_pointer(entry, :next)::Ptr{Ptr{FreeListRecord}}, - unsafe_load(allocation_list_ptr)) + unsafe_load(list_ptr)) # Update the allocation list pointer to point to the entry. - unsafe_store!(allocation_list_ptr, entry) - - return data_address + unsafe_store!(list_ptr, entry) end # Tries to allocate a chunk of memory from a ScatterAlloc page. @@ -733,24 +738,22 @@ end # Tries to allocate a chunk of memory from a free list. # Returns a null pointer if no sufficiently large chunk of # memory can be found. -# -# `free_list_ptr` is a pointer to the head of the free list. -# `allocation_list_ptr` is a pointer to the head of the allocation list. -# -# This function is not thread-safe. -function gc_malloc_from_free_list( +# If the result is non-null, then a free list record is +# returned that has been taken from the free list but not +# yet added to another list. 
+function gc_take_any_list_entry( free_list_ptr::Ptr{Ptr{FreeListRecord}}, - allocation_list_ptr::Ptr{Ptr{FreeListRecord}}, - bytesize::Csize_t)::Ptr{UInt8} + bytesize::Csize_t)::Ptr{FreeListRecord} + # To allocate memory, we will walk the free list until we find a suitable candidate. - while free_list_ptr != C_NULL + while true free_list_item = unsafe_load(free_list_ptr) if free_list_item == C_NULL break end - result = gc_use_free_list_entry(free_list_ptr, allocation_list_ptr, free_list_item, bytesize) + result = gc_take_list_entry(free_list_ptr, free_list_item, bytesize) if result != C_NULL return result end @@ -768,7 +771,20 @@ end function gc_malloc_from_free_list(arena::Ptr{FreeListArena}, bytesize::Csize_t)::Ptr{UInt8} free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{FreeListRecord}} allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{FreeListRecord}} - gc_malloc_from_free_list(free_list_ptr, allocation_list_ptr, bytesize) + + # Try to take the entry out of the free list. + result_entry = gc_take_any_list_entry(free_list_ptr, bytesize) + if result_entry == C_NULL + # The entry is just too small. Return a `null` pointer. + return C_NULL + end + + # At this point, all we need to do is update the allocation record to + # reflect the fact that it now represents an allocated block instead of + # a free block. + gc_add_to_free_list(result_entry, allocation_list_ptr) + + return data_pointer(result_entry) end # Writes a pointer to a temporary GC frame. This will keep the pointer @@ -784,9 +800,9 @@ end # Tries to allocate a chunk of memory in a particular GC arena. # Returns a null pointer if no sufficiently large chunk of # memory can be found. -function gc_malloc_local(arena::Ptr{FreeListArena}, bytesize::Csize_t)::Ptr{UInt8} +function gc_malloc_local(arena::Ptr{FreeListArena}, bytesize::Csize_t; acquire_lock=true)::Ptr{UInt8} # Acquire the arena's lock. 
- result_ptr = writer_locked(get_lock(arena)) do + result_ptr = writer_locked(get_lock(arena); acquire_lock=acquire_lock) do # Allocate a suitable region of memory. gc_malloc_from_free_list(arena, bytesize) end @@ -858,7 +874,7 @@ end # Tries to allocate a chunk of memory in a particular GC arena. # Returns a null pointer if no sufficiently large chunk of # memory can be found. -function gc_malloc_local(arena::Ptr{BodegaArena}, bytesize::Csize_t)::Ptr{UInt8} +function gc_malloc_local(arena::Ptr{BodegaArena}, bytesize::Csize_t; acquire_lock=true)::Ptr{UInt8} # The bodega arena might be empty (or approximately empty). If so, then we'll # just return null early. There's no need to scrape the bottom of the barrel. if !unsafe_load(@get_field_pointer(arena, :can_restock)) @@ -877,14 +893,14 @@ function gc_malloc_local(arena::Ptr{BodegaArena}, bytesize::Csize_t)::Ptr{UInt8} # Acquire a reader lock on the arena and try to take a chunk # from the shelf. lock = get_lock(free_list) - result_ptr = reader_locked(lock) do + result_ptr = reader_locked(lock; acquire_lock=acquire_lock) do gc_malloc_from_shelf(shelf) end if result_ptr == C_NULL # Looks like we need to re-stock the shelf. While we're at it, # we might as well grab a chunk of memory for ourselves. - result_ptr = writer_locked(lock) do + result_ptr = writer_locked(lock; acquire_lock=acquire_lock) do restock_shelf(arena, shelf) gc_malloc_from_free_list(free_list, bytesize) end @@ -894,6 +910,48 @@ function gc_malloc_local(arena::Ptr{BodegaArena}, bytesize::Csize_t)::Ptr{UInt8} return result_ptr end +# Transfers a block of free memory from one arena to another and then +# allocates a differently-sized block of memory from the destination +# arena. 
+function gc_transfer_and_malloc( + from_arena::Ptr{FreeListArena}, + to_arena::Ptr{FreeListArena}, + transfer_bytesize::Csize_t, + alloc_bytesize::Csize_t)::Ptr{UInt8} + + from_free_list = @get_field_pointer(from_arena, :free_list_head)::Ptr{Ptr{FreeListRecord}} + entry = writer_locked(get_lock(from_arena)) do + # Try to take the entry out of the free list. + gc_take_any_list_entry(from_free_list, transfer_bytesize) + end + + if entry == C_NULL + return C_NULL + else + to_free_list = @get_field_pointer(to_arena, :free_list_head)::Ptr{Ptr{FreeListRecord}} + return writer_locked(get_lock(to_arena)) do + gc_add_to_free_list(entry, to_free_list) + gc_malloc_local(to_arena, alloc_bytesize; acquire_lock=false) + end + end +end + +# Transfers a block of free memory from one arena to another and then +# allocates a differently-sized block of memory from the destination +# arena. +function gc_transfer_and_malloc( + from_arena::Ptr{FreeListArena}, + to_arena::Ptr{BodegaArena}, + transfer_bytesize::Csize_t, + alloc_bytesize::Csize_t)::Ptr{UInt8} + + gc_transfer_and_malloc( + from_arena, + get_free_list(to_arena), + transfer_bytesize, + alloc_bytesize) +end + """ gc_malloc(bytesize::Csize_t)::Ptr{UInt8} @@ -924,16 +982,29 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} if local_ptr != C_NULL return local_ptr end + else + # If there is no local arena then we will just have to allocate + # from the global arena directly. + return gc_malloc_local(master_record.global_arena, bytesize) end # Try to use the global arena if all else fails, but only if the chunk # of memory we want to allocate is sufficiently large. Allocating lots of # small chunks in the global arena will result in undue contention and slow # down kernels dramatically. - if bytesize >= 1024 + # + # If we need to allocate a small chunk of memory but the local arena is + # empty, then we will transfer a *much* larger chunk of memory from the global + # arena to the local arena. 
After that we'll allocate in the local arena. + min_global_alloc_size = Csize_t(256 * (1 << 10)) + if bytesize >= min_global_alloc_size local_ptr = gc_malloc_local(master_record.global_arena, bytesize) else - local_ptr = C_NULL + local_ptr = gc_transfer_and_malloc( + master_record.global_arena, + local_arena, + min_global_alloc_size, + bytesize) end return local_ptr end @@ -1042,8 +1113,8 @@ end # One megabyte. const MiB = 1 << 20 -# The initial size of the GC heap, currently 20 MiB. -const initial_gc_heap_size = 20 * MiB +# The initial size of the GC heap, currently 16 MiB. +const initial_gc_heap_size = 16 * MiB # The default capacity of a root buffer, i.e., the max number of # roots that can be stored per thread. Currently set to @@ -1064,7 +1135,7 @@ const global_arena_starvation_threshold = 4 * MiB # threshold after a collection phase, the collector will allocate # additional memory to the arena such that it is no longer starving. # The arena starvation threshold is currently set to 2 MiB. -const local_arena_starvation_threshold = 2 * MiB +const local_arena_starvation_threshold = 1 * MiB # The point at which a tiny arena is deemed to be starving, i.e., # it no longer contains enough memory to perform basic allocations. 
From d60114b053f205e8026b1fc759cb10f63268b00d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 13:11:54 +0200 Subject: [PATCH 071/146] Update examples --- examples/linked-list.jl | 4 ++-- examples/matrix.jl | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/linked-list.jl b/examples/linked-list.jl index b0eb958a..2c7e949c 100644 --- a/examples/linked-list.jl +++ b/examples/linked-list.jl @@ -45,8 +45,8 @@ function sum(list::List{T}) where T reduce(+, list; init=zero(T)) end -const element_count = 200 -const thread_count = 256 +const element_count = 1000 +const thread_count = 32 function kernel(elements::CUDAnative.DevicePtr{Int64}, results::CUDAnative.DevicePtr{Int64}) i = (blockIdx().x-1) * blockDim().x + threadIdx().x diff --git a/examples/matrix.jl b/examples/matrix.jl index a787171f..277aacd1 100644 --- a/examples/matrix.jl +++ b/examples/matrix.jl @@ -5,7 +5,6 @@ using StaticArrays, CUDAnative, CUDAdrv import Base: getindex, setindex!, pointer, unsafe_convert, zeros -using InteractiveUtils const use_gc = true From 82208c87ff94f0f1910dad24a46c67a6ad283f4e Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 13:13:01 +0200 Subject: [PATCH 072/146] Introduce benchmarking utilities --- examples/matrix-static-arrays.jl | 24 +++++-------- examples/utils.jl | 60 ++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 15 deletions(-) create mode 100644 examples/utils.jl diff --git a/examples/matrix-static-arrays.jl b/examples/matrix-static-arrays.jl index 12e54e98..5e174bf5 100644 --- a/examples/matrix-static-arrays.jl +++ b/examples/matrix-static-arrays.jl @@ -1,9 +1,6 @@ -using StaticArrays, CUDAnative, CUDAdrv - -use_gc = false +using StaticArrays, CUDAnative, CUDAdrv, BenchmarkTools const matrix_dim = 40 -const iterations = 20 const thread_count = 256 function fill() @@ -20,21 +17,18 @@ end function kernel(result::CUDAnative.DevicePtr{Int64}) thread_id = (blockIdx().x - 1) * blockDim().x + 
threadIdx().x - - # Write the accumulator to the result array. unsafe_store!(result, fill()[20, 30], thread_id) - return end -destination_array = Mem.alloc(Int64, thread_count) -destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) +include("utils.jl") -if use_gc - @cuda_gc threads=thread_count kernel(destination_pointer) - stats = @cuda_gc threads=thread_count kernel(destination_pointer) -else - @cuda threads=thread_count kernel(destination_pointer) - stats = CUDAdrv.@elapsed @cuda threads=thread_count kernel(destination_pointer) +function benchmark() + destination_array = Mem.alloc(Int64, thread_count) + destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + @cuda_sync threads=thread_count kernel(destination_pointer) end + +stats = @cuda_benchmark benchmark() +println(length(stats)) println(stats) diff --git a/examples/utils.jl b/examples/utils.jl new file mode 100644 index 00000000..253b8622 --- /dev/null +++ b/examples/utils.jl @@ -0,0 +1,60 @@ +use_gc = true + +""" + device_reset!(dev::CuDevice=device()) + +Reset the CUDA state associated with a device. This call with release the underlying +context, at which point any objects allocated in that context will be invalidated. +""" +function device_reset!(dev::CuDevice=CUDAdrv.device()) + if haskey(CUDAnative.device_contexts, dev) + # take the context out of the pool, and finalize it to trigger release + old_ctx = CUDAnative.device_contexts[dev] + delete!(CUDAnative.device_contexts, dev) + finalize(old_ctx) + + # unless the user switches devices, new API calls should trigger initialization + CUDAdrv.apicall_hook[] = CUDAnative.maybe_initialize + CUDAnative.initialized[] = false + # HACK: the context changes, but CuCurrentContext() _can_ actually return a handle + # with the same pointer value... this bypasses the compile cache, and crashes + empty!(CUDAnative.compilecache) + end +end + +""" + @sync ex +Run expression `ex` and synchronize the GPU afterwards. 
This is a CPU-friendly +synchronization, i.e. it performs a blocking synchronization without increasing CPU load. As +such, this operation is preferred over implicit synchronization (e.g. when performing a +memory copy) for high-performance applications. +It is also useful for timing code that executes asynchronously. +""" +macro sync(ex) + # Copied from https://github.com/JuliaGPU/CuArrays.jl/blob/8e45a27f2b12796f47683340845f98f017865676/src/utils.jl#L68-L86 + quote + local e = CuEvent(CUDAdrv.EVENT_BLOCKING_SYNC | CUDAdrv.EVENT_DISABLE_TIMING) + local ret = $(esc(ex)) + CUDAdrv.record(e) + CUDAdrv.synchronize(e) + ret + end +end + +macro cuda_sync(args...) + if use_gc + esc(quote + CUDAnative.@cuda_gc $(args...) + end) + else + esc(quote + @sync CUDAnative.@cuda $(args...) + end) + end +end + +macro cuda_benchmark(ex) + esc(quote + @benchmark $(ex) teardown=(device_reset!()) evals=1 + end) +end From b18a0bb49a2c101ade25d1bc32efa84d9bf463bf Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 13:51:10 +0200 Subject: [PATCH 073/146] Fix some typos --- src/CUDAnative.jl | 1 - src/compiler/optim.jl | 2 +- src/execution.jl | 4 ++-- src/gc.jl | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl index 8006ac8e..0a040a87 100644 --- a/src/CUDAnative.jl +++ b/src/CUDAnative.jl @@ -30,7 +30,6 @@ include(joinpath("device", "pointer.jl")) include(joinpath("device", "array.jl")) include(joinpath("device", "cuda.jl")) include(joinpath("device", "llvm.jl")) -include(joinpath("device", "runtime.jl")) include(joinpath("device", "threading.jl")) # The interrupts and GC files need to be loaded _before_ the diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index cac90195..cf5e8da3 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -70,7 +70,7 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; int ModulePassManager() do pm initialize!(pm) - if ctx.gc + if job.gc 
add!(pm, FunctionPass("InsertSafepointsGPUGC", fun -> insert_safepoints_gpugc!(fun, entry))) add!(pm, ModulePass("FinalLowerGPUGC", lower_final_gc_intrinsics_gpugc!)) else diff --git a/src/execution.jl b/src/execution.jl index ecfbefe5..f8b902e6 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -450,7 +450,7 @@ functionality is included in [`@cuda`](@ref). The 'init' keyword argument is a function that takes a kernel as argument and sets up an environment for the kernel. """ -function prepare_kernel(kernel::Kernel{F,TT}; init::Function=nop_init_kernel) where {F,TT} +function prepare_kernel(kernel::AbstractKernel{F,TT}; init::Function=nop_init_kernel) where {F,TT} # Just call the 'init' function for now. init(kernel) end @@ -514,7 +514,7 @@ function nearest_warpsize(dev::CuDevice, threads::Integer) return threads + (ws - threads % ws) % ws end -function nop_init_kernel(kernel::Kernel{F,TT}) where {F,TT} +function nop_init_kernel(kernel::AbstractKernel{F,TT}) where {F,TT} # Do nothing. return end \ No newline at end of file diff --git a/src/gc.jl b/src/gc.jl index 4df493a6..9da1ae5b 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -1795,7 +1795,7 @@ macro cuda_gc(ex...) args = call.args[2:end] code = quote end - compiler_kwargs, call_kwargs, env_kwargs = CUDAnative.split_kwargs(kwargs) + env_kwargs, compiler_kwargs, call_kwargs = CUDAnative.split_kwargs(kwargs) vars, var_exprs = CUDAnative.assign_args!(code, args) # Find the stream on which the kernel is to be scheduled. From 11042c4f248c1edf945ebb83dcf4642da18ea395 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 14:27:31 +0200 Subject: [PATCH 074/146] Update '@cuda_interruptible' --- src/interrupts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/interrupts.jl b/src/interrupts.jl index 83fe13d5..de7cc7cb 100644 --- a/src/interrupts.jl +++ b/src/interrupts.jl @@ -237,7 +237,7 @@ macro cuda_interruptible(handler, ex...) 
args = call.args[2:end] code = quote end - compiler_kwargs, call_kwargs, env_kwargs = CUDAnative.split_kwargs(kwargs) + env_kwargs, compiler_kwargs, call_kwargs = CUDAnative.split_kwargs(kwargs) vars, var_exprs = CUDAnative.assign_args!(code, args) # Find the stream on which the kernel is to be scheduled. From b183aafce8fce2ca5a59901a6344472eb3082259 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 14:29:17 +0200 Subject: [PATCH 075/146] Don't try to include deleted intrinsics test file --- test/runtests.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 6cac0eb5..11738b64 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -69,7 +69,6 @@ if CUDAnative.configured include("device/pointer.jl") include("device/array.jl") include("device/cuda.jl") - include("device/intrinsics.jl") include("device/threading.jl") include("device/gc.jl") From 0402ad870482fade6d1251c882bc7c631723e95f Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 15:25:12 +0200 Subject: [PATCH 076/146] Switch back to free lists for local arenas --- src/gc.jl | 51 ++++++++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 9da1ae5b..e56bbbc8 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -303,6 +303,9 @@ const GCFrame = Ptr{ObjectRef} in_perma_safepoint = 2 end +const LocalArena = FreeListArena +const GlobalArena = FreeListArena + # A data structure that contains global GC info. This data # structure is designed to be immutable: it should not be changed # once the host has set it up. @@ -325,10 +328,10 @@ struct GCMasterRecord tiny_arena::Ptr{ScatterAllocArena} # A pointer to a list of local GC arena pointers. - local_arenas::Ptr{Ptr{BodegaArena}} + local_arenas::Ptr{Ptr{LocalArena}} # A pointer to the global GC arena. - global_arena::Ptr{FreeListArena} + global_arena::Ptr{GlobalArena} # A pointer to a list of safepoint flags. 
Every warp has its # own flag. @@ -377,10 +380,10 @@ end # Gets a pointer to the local arena for this thread. This # pointer may be null if there are no local arenas. -@inline function get_local_arena()::Ptr{BodegaArena} +@inline function get_local_arena()::Ptr{LocalArena} master_record = get_gc_master_record() if master_record.local_arena_count == UInt32(0) - return Base.unsafe_convert(Ptr{BodegaArena}, C_NULL) + return Base.unsafe_convert(Ptr{LocalArena}, C_NULL) else return unsafe_load( master_record.local_arenas, @@ -1183,8 +1186,8 @@ function gc_init!( gc_memory_end_ptr = master_region.start + master_region.size # Allocate a local arena pointer buffer. - local_arenas_bytesize = sizeof(Ptr{BodegaArena}) * local_arena_count - local_arenas_ptr = Base.unsafe_convert(Ptr{Ptr{BodegaArena}}, gc_memory_start_ptr) + local_arenas_bytesize = sizeof(Ptr{LocalArena}) * local_arena_count + local_arenas_ptr = Base.unsafe_convert(Ptr{Ptr{LocalArena}}, gc_memory_start_ptr) # Allocate the safepoint flag buffer. safepoint_bytesize = sizeof(SafepointState) * warp_count @@ -1214,13 +1217,13 @@ function gc_init!( # Set up local arenas. for i in 1:local_arena_count - local_arena = make_gc_arena!(BodegaArena, arena_start_ptr, Csize_t(local_arena_starvation_threshold)) + local_arena = make_gc_arena!(LocalArena, arena_start_ptr, Csize_t(local_arena_starvation_threshold)) unsafe_store!(local_arenas_ptr, local_arena, i) arena_start_ptr += local_arena_starvation_threshold end # Set up the global arena. 
- global_arena = make_gc_arena!(FreeListArena, arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr)) + global_arena = make_gc_arena!(GlobalArena, arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr)) return GCMasterRecord( warp_count, @@ -1390,42 +1393,36 @@ function get_record( alloc_list::SortedAllocationList, pointer::Ptr{T})::Ptr{FreeListRecord} where T - cast_ptr = Base.unsafe_convert(Ptr{FreeListRecord}, pointer) - - # Deal with the most common cases quickly. + # Deal with these cases quickly so we can assume that the + # free list is nonempty. if length(alloc_list) == 0 || pointer < data_pointer(alloc_list.records[1]) || - pointer > data_pointer(alloc_list.records[end]) + Base.unsafe_load(alloc_list.records[end]).size + pointer >= data_end_pointer(alloc_list.records[end]) return C_NULL end - # To do this lookup quickly, we will do a binary search for the - # biggest allocation record pointer that is smaller than `pointer`. + # To quickly narrow down the search space, we will do a binary search + # for the biggest allocation record pointer that is smaller than `pointer`. range_start, range_end = 1, length(alloc_list) - while range_end - range_start > 1 + while range_end - range_start > 4 range_mid = div(range_start + range_end, 2) mid_val = alloc_list.records[range_mid] - if mid_val > cast_ptr + if mid_val > pointer range_end = range_mid else range_start = range_mid end end - record = alloc_list.records[range_end] - if record >= cast_ptr - record = alloc_list.records[range_start] - end - # Make sure that the pointer actually points to a region of memory # that is managed by the candidate record we found. 
- record_data_pointer = data_pointer(record) - if cast_ptr >= record_data_pointer && cast_ptr < record_data_pointer + unsafe_load(record).size - return record - else - return C_NULL + for record in alloc_list.records[range_start:range_end] + if pointer >= data_pointer(record) && pointer < data_end_pointer(record) + return record + end end + return C_NULL end # Iterates through a linked list of allocation records and apply a function @@ -1561,7 +1558,7 @@ function gc_compact(arena::Ptr{FreeListArena})::Csize_t unsafe_store!(prev_record_ptr, C_NULL) # Compute the total number of free bytes. - return sum(record -> unsafe_load(record).size, records) + return sum(map(record -> unsafe_load(record).size, records)) end # Compact a GC arena's free list. This function will From b9029da15395aad3a517f3b661740cecbe176e80 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 16:20:16 +0200 Subject: [PATCH 077/146] Fix GC collection bug --- src/gc.jl | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index e56bbbc8..0993fd5b 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -1085,8 +1085,7 @@ function gc_free_local( free_list_head_ptr = @get_field_pointer(arena, :free_list_head) # Remove the record from the allocation list. - next_record = unsafe_load(next_record_ptr) - unsafe_store!(record_ptr, next_record) + unsafe_store!(record_ptr, unsafe_load(next_record_ptr)) # Add the record to the free list and update its `next` pointer # (but not in that order). @@ -1137,7 +1136,7 @@ const global_arena_starvation_threshold = 4 * MiB # If a local arena's free byte count stays below the arena starvation # threshold after a collection phase, the collector will allocate # additional memory to the arena such that it is no longer starving. -# The arena starvation threshold is currently set to 2 MiB. +# The arena starvation threshold is currently set to 1 MiB. 
const local_arena_starvation_threshold = 1 * MiB # The point at which a tiny arena is deemed to be starving, i.e., @@ -1704,10 +1703,9 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, # Mark the block as live. push!(live_blocks, record) # Add all pointer-sized, aligned values to the live pointer worklist. - block_pointer = data_pointer(record) - block_size = unsafe_load(record).size - for i in 0:sizeof(ObjectRef):(block_size - 1) - push!(live_worklist, Base.unsafe_convert(ObjectRef, block_pointer + i)) + for ptr in data_pointer(record):sizeof(ObjectRef):data_end_pointer(record) - 1 + value = unsafe_load(Base.unsafe_convert(Ptr{ObjectRef}, ptr)) + push!(live_worklist, value) end end end From 7e20c667bfb3dd87b3707f111f0520aef98f131b Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 16:27:49 +0200 Subject: [PATCH 078/146] Reduce initial GC heap size --- src/gc.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 0993fd5b..82f5c0e2 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -1115,8 +1115,8 @@ end # One megabyte. const MiB = 1 << 20 -# The initial size of the GC heap, currently 16 MiB. -const initial_gc_heap_size = 16 * MiB +# The initial size of the GC heap, currently 10 MiB. +const initial_gc_heap_size = 10 * MiB # The default capacity of a root buffer, i.e., the max number of # roots that can be stored per thread. Currently set to From 51ca870b1f2a1ac18e00417d41552e4a296719eb Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 16:31:54 +0200 Subject: [PATCH 079/146] Update benchmarking utilities --- examples/utils.jl | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/examples/utils.jl b/examples/utils.jl index 253b8622..f8720a82 100644 --- a/examples/utils.jl +++ b/examples/utils.jl @@ -7,19 +7,17 @@ Reset the CUDA state associated with a device. 
This call with release the underl context, at which point any objects allocated in that context will be invalidated. """ function device_reset!(dev::CuDevice=CUDAdrv.device()) - if haskey(CUDAnative.device_contexts, dev) - # take the context out of the pool, and finalize it to trigger release - old_ctx = CUDAnative.device_contexts[dev] - delete!(CUDAnative.device_contexts, dev) - finalize(old_ctx) - - # unless the user switches devices, new API calls should trigger initialization - CUDAdrv.apicall_hook[] = CUDAnative.maybe_initialize - CUDAnative.initialized[] = false - # HACK: the context changes, but CuCurrentContext() _can_ actually return a handle - # with the same pointer value... this bypasses the compile cache, and crashes - empty!(CUDAnative.compilecache) - end + delete!(CUDAnative.device_contexts, dev) + + pctx = CuPrimaryContext(dev) + unsafe_reset!(pctx) + + # unless the user switches devices, new API calls should trigger initialization + CUDAdrv.apicall_hook[] = CUDAnative.maybe_initialize + CUDAnative.initialized[] = false + + # HACK: primary contexts always have the same handle, defeating the compilation cache + empty!(CUDAnative.compilecache) end """ @@ -55,6 +53,6 @@ end macro cuda_benchmark(ex) esc(quote - @benchmark $(ex) teardown=(device_reset!()) evals=1 + @benchmark $(ex) setup=($(ex)) teardown=(device_reset!()) evals=1 end) end From 4390d90556bad854820471c73522f9ad8048c4b1 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 16:46:20 +0200 Subject: [PATCH 080/146] Put GC benchmarks in a separate directory --- gc-benchmarks/binary-tree.jl | 165 ++++++++++++++++++ gc-benchmarks/linked-list.jl | 81 +++++++++ .../matrix-static-arrays.jl | 14 +- {examples => gc-benchmarks}/utils.jl | 6 +- 4 files changed, 259 insertions(+), 7 deletions(-) create mode 100644 gc-benchmarks/binary-tree.jl create mode 100644 gc-benchmarks/linked-list.jl rename {examples => gc-benchmarks}/matrix-static-arrays.jl (74%) rename {examples => 
gc-benchmarks}/utils.jl (90%) diff --git a/gc-benchmarks/binary-tree.jl b/gc-benchmarks/binary-tree.jl new file mode 100644 index 00000000..b5a76629 --- /dev/null +++ b/gc-benchmarks/binary-tree.jl @@ -0,0 +1,165 @@ +using CUDAdrv, CUDAnative +using Random, Test +import Base: haskey, insert! + +include("utils.jl") + +# This benchmark defines a kernel that constructs a binary search +# tree for a set of numbers and then proceeds to test membership +# in that tree for a sequence of other numbers. +# +# The benchmark is designed to stress the allocator's ability to +# allocate many small objects and garbage-collect the ones that +# become dead after a while. + +"""A binary search tree node.""" +abstract type BinarySearchTreeNode{T} end + +"""An internal node of a binary search tree.""" +mutable struct InternalNode{T} <: BinarySearchTreeNode{T} + value::T + left::BinarySearchTreeNode{T} + right::BinarySearchTreeNode{T} +end + +InternalNode{T}(value::T) where T = InternalNode{T}(value, LeafNode{T}(), LeafNode{T}()) + +"""A leaf node of a binary search tree.""" +mutable struct LeafNode{T} <: BinarySearchTreeNode{T} end + +"""A binary search tree data structure.""" +mutable struct BinarySearchTree{T} + root::BinarySearchTreeNode{T} +end + +"""Creates an empty binary search tree.""" +BinarySearchTree{T}() where T = BinarySearchTree{T}(LeafNode{T}()) + +"""Tells if a binary search tree contains a particular element.""" +function haskey(tree::BinarySearchTree{T}, value::T)::Bool where T + walk = tree.root + while isa(walk, InternalNode{T}) + if walk.value == value + return true + elseif walk.value > value + walk = walk.right + else + walk = walk.left + end + end + return false +end + +"""Inserts an element into a binary search tree.""" +function insert!(tree::BinarySearchTree{T}, value::T) where T + if !isa(tree.root, InternalNode{T}) + tree.root = InternalNode{T}(value) + return + end + + walk = tree.root::InternalNode{T} + while true + if walk.value == value + return + 
elseif walk.value > value + right = walk.right + if isa(right, InternalNode{T}) + walk = right + else + walk.right = InternalNode{T}(value) + return + end + else + left = walk.left + if isa(left, InternalNode{T}) + walk = left + else + walk.left = InternalNode{T}(value) + return + end + end + end +end + +""" +Creates a binary search tree that contains elements copied from a device array. +""" +function BinarySearchTree{T}(elements::CUDAnative.DevicePtr{T}, size::Integer) where T + tree = BinarySearchTree{T}() + for i in 1:size + insert!(tree, unsafe_load(elements, i)) + end + tree +end + +""" +Creates a binary search tree that contains elements copied from an array. +""" +function BinarySearchTree{T}(elements::Array{T}) where T + tree = BinarySearchTree{T}() + for i in 1:length(elements) + insert!(tree, elements[i]) + end + tree +end + +# Gets a sequence of Fibonacci numbers. +function fibonacci(::Type{T}, count::Integer)::Array{T} where T + if count == 0 + return [] + elseif count == 1 + return [one(T)] + end + + results = [one(T), one(T)] + for i in 1:(count - 2) + push!(results, results[length(results) - 1] + results[length(results)]) + end + return results +end + +const number_count = 200 +const thread_count = 64 +const tests_per_thread = 2000 + +# Define a kernel that copies values using a temporary buffer. +function kernel(a::CUDAnative.DevicePtr{Int64}, b::CUDAnative.DevicePtr{Int64}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + tree = BinarySearchTree{Int64}(a, number_count) + + for j in 1:tests_per_thread + offset = (i - 1) * tests_per_thread + index = offset + j + unsafe_store!(b, haskey(tree, unsafe_load(b, index)), index) + end + + return +end + +function benchmark() + # Generate a sequence of 64-bit truncated Fibonacci numbers. + number_set = fibonacci(Int64, number_count) + # Randomize the sequence's order. + shuffle!(number_set) + + # Generate numbers for which we will test membership in the sequence. 
+ test_sequence = Array(1:(thread_count * tests_per_thread)) + + # Allocate two arrays. + source_array = Mem.alloc(Int64, length(number_set)) + destination_array = Mem.alloc(Int64, length(test_sequence)) + source_pointer = Base.unsafe_convert(CuPtr{Int64}, source_array) + destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + + # Fill the source and destination arrays. + Mem.upload!(source_array, number_set) + Mem.upload!(destination_array, test_sequence) + + # Run the kernel. + @cuda_sync threads=thread_count kernel(source_pointer, destination_pointer) + + @test Mem.download(Int64, destination_array, length(test_sequence)) == ([Int64(x in number_set) for x in test_sequence]) +end + +@cuda_benchmark benchmark() diff --git a/gc-benchmarks/linked-list.jl b/gc-benchmarks/linked-list.jl new file mode 100644 index 00000000..a8e0c616 --- /dev/null +++ b/gc-benchmarks/linked-list.jl @@ -0,0 +1,81 @@ +using CUDAnative, CUDAdrv, BenchmarkTools +using Test + +include("utils.jl") + +import Base: foldl, reduce, sum + +# This benchmark constructs a linked list in a GPU kernel. +# In doing so, it stresses the allocator's ability to quickly +# allocate many small objects, as is common in idiomatic +# object-oriented programs. +# Thread divergence should be minimal in this benchmark. 
+ +abstract type List{T} +end + +mutable struct Nil{T} <: List{T} +end + +mutable struct Cons{T} <: List{T} + value::T + next::List{T} +end + +Cons{T}(value::T) where T = Cons{T}(value, Nil{T}()) + +function List{T}(pointer, count::Integer) where T + result = Nil{T}() + for i in count:-1:1 + result = Cons{T}(unsafe_load(pointer, i), result) + end + result +end + +function foldl(op, list::List{T}; init) where T + node = list + accumulator = init + while isa(node, Cons{T}) + accumulator = op(accumulator, node.value) + node = node.next + end + accumulator +end + +function reduce(op, list::List{T}; init) where T + foldl(op, list; init=init) +end + +function sum(list::List{T}) where T + reduce(+, list; init=zero(T)) +end + +const element_count = 1000 +const thread_count = 32 + +function kernel(elements::CUDAnative.DevicePtr{Int64}, results::CUDAnative.DevicePtr{Int64}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + l = List{Int64}(elements, element_count) + unsafe_store!(results, sum(l), i) + return +end + +function benchmark() + # Allocate two arrays. + source_array = Mem.alloc(Int64, element_count) + destination_array = Mem.alloc(Int64, thread_count) + source_pointer = Base.unsafe_convert(CuPtr{Int64}, source_array) + destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + + # Fill the source and destination arrays. + Mem.upload!(source_array, Array(1:element_count)) + Mem.upload!(destination_array, zeros(Int64, thread_count)) + + # Run the kernel. + @cuda_sync threads=thread_count kernel(source_pointer, destination_pointer) + + # Verify the kernel's output. 
+ @test Mem.download(Int64, destination_array, thread_count) == repeat([sum(1:element_count)], thread_count) +end + +@cuda_benchmark benchmark() diff --git a/examples/matrix-static-arrays.jl b/gc-benchmarks/matrix-static-arrays.jl similarity index 74% rename from examples/matrix-static-arrays.jl rename to gc-benchmarks/matrix-static-arrays.jl index 5e174bf5..43002099 100644 --- a/examples/matrix-static-arrays.jl +++ b/gc-benchmarks/matrix-static-arrays.jl @@ -1,4 +1,10 @@ -using StaticArrays, CUDAnative, CUDAdrv, BenchmarkTools +using StaticArrays, CUDAnative, CUDAdrv + +include("utils.jl") + +# This benchmark makes every thread allocate a large matrix. +# It stresses the allocator's ability to quickly allocate +# a small number of very large objects. const matrix_dim = 40 const thread_count = 256 @@ -21,14 +27,10 @@ function kernel(result::CUDAnative.DevicePtr{Int64}) return end -include("utils.jl") - function benchmark() destination_array = Mem.alloc(Int64, thread_count) destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) @cuda_sync threads=thread_count kernel(destination_pointer) end -stats = @cuda_benchmark benchmark() -println(length(stats)) -println(stats) +@cuda_benchmark benchmark() diff --git a/examples/utils.jl b/gc-benchmarks/utils.jl similarity index 90% rename from examples/utils.jl rename to gc-benchmarks/utils.jl index f8720a82..dfb289a0 100644 --- a/examples/utils.jl +++ b/gc-benchmarks/utils.jl @@ -1,3 +1,5 @@ +import BenchmarkTools + use_gc = true """ @@ -53,6 +55,8 @@ end macro cuda_benchmark(ex) esc(quote - @benchmark $(ex) setup=($(ex)) teardown=(device_reset!()) evals=1 + local stats = BenchmarkTools.@benchmark $(ex) setup=($(ex)) teardown=(device_reset!()) evals=1 + println(length(stats)) + println(stats) end) end From a3fb2903976f5f237d47fc1d87712b759163833c Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 4 Apr 2019 13:26:45 +0200 Subject: [PATCH 081/146] Rename linked list benchmark import --- 
gc-benchmarks/linked-list.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gc-benchmarks/linked-list.jl b/gc-benchmarks/linked-list.jl index a8e0c616..ae8e3fcc 100644 --- a/gc-benchmarks/linked-list.jl +++ b/gc-benchmarks/linked-list.jl @@ -1,4 +1,4 @@ -using CUDAnative, CUDAdrv, BenchmarkTools +using CUDAnative, CUDAdrv using Test include("utils.jl") From 79ea2a1df301d218c44255cf00473c3c2814805d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 4 Apr 2019 13:27:25 +0200 Subject: [PATCH 082/146] Rename matrix GC benchmark --- gc-benchmarks/{matrix-static-arrays.jl => matrix.jl} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename gc-benchmarks/{matrix-static-arrays.jl => matrix.jl} (100%) diff --git a/gc-benchmarks/matrix-static-arrays.jl b/gc-benchmarks/matrix.jl similarity index 100% rename from gc-benchmarks/matrix-static-arrays.jl rename to gc-benchmarks/matrix.jl From f2dbf3f3ed548749f4db45a86c8f047b4f5a4f5a Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 4 Apr 2019 14:22:42 +0200 Subject: [PATCH 083/146] Set the malloc heap size when running benchmarks --- gc-benchmarks/utils.jl | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index dfb289a0..868ccbb7 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -2,6 +2,18 @@ import BenchmarkTools use_gc = true +const MiB = 1 << 20 +const CU_LIMIT_MALLOC_HEAP_SIZE = 0x02 +const BENCHMARK_HEAP_SIZE = 64 * MiB + +function set_malloc_heap_size(size::Integer) + CUDAdrv.@apicall( + :cuCtxSetLimit, + (Cint, Csize_t), + CU_LIMIT_MALLOC_HEAP_SIZE, + Csize_t(size)) +end + """ device_reset!(dev::CuDevice=device()) @@ -55,7 +67,7 @@ end macro cuda_benchmark(ex) esc(quote - local stats = BenchmarkTools.@benchmark $(ex) setup=($(ex)) teardown=(device_reset!()) evals=1 + local stats = BenchmarkTools.@benchmark $(ex) setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $(ex)) teardown=(device_reset!()) 
evals=1 println(length(stats)) println(stats) end) From b57b9022a1cf1dd2fbc96fd9bd2dffd591587ef9 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 4 Apr 2019 14:22:54 +0200 Subject: [PATCH 084/146] Add an array benchmark --- gc-benchmarks/arrays.jl | 51 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 gc-benchmarks/arrays.jl diff --git a/gc-benchmarks/arrays.jl b/gc-benchmarks/arrays.jl new file mode 100644 index 00000000..160824ce --- /dev/null +++ b/gc-benchmarks/arrays.jl @@ -0,0 +1,51 @@ +using CUDAdrv, CUDAnative, StaticArrays + +include("utils.jl") + +# This benchmark allocates a variety of differently-sized arrays. +# The point of this benchmark is to ascertain how well the GC handles +# many differently-sized objects. + +const thread_count = 64 + +@noinline function escape(value) + Base.pointer_from_objref(value) + value +end + +macro new_array(T, size) + quote + escape(zeros(MArray{Tuple{$size}, $T})) + end +end + +function kernel() + for i in 1:2 + for j in 1:2 + for k in 1:2 + for l in 1:2 + @new_array(Int64, 4) + @new_array(Int64, 8) + @new_array(Int64, 16) + end + @new_array(Int64, 32) + @new_array(Int64, 64) + @new_array(Int64, 128) + end + @new_array(Int64, 256) + @new_array(Int64, 512) + @new_array(Int64, 1024) + end + @new_array(Int64, 2048) + @new_array(Int64, 4096) + @new_array(Int64, 8192) + end + return +end + +function benchmark() + # Run the kernel. + @cuda_sync threads=thread_count kernel() +end + +@cuda_benchmark benchmark() From 89d8bbc96b74bc596655879524c4550ea51abe71 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 4 Apr 2019 14:28:23 +0200 Subject: [PATCH 085/146] Reuse 'device_reset!' 
in benchmarking utils --- gc-benchmarks/utils.jl | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index 868ccbb7..5982e3c4 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -14,26 +14,6 @@ function set_malloc_heap_size(size::Integer) Csize_t(size)) end -""" - device_reset!(dev::CuDevice=device()) - -Reset the CUDA state associated with a device. This call with release the underlying -context, at which point any objects allocated in that context will be invalidated. -""" -function device_reset!(dev::CuDevice=CUDAdrv.device()) - delete!(CUDAnative.device_contexts, dev) - - pctx = CuPrimaryContext(dev) - unsafe_reset!(pctx) - - # unless the user switches devices, new API calls should trigger initialization - CUDAdrv.apicall_hook[] = CUDAnative.maybe_initialize - CUDAnative.initialized[] = false - - # HACK: primary contexts always have the same handle, defeating the compilation cache - empty!(CUDAnative.compilecache) -end - """ @sync ex Run expression `ex` and synchronize the GPU afterwards. 
This is a CPU-friendly From fac07ef8b4b2ecea753e8a474af4daa1a81db679 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 9 Apr 2019 15:41:06 +0200 Subject: [PATCH 086/146] Create a GC benchmark driver --- gc-benchmarks/arrays.jl | 12 +++++----- gc-benchmarks/binary-tree.jl | 19 +++++++++------- gc-benchmarks/linked-list.jl | 23 ++++++++++--------- gc-benchmarks/matrix.jl | 27 ++++++++++++++-------- gc-benchmarks/run-all.jl | 10 +++++++++ gc-benchmarks/utils.jl | 43 ++++++++++++++++++++++++++---------- 6 files changed, 89 insertions(+), 45 deletions(-) create mode 100644 gc-benchmarks/run-all.jl diff --git a/gc-benchmarks/arrays.jl b/gc-benchmarks/arrays.jl index 160824ce..49326cb6 100644 --- a/gc-benchmarks/arrays.jl +++ b/gc-benchmarks/arrays.jl @@ -1,6 +1,6 @@ -using CUDAdrv, CUDAnative, StaticArrays +module Arrays -include("utils.jl") +using CUDAdrv, CUDAnative, StaticArrays # This benchmark allocates a variety of differently-sized arrays. # The point of this benchmark is to ascertain how well the GC handles @@ -43,9 +43,11 @@ function kernel() return end -function benchmark() +end + +function arrays_benchmark() # Run the kernel. - @cuda_sync threads=thread_count kernel() + @cuda_sync threads=Arrays.thread_count Arrays.kernel() end -@cuda_benchmark benchmark() +@cuda_benchmark "arrays" arrays_benchmark() diff --git a/gc-benchmarks/binary-tree.jl b/gc-benchmarks/binary-tree.jl index b5a76629..99d98afc 100644 --- a/gc-benchmarks/binary-tree.jl +++ b/gc-benchmarks/binary-tree.jl @@ -1,8 +1,9 @@ -using CUDAdrv, CUDAnative using Random, Test -import Base: haskey, insert! -include("utils.jl") +module BinaryTree + +using CUDAdrv, CUDAnative +import Base: haskey, insert! 
# This benchmark defines a kernel that constructs a binary search # tree for a set of numbers and then proceeds to test membership @@ -137,14 +138,16 @@ function kernel(a::CUDAnative.DevicePtr{Int64}, b::CUDAnative.DevicePtr{Int64}) return end -function benchmark() +end + +function bintree_benchmark() # Generate a sequence of 64-bit truncated Fibonacci numbers. - number_set = fibonacci(Int64, number_count) + number_set = BinaryTree.fibonacci(Int64, BinaryTree.number_count) # Randomize the sequence's order. shuffle!(number_set) # Generate numbers for which we will test membership in the sequence. - test_sequence = Array(1:(thread_count * tests_per_thread)) + test_sequence = Array(1:(BinaryTree.thread_count * BinaryTree.tests_per_thread)) # Allocate two arrays. source_array = Mem.alloc(Int64, length(number_set)) @@ -157,9 +160,9 @@ function benchmark() Mem.upload!(destination_array, test_sequence) # Run the kernel. - @cuda_sync threads=thread_count kernel(source_pointer, destination_pointer) + @cuda_sync threads=BinaryTree.thread_count BinaryTree.kernel(source_pointer, destination_pointer) @test Mem.download(Int64, destination_array, length(test_sequence)) == ([Int64(x in number_set) for x in test_sequence]) end -@cuda_benchmark benchmark() +@cuda_benchmark "binary-tree" bintree_benchmark() diff --git a/gc-benchmarks/linked-list.jl b/gc-benchmarks/linked-list.jl index ae8e3fcc..84f76fc5 100644 --- a/gc-benchmarks/linked-list.jl +++ b/gc-benchmarks/linked-list.jl @@ -1,8 +1,7 @@ +module LinkedList + using CUDAnative, CUDAdrv using Test - -include("utils.jl") - import Base: foldl, reduce, sum # This benchmark constructs a linked list in a GPU kernel. @@ -60,22 +59,24 @@ function kernel(elements::CUDAnative.DevicePtr{Int64}, results::CUDAnative.Devic return end -function benchmark() +end + +function linkedlist_benchmark() # Allocate two arrays. 
- source_array = Mem.alloc(Int64, element_count) - destination_array = Mem.alloc(Int64, thread_count) + source_array = Mem.alloc(Int64, LinkedList.element_count) + destination_array = Mem.alloc(Int64, LinkedList.thread_count) source_pointer = Base.unsafe_convert(CuPtr{Int64}, source_array) destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) # Fill the source and destination arrays. - Mem.upload!(source_array, Array(1:element_count)) - Mem.upload!(destination_array, zeros(Int64, thread_count)) + Mem.upload!(source_array, Array(1:LinkedList.element_count)) + Mem.upload!(destination_array, zeros(Int64, LinkedList.thread_count)) # Run the kernel. - @cuda_sync threads=thread_count kernel(source_pointer, destination_pointer) + @cuda_sync threads=LinkedList.thread_count LinkedList.kernel(source_pointer, destination_pointer) # Verify the kernel's output. - @test Mem.download(Int64, destination_array, thread_count) == repeat([sum(1:element_count)], thread_count) + @test Mem.download(Int64, destination_array, LinkedList.thread_count) == repeat([sum(1:LinkedList.element_count)], LinkedList.thread_count) end -@cuda_benchmark benchmark() +@cuda_benchmark "linked-list" linkedlist_benchmark() diff --git a/gc-benchmarks/matrix.jl b/gc-benchmarks/matrix.jl index 43002099..fa772e8e 100644 --- a/gc-benchmarks/matrix.jl +++ b/gc-benchmarks/matrix.jl @@ -1,14 +1,19 @@ -using StaticArrays, CUDAnative, CUDAdrv +module Matrix -include("utils.jl") +using StaticArrays, CUDAnative, CUDAdrv # This benchmark makes every thread allocate a large matrix. # It stresses the allocator's ability to quickly allocate -# a small number of very large objects. +# very large objects. 
const matrix_dim = 40 const thread_count = 256 +@noinline function escape(value) + Base.pointer_from_objref(value) + value +end + function fill() m = zeros(MMatrix{matrix_dim, matrix_dim, Int64}) @@ -18,19 +23,23 @@ function fill() end end - return m + return escape(m) end function kernel(result::CUDAnative.DevicePtr{Int64}) thread_id = (blockIdx().x - 1) * blockDim().x + threadIdx().x - unsafe_store!(result, fill()[20, 30], thread_id) + for i in 1:6 + unsafe_store!(result, fill()[20, 30], thread_id) + end return end -function benchmark() - destination_array = Mem.alloc(Int64, thread_count) +end + +function matrix_benchmark() + destination_array = Mem.alloc(Int64, Matrix.thread_count) destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) - @cuda_sync threads=thread_count kernel(destination_pointer) + @cuda_sync threads=Matrix.thread_count Matrix.kernel(destination_pointer) end -@cuda_benchmark benchmark() +@cuda_benchmark "matrix" matrix_benchmark() diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl new file mode 100644 index 00000000..13050498 --- /dev/null +++ b/gc-benchmarks/run-all.jl @@ -0,0 +1,10 @@ +using CUDAdrv, CUDAnative + +include("utils.jl") + +include("arrays.jl") +include("binary-tree.jl") +include("linked-list.jl") +include("matrix.jl") + +println(run_benchmarks()) diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index 5982e3c4..e8d21900 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -1,6 +1,12 @@ import BenchmarkTools -use_gc = true +function should_use_gc() + try + return use_gc + catch ex + return true + end +end const MiB = 1 << 20 const CU_LIMIT_MALLOC_HEAP_SIZE = 0x02 @@ -34,21 +40,34 @@ macro sync(ex) end macro cuda_sync(args...) - if use_gc - esc(quote + esc(quote + if should_use_gc() CUDAnative.@cuda_gc $(args...) - end) - else - esc(quote + else @sync CUDAnative.@cuda $(args...) 
- end) - end + end + end) end -macro cuda_benchmark(ex) +suite = BenchmarkTools.BenchmarkGroup() + +function register_cuda_benchmark(f, name) + suite[name] = BenchmarkTools.@benchmarkable $f() setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $f()) teardown=(device_reset!()) evals=1 +end + +macro cuda_benchmark(name, ex) esc(quote - local stats = BenchmarkTools.@benchmark $(ex) setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $(ex)) teardown=(device_reset!()) evals=1 - println(length(stats)) - println(stats) + register_cuda_benchmark($name * "-gc") do + global use_gc = true + $(ex) + end + register_cuda_benchmark($name * "-nogc") do + global use_gc = false + $(ex) + end end) end + +function run_benchmarks() + BenchmarkTools.run(suite) +end From f8d4edeb28ab537ec78f60a4dd3cb27423b451d5 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 9 Apr 2019 16:23:52 +0200 Subject: [PATCH 087/146] Include an SSA IR optimization benchmark --- gc-benchmarks/run-all.jl | 1 + gc-benchmarks/ssa-opt.jl | 100 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 gc-benchmarks/ssa-opt.jl diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 13050498..256fbe38 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -6,5 +6,6 @@ include("arrays.jl") include("binary-tree.jl") include("linked-list.jl") include("matrix.jl") +include("ssa-opt.jl") println(run_benchmarks()) diff --git a/gc-benchmarks/ssa-opt.jl b/gc-benchmarks/ssa-opt.jl new file mode 100644 index 00000000..3f8c3a39 --- /dev/null +++ b/gc-benchmarks/ssa-opt.jl @@ -0,0 +1,100 @@ +# This benchmark defines a sea-of-nodes SSA IR, creates a basic +# block on the GPU and applies the constant folding optimization +# to it. + +module SSAOpt + +# A base type for SSA instructions. +abstract type Instruction end + +# A base type for values or flow in an SSA basic block. +abstract type ValueOrFlow end + +# A value in an SSA control-flow graph. 
+mutable struct Value <: ValueOrFlow + # The instruction that computes the value. + instruction::Instruction + + # The next value or control-flow instruction. + next::ValueOrFlow +end + +# A base type for control-flow instructions in an SSA basic block. +abstract type Flow <: ValueOrFlow end + +# A control-flow instruction that returns a value. +mutable struct ReturnFlow <: Flow + # The value to return. + result::Value +end + +# A control-flow instruction that represents undefined control flow. +mutable struct UndefinedFlow <: Flow end + +# A basic block in an SSA control-flow graph. +mutable struct BasicBlock + # The first value or flow instruction in the basic block. + head::ValueOrFlow +end + +# An integer constant instruction. +mutable struct IConst <: Instruction + value::Int +end + +# An integer addition instruction. +mutable struct IAdd <: Instruction + # The left value. + left::Value + # The right value. + right::Value +end + +# Folds constants in a basic block. +function fold_constants(block::BasicBlock) + value = block.head + while isa(value, Value) + insn = value.instruction + if isa(insn, IAdd) + left = insn.left.instruction + right = insn.right.instruction + if isa(left, IConst) + if isa(right, IConst) + value.instruction = IConst(left.value + right.value) + end + end + end + value = value.next + end + block +end + +# Creates a block that naively computes `sum(1:range_max)`. 
+function create_range_sum_block(range_max) + head = accumulator = Value(IConst(0), UndefinedFlow()) + for i in 1:range_max + constant = Value(IConst(i), UndefinedFlow()) + accumulator.next = constant + accumulator = Value(IAdd(accumulator, constant), UndefinedFlow()) + constant.next = accumulator + end + ret_flow = ReturnFlow(accumulator) + accumulator.next = ret_flow + BasicBlock(head) +end + +const thread_count = 256 + +function kernel() + block = create_range_sum_block(50) + fold_constants(block) + return +end + +end + +function ssaopt_benchmark() + @cuda_sync threads=SSAOpt.thread_count SSAOpt.kernel() +end + +@cuda_benchmark "ssa-opt" ssaopt_benchmark() From 67ac9de09a33e1f43984a01cfa55d87ea6c570a2 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 9 Apr 2019 16:34:37 +0200 Subject: [PATCH 088/146] Tweak ssa-opt benchmark comment --- gc-benchmarks/ssa-opt.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gc-benchmarks/ssa-opt.jl b/gc-benchmarks/ssa-opt.jl index 3f8c3a39..e499e543 100644 --- a/gc-benchmarks/ssa-opt.jl +++ b/gc-benchmarks/ssa-opt.jl @@ -1,4 +1,4 @@ -# This benchmark defines a sea-of-nodes SSA IR, creates a basic +# This benchmark defines a simple SSA IR, creates a basic # block on the GPU and applies the constant folding optimization # to it. 
From 727a9281dc1113d5c059c032e1cca87735b71d8d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 9 Apr 2019 17:15:27 +0200 Subject: [PATCH 089/146] Write benchmark results to a CSV --- gc-benchmarks/binary-tree.jl | 2 +- gc-benchmarks/linked-list.jl | 2 +- gc-benchmarks/run-all.jl | 17 ++++++++++++++++- gc-benchmarks/ssa-opt.jl | 2 +- gc-benchmarks/utils.jl | 9 +++++---- 5 files changed, 24 insertions(+), 8 deletions(-) diff --git a/gc-benchmarks/binary-tree.jl b/gc-benchmarks/binary-tree.jl index 99d98afc..e7b3e46d 100644 --- a/gc-benchmarks/binary-tree.jl +++ b/gc-benchmarks/binary-tree.jl @@ -165,4 +165,4 @@ function bintree_benchmark() @test Mem.download(Int64, destination_array, length(test_sequence)) == ([Int64(x in number_set) for x in test_sequence]) end -@cuda_benchmark "binary-tree" bintree_benchmark() +@cuda_benchmark "binary tree" bintree_benchmark() diff --git a/gc-benchmarks/linked-list.jl b/gc-benchmarks/linked-list.jl index 84f76fc5..762bf6de 100644 --- a/gc-benchmarks/linked-list.jl +++ b/gc-benchmarks/linked-list.jl @@ -79,4 +79,4 @@ function linkedlist_benchmark() @test Mem.download(Int64, destination_array, LinkedList.thread_count) == repeat([sum(1:LinkedList.element_count)], LinkedList.thread_count) end -@cuda_benchmark "linked-list" linkedlist_benchmark() +@cuda_benchmark "linked list" linkedlist_benchmark() diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 256fbe38..ca4ab50b 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -8,4 +8,19 @@ include("linked-list.jl") include("matrix.jl") include("ssa-opt.jl") -println(run_benchmarks()) +results = run_benchmarks() +# Print the results to the terminal. +println(results) + +# Also write them to a CSV for further analysis. 
+open("results.csv", "w") do file + write(file, "benchmark,nogc,gc,ratio\n") + for key in sort([k for k in keys(results)]) + runs = results[key] + median_times = BenchmarkTools.median(runs) + gc_time = median_times["gc"].time / 1e6 + nogc_time = median_times["nogc"].time / 1e6 + ratio = gc_time / nogc_time + write(file, "$key,$nogc_time,$gc_time,$ratio\n") + end +end diff --git a/gc-benchmarks/ssa-opt.jl b/gc-benchmarks/ssa-opt.jl index e499e543..b7c10238 100644 --- a/gc-benchmarks/ssa-opt.jl +++ b/gc-benchmarks/ssa-opt.jl @@ -97,4 +97,4 @@ function ssaopt_benchmark() @cuda_sync threads=SSAOpt.thread_count SSAOpt.kernel() end -@cuda_benchmark "ssa-opt" ssaopt_benchmark() +@cuda_benchmark "ssa opt" ssaopt_benchmark() diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index e8d21900..5623bb02 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -51,17 +51,18 @@ end suite = BenchmarkTools.BenchmarkGroup() -function register_cuda_benchmark(f, name) - suite[name] = BenchmarkTools.@benchmarkable $f() setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $f()) teardown=(device_reset!()) evals=1 +function register_cuda_benchmark(f, name, config) + suite[name][config] = BenchmarkTools.@benchmarkable $f() setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $f()) teardown=(device_reset!()) evals=1 seconds=90 end macro cuda_benchmark(name, ex) esc(quote - register_cuda_benchmark($name * "-gc") do + suite[$name] = BenchmarkTools.BenchmarkGroup(["gc", "nogc"]) + register_cuda_benchmark($name, "gc") do global use_gc = true $(ex) end - register_cuda_benchmark($name * "-nogc") do + register_cuda_benchmark($name, "nogc") do global use_gc = false $(ex) end From fabdea9e7009c1c5aa9cfcc28d8a6da0513900e8 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 10 Apr 2019 15:48:46 +0200 Subject: [PATCH 090/146] Add two additional GC benchmarks --- gc-benchmarks/genetic-algorithm.jl | 179 +++++++++++++++++++++++++++++ gc-benchmarks/linked-list.jl | 41 ++++++- 
gc-benchmarks/run-all.jl | 4 +- gc-benchmarks/stream-queries.jl | 31 +++++ 4 files changed, 252 insertions(+), 3 deletions(-) create mode 100644 gc-benchmarks/genetic-algorithm.jl create mode 100644 gc-benchmarks/stream-queries.jl diff --git a/gc-benchmarks/genetic-algorithm.jl b/gc-benchmarks/genetic-algorithm.jl new file mode 100644 index 00000000..69a7fcac --- /dev/null +++ b/gc-benchmarks/genetic-algorithm.jl @@ -0,0 +1,179 @@ +module GeneticAlgorithm + +# This benchmark runs a genetic algorithm on the GPU. +# The population is stored in linked lists and characters +# are stored in heap memory. + +using CUDAnative, CUDAdrv +import ..LinkedList: List, Nil, Cons, foldl, map, max + +# A character in our genetic algorithm, based loosely on Fallout's SPECIAL system. +mutable struct Character + strength::Int + perception::Int + endurance::Int + charisma::Int + intelligence::Int + agility::Int + luck::Int +end + +# A linear congruential pseudo-random number generator. +mutable struct LinearCongruentialGenerator + modulus::Int + a::Int + c::Int + state::Int +end + +LinearCongruentialGenerator(seed::Int) = LinearCongruentialGenerator(1 << 32, 1664525, 1013904223, seed) + +# Requests a pseudo-random number. +function next(generator::LinearCongruentialGenerator)::Int + generator.state = (generator.a * generator.state + generator.c) % generator.modulus + generator.state +end + +# Requests a pseudo-random number that is at least as great as `lower` +# and less than `upper`. +function next(generator::LinearCongruentialGenerator, lower::Int, upper::Int)::Int + lower + next(generator) % (upper - lower) +end + +# Computes the mean of two integers. 
+function mean(a::Int, b::Int)::Int + div(a + b, 2) +end + +function crossover(parent_one::Character, parent_two::Character)::Character + Character( + mean(parent_one.strength, parent_two.strength), + mean(parent_one.perception, parent_two.perception), + mean(parent_one.endurance, parent_two.endurance), + mean(parent_one.charisma, parent_two.charisma), + mean(parent_one.intelligence, parent_two.intelligence), + mean(parent_one.agility, parent_two.agility), + mean(parent_one.luck, parent_two.luck)) +end + +function mutate_stat(value::Int, generator::LinearCongruentialGenerator)::Int + new_stat = value + next(generator, -2, 3) + if new_stat > 10 + return 10 + elseif new_stat < 0 + return 0 + else + return new_stat + end +end + +function mutate(original::Character, generator::LinearCongruentialGenerator)::Character + Character( + mutate_stat(original.strength, generator), + mutate_stat(original.perception, generator), + mutate_stat(original.endurance, generator), + mutate_stat(original.charisma, generator), + mutate_stat(original.intelligence, generator), + mutate_stat(original.agility, generator), + mutate_stat(original.luck, generator)) +end + +function random_character(generator::LinearCongruentialGenerator)::Character + Character( + next(generator, 0, 11), + next(generator, 0, 11), + next(generator, 0, 11), + next(generator, 0, 11), + next(generator, 0, 11), + next(generator, 0, 11), + next(generator, 0, 11)) +end + +# Computes the fitness of a character. +function fitness(individual::Character)::Float64 + # Compute the character's cost, i.e., the sum of their stats. + cost = Float64(individual.strength + + individual.perception + + individual.endurance + + individual.charisma + + individual.intelligence + + individual.agility + + individual.luck) + + # Compute the character's true fitness, i.e., how well we expect + # the character to perform. 
+ true_fitness = 0.0 + + function stat_fitness(stat::Int)::Float64 + if stat >= 5 + # Linear returns for stats greater than five. + return Float64(stat) + else + # Very low stats make for a poor character build. + return Float64(stat * stat) / 25.0 + end + end + + # Evaluate stats. + true_fitness += stat_fitness(individual.strength) + true_fitness += stat_fitness(individual.perception) + true_fitness += stat_fitness(individual.endurance) + true_fitness += stat_fitness(individual.charisma) + true_fitness += stat_fitness(individual.intelligence) + true_fitness += stat_fitness(individual.agility) + true_fitness += stat_fitness(individual.luck) + + # We like charisma, intelligence and luck. + true_fitness += Float64(individual.charisma) + true_fitness += Float64(individual.intelligence) + true_fitness += Float64(individual.luck) + + true_fitness - cost + 100.0 +end + +function fittest(population::List{Character})::Character + max(fitness, population, Character(0, 0, 0, 0, 0, 0, 0)) +end + +function step(population::List{Character}, generator::LinearCongruentialGenerator)::List{Character} + # Find the fittest individual in the population. + best = fittest(population) + # Do a bunch of crossovers and mutate the resulting population. + map(x -> mutate(crossover(best, x), generator), population) +end + +function genetic_algo(seed::Int)::Character + generator = LinearCongruentialGenerator(seed) + + # Generate some random characters. + individuals = Nil{Character}() + for j in 1:10 + individuals = Cons{Character}(random_character(generator), individuals) + end + + # Run the genetic algorithm for a few iterations. + for j in 1:10 + individuals = step(individuals, generator) + end + + # Find the best individual in the population. 
+ fittest(individuals) +end + +const thread_count = 256 + +function kernel(results::CUDAnative.DevicePtr{Float64}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + fittest_individual = genetic_algo(i) + unsafe_store!(results, fitness(fittest_individual), i) +end + +end + +function genetic_benchmark() + destination_array = Mem.alloc(Float64, GeneticAlgorithm.thread_count) + destination_pointer = Base.unsafe_convert(CuPtr{Float64}, destination_array) + @cuda_sync threads=GeneticAlgorithm.thread_count GeneticAlgorithm.kernel(destination_pointer) +end + +@cuda_benchmark "genetic algo" genetic_benchmark() diff --git a/gc-benchmarks/linked-list.jl b/gc-benchmarks/linked-list.jl index 762bf6de..64810ead 100644 --- a/gc-benchmarks/linked-list.jl +++ b/gc-benchmarks/linked-list.jl @@ -1,8 +1,7 @@ module LinkedList using CUDAnative, CUDAdrv -using Test -import Base: foldl, reduce, sum +import Base: foldl, reduce, sum, max, map, reverse, filter # This benchmark constructs a linked list in a GPU kernel. 
# In doing so, it stresses the allocator's ability to quickly @@ -49,6 +48,44 @@ function sum(list::List{T}) where T reduce(+, list; init=zero(T)) end +function map_reverse(f::Function, list::List{T})::List{T} where T + foldl(list; init=Nil{T}()) do accumulator, value + Cons{T}(f(value), accumulator) + end +end + +function reverse(list::List{T})::List{T} where T + map_reverse(x -> x, list) +end + +function map(f::Function, list::List{T})::List{T} where T + reverse(map_reverse(f, list)) +end + +function max(evaluate::Function, list::List{T}, default_value::T)::T where T + foldl(list; init=default_value) do max_elem, elem + if evaluate(max_elem) < evaluate(elem) + elem + else + max_elem + end + end +end + +function filter_reverse(f::Function, list::List{T})::List{T} where T + foldl(list; init=Nil{T}()) do accumulator, value + if f(value) + Cons{T}(value, accumulator) + else + accumulator + end + end +end + +function filter(f::Function, list::List{T})::List{T} where T + reverse(filter_reverse(f, list)) +end + const element_count = 1000 const thread_count = 32 diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index ca4ab50b..71d8f4fc 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -1,4 +1,4 @@ -using CUDAdrv, CUDAnative +using CUDAdrv, CUDAnative, Test include("utils.jl") @@ -7,6 +7,8 @@ include("binary-tree.jl") include("linked-list.jl") include("matrix.jl") include("ssa-opt.jl") +include("stream-queries.jl") +include("genetic-algorithm.jl") results = run_benchmarks() # Print the results to the terminal. diff --git a/gc-benchmarks/stream-queries.jl b/gc-benchmarks/stream-queries.jl new file mode 100644 index 00000000..0c992d5f --- /dev/null +++ b/gc-benchmarks/stream-queries.jl @@ -0,0 +1,31 @@ +module StreamQueries + +using CUDAnative, CUDAdrv +import ..LinkedList: List, Nil, Cons, foldl, map, max, filter + +# This benchmark applies stream operators (map, max,filter) to purely +# functional lists. 
+ +const thread_count = 256 +const input_size = 100 + +function kernel(input::CUDAnative.DevicePtr{Float64}, output::CUDAnative.DevicePtr{Float64}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + values = List{Float64}(input, input_size) + values = map(x -> x * x, values) + values = filter(x -> x < 10.0 && x >= 0.0, values) + unsafe_store!(output, max(x -> x, values, 0.0), i) +end + +end + +function stream_benchmark() + source_array = Mem.alloc(Float64, StreamQueries.input_size) + Mem.upload!(source_array, rand(Float64, StreamQueries.input_size)) + destination_array = Mem.alloc(Float64, StreamQueries.thread_count) + source_pointer = Base.unsafe_convert(CuPtr{Float64}, source_array) + destination_pointer = Base.unsafe_convert(CuPtr{Float64}, destination_array) + @cuda_sync threads=StreamQueries.thread_count StreamQueries.kernel(source_pointer, destination_pointer) +end + +@cuda_benchmark "stream queries" stream_benchmark() From 8dba84df444d0101230618bc94c46dee4e6ef481 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 15 Apr 2019 16:44:54 +0200 Subject: [PATCH 091/146] Support creating one-dimensional arrays --- examples/gpu-array.jl | 18 +++ src/compiler/optim.jl | 281 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 290 insertions(+), 9 deletions(-) create mode 100644 examples/gpu-array.jl diff --git a/examples/gpu-array.jl b/examples/gpu-array.jl new file mode 100644 index 00000000..ce97b4cc --- /dev/null +++ b/examples/gpu-array.jl @@ -0,0 +1,18 @@ +using CUDAdrv, CUDAnative, StaticArrays, InteractiveUtils + +# This example allocates an array in a GPU kernel. 
+ +const thread_count = 64 + +@noinline function escape(value) + Base.pointer_from_objref(value) + value +end + +function kernel() + array = [1, 2, 3, 4, 5, 6, 7] + escape(array) + return +end + +@cuda_gc threads=thread_count kernel() diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index cf5e8da3..3dfae4d3 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -73,6 +73,7 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; int if job.gc add!(pm, FunctionPass("InsertSafepointsGPUGC", fun -> insert_safepoints_gpugc!(fun, entry))) add!(pm, ModulePass("FinalLowerGPUGC", lower_final_gc_intrinsics_gpugc!)) + add!(pm, FunctionPass("LowerArrays", lower_array_calls!)) else add!(pm, ModulePass("FinalLowerNoGC", lower_final_gc_intrinsics_nogc!)) end @@ -518,6 +519,45 @@ function lower_final_gc_intrinsics_nogc!(mod::LLVM.Module) return changed end +# Emits instructions that allocate a particular number of bytes +# of GC-managed memory. No headroom is included. No tags are set. +function new_bytes!(builder::LLVM.Builder, size) + call!(builder, Runtime.get(:gc_malloc_object), [size]) +end + +# Emits instructions that allocate bytes for an object, including +# headroom for the object's tag. Also fills in the object's tag if +# one is provided. +function new_object!(builder::LLVM.Builder, size, tag::Union{Type, Nothing} = nothing) + # We need to reserve a single pointer of headroom for the tag. + # (LateLowerGCFrame depends on us doing that.) + headroom = Runtime.tag_size + + # Call the allocation function and bump the resulting pointer + # so the headroom sits just in front of the returned pointer. + total_size = add!(builder, size, ConstantInt(Int32(headroom), JuliaContext())) + obj_ptr = new_bytes!(builder, total_size) + + jl_value_t = llvmtype(obj_ptr) + T_bitcast = LLVM.PointerType(jl_value_t, LLVM.addrspace(jl_value_t)) + + ptr = bitcast!(builder, obj_ptr, T_bitcast) + if tag != nothing + # Fill in the tag if we have one. 
+ store!( + builder, + inttoptr!( + builder, + ConstantInt( + convert(LLVMType, Int64), + Int64(pointer_from_objref(tag))), + jl_value_t), + ptr) + end + bumped_ptr = gep!(builder, ptr, [ConstantInt(Int32(1), JuliaContext())]) + return bitcast!(builder, bumped_ptr, jl_value_t) +end + """ lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) @@ -533,10 +573,6 @@ function lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) # store for an object, including headroom, but does not set the object's # tag. visit_calls_to("julia.gc_alloc_bytes", mod) do call, gc_alloc_bytes - gc_alloc_bytes_ft = eltype(llvmtype(gc_alloc_bytes))::LLVM.FunctionType - T_ret = return_type(gc_alloc_bytes_ft)::LLVM.PointerType - T_bitcast = LLVM.PointerType(T_ret, LLVM.addrspace(T_ret)) - # Decode the call. ops = collect(operands(call)) size = ops[2] @@ -549,11 +585,7 @@ function lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) # so the headroom sits just in front of the returned pointer. let builder = Builder(JuliaContext()) position!(builder, call) - total_size = add!(builder, size, ConstantInt(Int32(headroom), JuliaContext())) - ptr = call!(builder, Runtime.get(:gc_malloc_object), [total_size]) - cast_ptr = bitcast!(builder, ptr, T_bitcast) - bumped_ptr = gep!(builder, cast_ptr, [ConstantInt(Int32(1), JuliaContext())]) - result_ptr = bitcast!(builder, bumped_ptr, T_ret) + result_ptr = new_object!(builder, size) replace_uses!(call, result_ptr) unsafe_delete!(LLVM.parent(call), call) dispose(builder) @@ -709,6 +741,237 @@ function insert_safepoints_gpugc!(fun::LLVM.Function, entry::LLVM.Function) return true end +# Tries to evaluate an LLVM IR constant as a literal pointer. +function to_literal_pointer(value)::Tuple{Bool, Ptr{Cvoid}} + if !isa(value, LLVM.ConstantExpr) + return (false, C_NULL) + end + + if !occursin("inttoptr", string(value)) + return (false, C_NULL) + end + + # Peel off addrspacecast and inttoptr. 
+ ptr_arg = value + while occursin("addrspacecast", string(ptr_arg)) || occursin("inttoptr", string(ptr_arg)) + ptr_arg = first(operands(ptr_arg)) + end + ptr_val = convert(Int, ptr_arg) + (true, Ptr{Cvoid}(ptr_val)) +end + +# Visits all calls to literal pointers in a function. +function visit_literal_pointer_calls(visit_call::Function, fun::LLVM.Function) + for block in blocks(fun) + for call in instructions(block) + if !isa(call, LLVM.CallInst) + continue + end + + callee = called_value(call) + if !isa(callee, LLVM.ConstantExpr) + continue + end + + # detect calls to literal pointers + # FIXME: can we detect these properly? + # FIXME: jl_apply_generic and jl_invoke also have such arguments + is_ptr, ptr = to_literal_pointer(callee) + if is_ptr + # look it up in the Julia JIT cache + frames = ccall(:jl_lookup_code_address, Any, (Ptr{Cvoid}, Cint,), ptr, 0) + if length(frames) >= 1 + # @compiler_assert length(frames) == 1 job frames=frames + fn, file, line, linfo, fromC, inlined, ip = last(frames) + visit_call(call, fn) + end + end + end + end +end + +# Emits instructions that create a new array. The array's element type +# must be statically known. Its dimensions are represented as a tuple +# of LLVM IR values. A pointer to the new array is returned. 
+function new_array!(builder::LLVM.Builder, array_type::Type, dims::Tuple) + # Since time immemorial, the structure of an array is (quoting from the + # Julia source code here): + # + # typedef struct { + # /* + # how - allocation style + # 0 = data is inlined, or a foreign pointer we don't manage + # 1 = julia-allocated buffer that needs to be marked + # 2 = malloc-allocated pointer this array object manages + # 3 = has a pointer to the object that owns the data + # */ + # uint16_t how:2; + # uint16_t ndims:10; + # uint16_t pooled:1; + # uint16_t ptrarray:1; // representation is pointer array + # uint16_t isshared:1; // data is shared by multiple Arrays + # uint16_t isaligned:1; // data allocated with memalign + # } jl_array_flags_t; + # + # JL_EXTENSION typedef struct { + # JL_DATA_TYPE + # void *data; + # #ifdef STORE_ARRAY_LEN + # size_t length; + # #endif + # jl_array_flags_t flags; + # uint16_t elsize; + # uint32_t offset; // for 1-d only. does not need to get big. + # size_t nrows; + # union { + # // 1d + # size_t maxsize; + # // Nd + # size_t ncols; + # }; + # // other dim sizes go here for ndims > 2 + # + # // followed by alignment padding and inline data, or owner pointer + # } jl_array_t; + # + # where `STORE_ARRAY_LEN` is a preprocessor directive that is technically a + # "configuration option." AFAICT, `STORE_ARRAY_LEN` is just always defined in + # practice. + # + # The Julia compiler is more than happy to eagerly generate code that accesses + # fields of this data structure directly, so we can't invent our own array data + # structure. Consequently, we will emit code here that carefully constructs + # an instance of `jl_array_t`. + # + # To keep things tidy, we'll construct an array (ironic, I know) that contains the + # values we'll assign to each field of the array. After that, we will generate + # code that fills in every field in one fell swoop. + + fields = [] + + # Compute the size of the element type. 
+ element_type = eltype(array_type) + llvm_element_type = convert(LLVMType, element_type, true) + mod = LLVM.parent(LLVM.parent(position(builder))) + layout = datalayout(mod) + element_size = Csize_t(sizeof(layout, llvm_element_type)) + + # Compute the number of elements in the array. + element_count = LLVM.ConstantInt(convert(LLVMType, Csize_t), 1) + for i in dims + element_count = mul!(builder, element_count, intcast!(builder, i, convert(LLVMType, Csize_t))) + end + + # Compute the size of the array's elements in bytes. + data_bytesize = mul!( + builder, + LLVM.ConstantInt(convert(LLVMType, Csize_t), element_size), + element_count) + + # Actually allocate the array's contents. We will just always + # use a separate buffer. Inline data storage is wasteful and + # harder to implement. + data_ptr = new_bytes!(builder, data_bytesize) + + # The pointer to the array's data is the first field of the struct. + push!(fields, data_ptr) + + # The array's length (i.e., the product of its dimensions) is the + # second field of the `jl_array_t` struct. + push!(fields, element_count) + + # Synthesize a constant that represents the array's flags. + flags = Int16(0) + # Set the 'how' field to one. + flags |= Int16(1) + # Set the 'nDims' field. + flags <<= 10 + flags |= Int16(length(dims)) + # Set the 'pooled' field to `false`. + flags <<= 1 + flags |= Int16(false) + # Set the 'ptrarray' field. + flags <<= 1 + flags |= Int16(isa(llvm_element_type, LLVM.PointerType)) + # Set the 'isshared' field to `false`. + flags <<= 1 + flags |= Int16(false) + # Set the 'isaligned' field to `true`. + flags <<= 1 + flags |= Int16(true) + # Add the flags to the `jl_array_t` struct. + push!(fields, LLVM.ConstantInt(convert(LLVMType, Int16), flags)) + + # Set the 'offset' field to zero (the array is not a slice). + push!(fields, LLVM.ConstantInt(convert(LLVMType, Int16), Int16(0))) + + if length(dims) == 1 + # Set the 'nrows' field to the number of elements. 
+ push!(fields, element_count) + # Ditto for the 'maxsize' field. + push!(fields, element_count) + else + # If we're creating a multi-dimensional array, then the + # process is slightly different. + for i in dims + push!(fields, intcast!(builder, i, convert(LLVMType, Csize_t))) + end + end + + # Synthesize a struct type that neatly represents the data we want + # to store. + struct_type = LLVM.StructType([llvmtype(f) for f in fields]) + + # We now know exactly what data we want to store in each field of the + # array's control structure. + # All that's left is to actually allocate the array and write that data + # to the control structure. + obj_ptr = new_object!( + builder, + ConstantInt(convert(LLVMType, Csize_t), sizeof(layout, struct_type)), + array_type) + struct_ptr = bitcast!( + builder, + addrspacecast!( + builder, + obj_ptr, + LLVM.PointerType(eltype(llvmtype(obj_ptr)))), + LLVM.PointerType(struct_type)) + + for i in 1:length(fields) + val = fields[i] + gep = struct_gep!(builder, struct_ptr, i - 1) + store!(builder, val, gep) + end + + return obj_ptr +end + +# Lowers function calls that pertain to array operations. +function lower_array_calls!(fun::LLVM.Function) + changed_any = false + visit_literal_pointer_calls(fun) do call, name + if name == :jl_alloc_array_1d + args = collect(operands(call))[1:end - 1] + is_ptr, array_type_ptr = to_literal_pointer(args[1]) + if is_ptr + # We can lower array creation calls if we know the type + # of the array to create in advance. + array_type = unsafe_pointer_to_objref(array_type_ptr) + let builder = Builder(JuliaContext()) + position!(builder, call) + new_array = new_array!(builder, array_type, (args[2],)) + replace_uses!(call, new_array) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + end + changed_any = true + end + end + return changed_any +end + # lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible. 
# # this assumes and checks that the TLS is unused, which should be the case for most GPU code From 6c4450f0808a0cb1fe07d122ff12533393fa718f Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 15 Apr 2019 16:57:56 +0200 Subject: [PATCH 092/146] Rename "arrays" benchmark to "static-arrays" --- gc-benchmarks/run-all.jl | 2 +- gc-benchmarks/{arrays.jl => static-arrays.jl} | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) rename gc-benchmarks/{arrays.jl => static-arrays.jl} (89%) diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 71d8f4fc..e6382718 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -2,11 +2,11 @@ using CUDAdrv, CUDAnative, Test include("utils.jl") -include("arrays.jl") include("binary-tree.jl") include("linked-list.jl") include("matrix.jl") include("ssa-opt.jl") +include("static-arrays.jl") include("stream-queries.jl") include("genetic-algorithm.jl") diff --git a/gc-benchmarks/arrays.jl b/gc-benchmarks/static-arrays.jl similarity index 89% rename from gc-benchmarks/arrays.jl rename to gc-benchmarks/static-arrays.jl index 49326cb6..50c6a3ac 100644 --- a/gc-benchmarks/arrays.jl +++ b/gc-benchmarks/static-arrays.jl @@ -2,7 +2,7 @@ module Arrays using CUDAdrv, CUDAnative, StaticArrays -# This benchmark allocates a variety of differently-sized arrays. +# This benchmark allocates a variety of differently-sized static arrays. # The point of this benchmark is to ascertain how well the GC handles # many differently-sized objects. 
@@ -50,4 +50,4 @@ function arrays_benchmark() @cuda_sync threads=Arrays.thread_count Arrays.kernel() end -@cuda_benchmark "arrays" arrays_benchmark() +@cuda_benchmark "static arrays" arrays_benchmark() From d88313e7041a939f6cd75a485096f5462bb5ac05 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 15 Apr 2019 17:45:10 +0200 Subject: [PATCH 093/146] Support arrays in regular @cuda code --- src/compiler/optim.jl | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 3dfae4d3..a788a53b 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -73,10 +73,12 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; int if job.gc add!(pm, FunctionPass("InsertSafepointsGPUGC", fun -> insert_safepoints_gpugc!(fun, entry))) add!(pm, ModulePass("FinalLowerGPUGC", lower_final_gc_intrinsics_gpugc!)) - add!(pm, FunctionPass("LowerArrays", lower_array_calls!)) + add!(pm, FunctionPass("LowerArraysGPUGC", lower_array_calls_gc!)) else add!(pm, ModulePass("FinalLowerNoGC", lower_final_gc_intrinsics_nogc!)) + add!(pm, FunctionPass("LowerArraysNoGC", lower_array_calls_nogc!)) end + aggressive_dce!(pm) # remove dead uses of ptls add!(pm, ModulePass("LowerPTLS", lower_ptls!)) @@ -521,14 +523,14 @@ end # Emits instructions that allocate a particular number of bytes # of GC-managed memory. No headroom is included. No tags are set. -function new_bytes!(builder::LLVM.Builder, size) - call!(builder, Runtime.get(:gc_malloc_object), [size]) +function new_bytes!(builder::LLVM.Builder, malloc, size) + call!(builder, malloc, [size]) end # Emits instructions that allocate bytes for an object, including # headroom for the object's tag. Also fills in the object's tag if # one is provided. 
-function new_object!(builder::LLVM.Builder, size, tag::Union{Type, Nothing} = nothing) +function new_object!(builder::LLVM.Builder, malloc, size, tag::Union{Type, Nothing} = nothing) # We need to reserve a single pointer of headroom for the tag. # (LateLowerGCFrame depends on us doing that.) headroom = Runtime.tag_size @@ -536,7 +538,7 @@ function new_object!(builder::LLVM.Builder, size, tag::Union{Type, Nothing} = no # Call the allocation function and bump the resulting pointer # so the headroom sits just in front of the returned pointer. total_size = add!(builder, size, ConstantInt(Int32(headroom), JuliaContext())) - obj_ptr = new_bytes!(builder, total_size) + obj_ptr = new_bytes!(builder, malloc, total_size) jl_value_t = llvmtype(obj_ptr) T_bitcast = LLVM.PointerType(jl_value_t, LLVM.addrspace(jl_value_t)) @@ -585,7 +587,7 @@ function lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) # so the headroom sits just in front of the returned pointer. let builder = Builder(JuliaContext()) position!(builder, call) - result_ptr = new_object!(builder, size) + result_ptr = new_object!(builder, Runtime.get(:gc_malloc_object), size) replace_uses!(call, result_ptr) unsafe_delete!(LLVM.parent(call), call) dispose(builder) @@ -793,7 +795,7 @@ end # Emits instructions that create a new array. The array's element type # must be statically known. Its dimensions are represented as a tuple # of LLVM IR values. A pointer to the new array is returned. -function new_array!(builder::LLVM.Builder, array_type::Type, dims::Tuple) +function new_array!(builder::LLVM.Builder, malloc, array_type::Type, dims::Tuple) # Since time immemorial, the structure of an array is (quoting from the # Julia source code here): # @@ -871,7 +873,7 @@ function new_array!(builder::LLVM.Builder, array_type::Type, dims::Tuple) # Actually allocate the array's contents. We will just always # use a separate buffer. Inline data storage is wasteful and # harder to implement. 
- data_ptr = new_bytes!(builder, data_bytesize) + data_ptr = new_bytes!(builder, malloc, data_bytesize) # The pointer to the array's data is the first field of the struct. push!(fields, data_ptr) @@ -928,6 +930,7 @@ function new_array!(builder::LLVM.Builder, array_type::Type, dims::Tuple) # to the control structure. obj_ptr = new_object!( builder, + malloc, ConstantInt(convert(LLVMType, Csize_t), sizeof(layout, struct_type)), array_type) struct_ptr = bitcast!( @@ -948,7 +951,7 @@ function new_array!(builder::LLVM.Builder, array_type::Type, dims::Tuple) end # Lowers function calls that pertain to array operations. -function lower_array_calls!(fun::LLVM.Function) +function lower_array_calls!(fun::LLVM.Function, malloc) changed_any = false visit_literal_pointer_calls(fun) do call, name if name == :jl_alloc_array_1d @@ -960,7 +963,7 @@ function lower_array_calls!(fun::LLVM.Function) array_type = unsafe_pointer_to_objref(array_type_ptr) let builder = Builder(JuliaContext()) position!(builder, call) - new_array = new_array!(builder, array_type, (args[2],)) + new_array = new_array!(builder, malloc, array_type, (args[2],)) replace_uses!(call, new_array) unsafe_delete!(LLVM.parent(call), call) dispose(builder) @@ -972,6 +975,14 @@ function lower_array_calls!(fun::LLVM.Function) return changed_any end +function lower_array_calls_gc!(fun::LLVM.Function) + lower_array_calls!(fun, Runtime.get(:gc_malloc_object)) +end + +function lower_array_calls_nogc!(fun::LLVM.Function) + lower_array_calls!(fun, Runtime.get(:gc_pool_alloc)) +end + # lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible. 
# # this assumes and checks that the TLS is unused, which should be the case for most GPU code From ec5290ecb933938b9cbe4d3861ccae59dbe22819 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 15 Apr 2019 17:45:42 +0200 Subject: [PATCH 094/146] Define a new 'arrays' benchmark --- gc-benchmarks/arrays.jl | 51 ++++++++++++++++++++++++++++++ gc-benchmarks/genetic-algorithm.jl | 23 +------------- gc-benchmarks/run-all.jl | 1 + gc-benchmarks/static-arrays.jl | 8 ++--- gc-benchmarks/utils.jl | 26 +++++++++++++++ 5 files changed, 83 insertions(+), 26 deletions(-) create mode 100644 gc-benchmarks/arrays.jl diff --git a/gc-benchmarks/arrays.jl b/gc-benchmarks/arrays.jl new file mode 100644 index 00000000..ac1fd75f --- /dev/null +++ b/gc-benchmarks/arrays.jl @@ -0,0 +1,51 @@ +module Arrays + +using CUDAdrv, CUDAnative +import ..CUDArandom: LinearCongruentialGenerator, next + +# This benchmark allocates a hierarchy of fairly modest Julia arrays. +# Some arrays remain alive, others become unreachable. This benchmark +# seeks to ascertain the performance of the allocator and garbage collector. + +const thread_count = 64 +const insertion_count = 80 + +@noinline function escape(value) + Base.pointer_from_objref(value) + value +end + +function insert(target::Array{Any, 1}, generator::LinearCongruentialGenerator) + while true + index = next(generator, 1, length(target)) + elem = target[index] + if isa(elem, Array{Any, 1}) + if length(elem) > 0 + target = elem + continue + end + end + + target[index] = Any[Any[] for _ in 1:5] + return + end +end + +function kernel() + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + generator = LinearCongruentialGenerator(i) + toplevel = Any[Any[] for _ in 1:10] + for i in 1:insertion_count + insert(toplevel, generator) + end + return +end + +end + +function arrays_benchmark() + # Run the kernel. 
+ @cuda_sync threads=Arrays.thread_count Arrays.kernel() +end + +@cuda_benchmark "arrays" arrays_benchmark() diff --git a/gc-benchmarks/genetic-algorithm.jl b/gc-benchmarks/genetic-algorithm.jl index 69a7fcac..b4e3aa8a 100644 --- a/gc-benchmarks/genetic-algorithm.jl +++ b/gc-benchmarks/genetic-algorithm.jl @@ -6,6 +6,7 @@ module GeneticAlgorithm using CUDAnative, CUDAdrv import ..LinkedList: List, Nil, Cons, foldl, map, max +import ..CUDArandom: LinearCongruentialGenerator, next # A character in our genetic algorithm, based loosely on Fallout's SPECIAL system. mutable struct Character @@ -18,28 +19,6 @@ mutable struct Character luck::Int end -# A linear congruential pseudo-random number generator. -mutable struct LinearCongruentialGenerator - modulus::Int - a::Int - c::Int - state::Int -end - -LinearCongruentialGenerator(seed::Int) = LinearCongruentialGenerator(1 << 32, 1664525, 1013904223, seed) - -# Requests a pseudo-random number. -function next(generator::LinearCongruentialGenerator)::Int - generator.state = (generator.a * generator.state + generator.c) % generator.modulus - generator.state -end - -# Requests a pseudo-random number that is at least as great as `lower` -# and less than `upper`. -function next(generator::LinearCongruentialGenerator, lower::Int, upper::Int)::Int - lower + next(generator) % (upper - lower) -end - # Computes the mean of two integers. 
function mean(a::Int, b::Int)::Int div(a + b, 2) diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index e6382718..1611a509 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -2,6 +2,7 @@ using CUDAdrv, CUDAnative, Test include("utils.jl") +include("arrays.jl") include("binary-tree.jl") include("linked-list.jl") include("matrix.jl") diff --git a/gc-benchmarks/static-arrays.jl b/gc-benchmarks/static-arrays.jl index 50c6a3ac..88fcfa43 100644 --- a/gc-benchmarks/static-arrays.jl +++ b/gc-benchmarks/static-arrays.jl @@ -1,4 +1,4 @@ -module Arrays +module StaticArraysBench using CUDAdrv, CUDAnative, StaticArrays @@ -45,9 +45,9 @@ end end -function arrays_benchmark() +function static_arrays_benchmark() # Run the kernel. - @cuda_sync threads=Arrays.thread_count Arrays.kernel() + @cuda_sync threads=StaticArraysBench.thread_count StaticArraysBench.kernel() end -@cuda_benchmark "static arrays" arrays_benchmark() +@cuda_benchmark "static arrays" static_arrays_benchmark() diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index 5623bb02..aa0df174 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -72,3 +72,29 @@ end function run_benchmarks() BenchmarkTools.run(suite) end + +module CUDArandom + +# A linear congruential pseudo-random number generator. +mutable struct LinearCongruentialGenerator + modulus::Int + a::Int + c::Int + state::Int +end + +LinearCongruentialGenerator(seed::Int) = LinearCongruentialGenerator(1 << 32, 1664525, 1013904223, seed) + +# Requests a pseudo-random number. +function next(generator::LinearCongruentialGenerator)::Int + generator.state = (generator.a * generator.state + generator.c) % generator.modulus + generator.state +end + +# Requests a pseudo-random number that is at least as great as `lower` +# and less than `upper`. 
+function next(generator::LinearCongruentialGenerator, lower::Int, upper::Int)::Int + lower + next(generator) % (upper - lower) +end + +end From 47a52b44b5167e87bee31f773e419d5c5e9c7968 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 15 Apr 2019 17:46:22 +0200 Subject: [PATCH 095/146] Rename "gpu-array" example to "stdlib-array" --- examples/{gpu-array.jl => stdlib-array.jl} | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) rename examples/{gpu-array.jl => stdlib-array.jl} (75%) diff --git a/examples/gpu-array.jl b/examples/stdlib-array.jl similarity index 75% rename from examples/gpu-array.jl rename to examples/stdlib-array.jl index ce97b4cc..b5b17cc2 100644 --- a/examples/gpu-array.jl +++ b/examples/stdlib-array.jl @@ -1,4 +1,4 @@ -using CUDAdrv, CUDAnative, StaticArrays, InteractiveUtils +using CUDAdrv, CUDAnative, StaticArrays # This example allocates an array in a GPU kernel. @@ -12,6 +12,8 @@ end function kernel() array = [1, 2, 3, 4, 5, 6, 7] escape(array) + comp = [i * i for i in array] + escape(comp) return end From 92847f0c26ea936155abfe493b4a6b1fc5a18abb Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 15 Apr 2019 17:58:30 +0200 Subject: [PATCH 096/146] Introduce unreachable objects in array benchmark --- gc-benchmarks/arrays.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gc-benchmarks/arrays.jl b/gc-benchmarks/arrays.jl index ac1fd75f..81def2d4 100644 --- a/gc-benchmarks/arrays.jl +++ b/gc-benchmarks/arrays.jl @@ -21,8 +21,10 @@ function insert(target::Array{Any, 1}, generator::LinearCongruentialGenerator) elem = target[index] if isa(elem, Array{Any, 1}) if length(elem) > 0 - target = elem - continue + if next(generator, 0, 2) == 0 + target = elem + continue + end end end From 844c5578277ba59351a122d8e0ff4198d626d895 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 15 Apr 2019 18:24:37 +0200 Subject: [PATCH 097/146] Define an array reduction benchmark --- gc-benchmarks/array-reduction.jl | 43 
++++++++++++++++++++++++++++++++ gc-benchmarks/arrays.jl | 5 ---- 2 files changed, 43 insertions(+), 5 deletions(-) create mode 100644 gc-benchmarks/array-reduction.jl diff --git a/gc-benchmarks/array-reduction.jl b/gc-benchmarks/array-reduction.jl new file mode 100644 index 00000000..24d4492e --- /dev/null +++ b/gc-benchmarks/array-reduction.jl @@ -0,0 +1,43 @@ +module ArrayReduction + +using CUDAdrv, CUDAnative + +# This benchmark approximates pi by naively constructing an array comprehension +# for the Madhava–Leibniz series and computing its sum. It does this a few times +# to achieve a respectable run time. + +const thread_count = 256 +const series_length = 200 +const runs = 20 + +function iterative_sum(elements::Array{T})::T where T + result = zero(T) + for i in elements + result += i + end + return result +end + +function kernel(destination) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + unsafe_store!(destination, 0.0, i) + for _ in 1:runs + series = [CUDAnative.pow(-1 / 3.0, Float64(k)) / (2.0 * k + 1.0) for k in 0:series_length] + unsafe_store!(destination, unsafe_load(destination, i) + CUDAnative.sqrt(12.0) * iterative_sum(series), i) + end + return +end + +end + +function array_reduction_benchmark() + destination_array = Mem.alloc(Float64, ArrayReduction.thread_count) + destination_pointer = Base.unsafe_convert(CuPtr{Float64}, destination_array) + + # Run the kernel. 
+ @cuda_sync threads=ArrayReduction.thread_count ArrayReduction.kernel(destination_pointer) + + @test Mem.download(Float64, destination_array, ArrayReduction.thread_count) ≈ ArrayReduction.runs .* fill(Float64(pi), ArrayReduction.thread_count) +end + +@cuda_benchmark "array reduction" array_reduction_benchmark() diff --git a/gc-benchmarks/arrays.jl b/gc-benchmarks/arrays.jl index 81def2d4..1f247f6c 100644 --- a/gc-benchmarks/arrays.jl +++ b/gc-benchmarks/arrays.jl @@ -10,11 +10,6 @@ import ..CUDArandom: LinearCongruentialGenerator, next const thread_count = 64 const insertion_count = 80 -@noinline function escape(value) - Base.pointer_from_objref(value) - value -end - function insert(target::Array{Any, 1}, generator::LinearCongruentialGenerator) while true index = next(generator, 1, length(target)) From 8a11408c68796641fd74b3c25c92741bf5b93f5e Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 15 Apr 2019 18:27:46 +0200 Subject: [PATCH 098/146] Include array reduction benchmark in "run-all.jl" --- gc-benchmarks/run-all.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 1611a509..272cdf61 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -2,6 +2,7 @@ using CUDAdrv, CUDAnative, Test include("utils.jl") +include("array-reduction.jl") include("arrays.jl") include("binary-tree.jl") include("linked-list.jl") From 187417485186087cf76d9325881a31b1306c3b24 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 19 Apr 2019 15:18:56 +0200 Subject: [PATCH 099/146] Add a bitvector benchmark --- gc-benchmarks/bitvector.jl | 101 +++++++++++++++++++++++++++++++++++++ gc-benchmarks/run-all.jl | 1 + 2 files changed, 102 insertions(+) create mode 100644 gc-benchmarks/bitvector.jl diff --git a/gc-benchmarks/bitvector.jl b/gc-benchmarks/bitvector.jl new file mode 100644 index 00000000..59892e92 --- /dev/null +++ b/gc-benchmarks/bitvector.jl @@ -0,0 +1,101 @@ +module Bitvector + +import Base: +, *, << 
+using CUDAnative + +# This benchmark performs naive arithmetic on bitvectors. +# The goal of the benchmark is to gauge how GPU-unaware +# standard library code that depends on arrays behaves when +# used in a GPU kernel. + +const thread_count = 256 + +@noinline function escape(value) + Base.pointer_from_objref(value) + value +end + +mutable struct BitInteger{N} + bits::BitVector +end + +function zero(::Type{BitInteger{N}})::BitInteger{N} where N + BitInteger{N}(falses(N)) +end + +function one(::Type{BitInteger{N}})::BitInteger{N} where N + result = falses(N) + result[1] = true + return BitInteger{N}(result) +end + +function +(a::BitInteger{N}, b::BitInteger{N})::BitInteger{N} where N + carry = false + c = falses(N) + for i in 1:N + s = Int(a.bits[i]) + Int(b.bits[i]) + Int(carry) + if s == 1 + carry = false + c[i] = true + elseif s == 2 + carry = true + elseif s == 3 + carry = true + c[i] = true + end + end + return BitInteger{N}(c) +end + +function <<(a::BitInteger{N}, amount::Integer)::BitInteger{N} where N + c = falses(N) + for i in 1:(N - amount) + c[i + amount] = a.bits[i] + end + return BitInteger{N}(c) +end + +function *(a::BitInteger{N}, b::BitInteger{N})::BitInteger{N} where N + c = zero(BitInteger{N}) + for i in 1:N + if a.bits[i] + c += (b << (i - 1)) + end + end + return c +end + +function factorial(::Type{BitInteger{N}}, value::Integer)::BitInteger{N} where N + accumulator = one(BitInteger{N}) + iv = one(BitInteger{N}) + for i in 1:value + accumulator *= iv + iv += one(BitInteger{N}) + end + return accumulator +end + +function to_int(value::BitInteger{N})::Int where N + result = 0 + for i in 1:N + if value.bits[i] + result += (1 << (i - 1)) + end + end + return result +end + +function kernel() + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + factorial(BitInteger{128}, 10) + return +end + +end + +function bitvector_benchmark() + # Run the kernel. 
+ @cuda_sync threads=Bitvector.thread_count Bitvector.kernel() +end + +@cuda_benchmark "bitvector" bitvector_benchmark() diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 272cdf61..7a0ff8f4 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -5,6 +5,7 @@ include("utils.jl") include("array-reduction.jl") include("arrays.jl") include("binary-tree.jl") +include("bitvector.jl") include("linked-list.jl") include("matrix.jl") include("ssa-opt.jl") From de14a7f423b15b7bcdd9e2e49add48fbd38d8864 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 19 Apr 2019 15:35:34 +0200 Subject: [PATCH 100/146] Add a 'malloc' keyword argument to the @cuda macro --- src/compiler/common.jl | 9 +++++++-- src/execution.jl | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/compiler/common.jl b/src/compiler/common.jl index 33232b82..e3281e90 100644 --- a/src/compiler/common.jl +++ b/src/compiler/common.jl @@ -12,6 +12,11 @@ struct CompilerJob maxthreads::Union{Nothing,CuDim} blocks_per_sm::Union{Nothing,Integer} maxregs::Union{Nothing,Integer} + # The name of the 'malloc' function to use when allocating memory. + # A transform will rewrite all calls to 'malloc' to use this function + # instead. The 'malloc' signature must be 'void* malloc(size_t)' or + # compatible. + malloc::String # Indicates whether the GPU GC or the "malloc never free" # GC intrinsic lowering strategy is to be used. 
The former # is used when this field is `true`; the latter when it is @@ -21,8 +26,8 @@ struct CompilerJob CompilerJob(f, tt, cap, kernel; minthreads=nothing, maxthreads=nothing, blocks_per_sm=nothing, maxregs=nothing, - gc=false) = - new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs, gc) + malloc="malloc",gc=false) = + new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs, malloc, gc) end # global job reference diff --git a/src/execution.jl b/src/execution.jl index f8b902e6..cf61cb74 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -9,7 +9,7 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nearest_warpsize # the code it generates, or the execution function split_kwargs(kwargs) macro_kws = [:dynamic, :init] - compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs] + compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :malloc] call_kws = [:cooperative, :blocks, :threads, :shmem, :stream] macro_kwargs = [] compiler_kwargs = [] From 2908cdd6a36e8bcecc93f275e62af64f1ef0be1f Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 19 Apr 2019 15:43:57 +0200 Subject: [PATCH 101/146] Add a pass that rewrites calls to 'malloc' --- src/compiler/optim.jl | 70 ++++++++++++++++++++++++++++++++++++++++++- src/device/runtime.jl | 9 ++++-- 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index a788a53b..0ca0cbfb 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -78,7 +78,7 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; int add!(pm, ModulePass("FinalLowerNoGC", lower_final_gc_intrinsics_nogc!)) add!(pm, FunctionPass("LowerArraysNoGC", lower_array_calls_nogc!)) end - + aggressive_dce!(pm) # remove dead uses of ptls add!(pm, ModulePass("LowerPTLS", lower_ptls!)) @@ -89,6 +89,7 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; int run!(pm, mod) end + 
replace_malloc!(mod, job.malloc) end # PTX-specific optimizations @@ -983,6 +984,73 @@ function lower_array_calls_nogc!(fun::LLVM.Function) lower_array_calls!(fun, Runtime.get(:gc_pool_alloc)) end +# Replaces all uses of a function in a particular module with +# a compatible function. +function replace_function!(mod::LLVM.Module, old_name::String, new_name::String; include_oom_check=false) + if new_name == old_name + # There's nothing to replace if the new function is the same as + # the old function. + return false + end + + # Otherwise, we'll try and find the malloc function. + if !haskey(functions(mod), old_name) + # If the old function doesn't even appear in the module, then it's not in + # use and we can stop right here. + return false + end + + old_function = functions(mod)[old_name] + + if haskey(functions(mod), new_name) + new_function = functions(mod)[new_name] + else + # Create a new function. + new_function = LLVM.Function( + mod, + new_name, + eltype(llvmtype(old_function)::LLVM.PointerType)::LLVM.FunctionType) + end + + if include_oom_check + wrapper = LLVM.Function( + mod, + new_name * "_checked", + eltype(llvmtype(new_function)::LLVM.PointerType)::LLVM.FunctionType) + + Builder(JuliaContext()) do builder + entry = BasicBlock(wrapper, "entry", JuliaContext()) + position!(builder, entry) + + result = call!(builder, new_function, collect(parameters(wrapper))) + check_args = LLVM.Value[result] + append!(check_args, parameters(wrapper)) + call!(builder, Runtime.get(:check_out_of_memory), check_args) + ret!(builder, result) + end + + new_function = wrapper + end + + # Replace all uses of the old function with the new function. + replace_uses!(old_function, new_function) + + return true +end + +# Replaces all uses of the malloc function in a particular module with +# a compatible function with the specified name. 
+function replace_malloc!(mod::LLVM.Module, malloc_name::String) + if malloc_name == "malloc" + # There's nothing to replace if the new malloc is the same as + # the old malloc. + return false + end + + return replace_function!(mod, "malloc", malloc_name) || + replace_function!(mod, "ptx_gc_pool_alloc", malloc_name; include_oom_check=true) +end + # lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible. # # this assumes and checks that the TLS is unused, which should be the case for most GPU code diff --git a/src/device/runtime.jl b/src/device/runtime.jl index 6b7c792e..db8c7a6d 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -129,15 +129,20 @@ end function gc_pool_alloc(sz::Csize_t) ptr = malloc(sz) + check_out_of_memory(ptr, sz) + return unsafe_pointer_to_objref(ptr) +end + +function check_out_of_memory(ptr::Ptr{Cvoid}, sz::Csize_t) if ptr == C_NULL @cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz) throw(OutOfMemoryError()) end - return unsafe_pointer_to_objref(ptr) + return end compile(gc_pool_alloc, Any, (Csize_t,), T_prjlvalue) - +compile(check_out_of_memory, Cvoid, (Ptr{Cvoid}, Csize_t)) ## boxing and unboxing From f6107eb947fc53e3dfb72b9c544272180809ed54 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 19 Apr 2019 17:31:43 +0200 Subject: [PATCH 102/146] Recompile runtime library for different allocators --- src/compiler/driver.jl | 2 +- src/compiler/rtlib.jl | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 70ea33ba..97b87e7a 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -68,7 +68,7 @@ function codegen(target::Symbol, job::CompilerJob; libraries::Bool=true, # preload libraries if libraries libdevice = load_libdevice(job.cap) - runtime = load_runtime(job.cap) + runtime = load_runtime(job.cap, job.malloc) end need_library(lib) = any(f -> isdeclaration(f) && diff --git 
a/src/compiler/rtlib.jl b/src/compiler/rtlib.jl index ad82f984..8293b098 100644 --- a/src/compiler/rtlib.jl +++ b/src/compiler/rtlib.jl @@ -122,30 +122,30 @@ end ## functionality to build the runtime library -function emit_function!(mod, cap, f, types, name) +function emit_function!(mod, cap, f, types, name, malloc) tt = Base.to_tuple_type(types) # Optimize the module that defines the function, but don't # internalize symbols in that function yet: internalizing # globals may de-alias references to globals in the runtime # library from equivalent references in the kernel. - new_mod, entry = codegen(:llvm, CompilerJob(f, tt, cap, #=kernel=# false); + new_mod, entry = codegen(:llvm, CompilerJob(f, tt, cap, #=kernel=# false; malloc=malloc); libraries=false, internalize=false) LLVM.name!(entry, name) link!(mod, new_mod) end -function build_runtime(cap) +function build_runtime(cap, malloc) mod = LLVM.Module("CUDAnative run-time library", JuliaContext()) for method in values(Runtime.methods) - emit_function!(mod, cap, method.def, method.types, method.llvm_name) + emit_function!(mod, cap, method.def, method.types, method.llvm_name, malloc) end mod end -function load_runtime(cap) - name = "cudanative.$(cap.major)$(cap.minor).bc" +function load_runtime(cap, malloc) + name = "cudanative.$(malloc).$(cap.major)$(cap.minor).bc" path = joinpath(@__DIR__, "..", "..", "deps", "runtime", name) mkpath(dirname(path)) @@ -155,8 +155,8 @@ function load_runtime(cap) parse(LLVM.Module, read(io), JuliaContext()) end else - @info "Building the CUDAnative run-time library for your sm_$(cap.major)$(cap.minor) device, this might take a while..." - lib = build_runtime(cap) + @info "Building the CUDAnative run-time library for your sm_$(cap.major)$(cap.minor) device (allocating with '$malloc'), this might take a while..." 
+ lib = build_runtime(cap, malloc) open(path, "w") do io write(io, lib) end From 84ffff5628c6fa5ea2f383901ad1bfdc492bb9cd Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 19 Apr 2019 17:32:20 +0200 Subject: [PATCH 103/146] Use 'gc_malloc' as allocator when @cuda_gc is specified --- src/gc.jl | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 82f5c0e2..00805de7 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -79,6 +79,12 @@ function data_pointer(record::Ptr{FreeListRecord})::Ptr{UInt8} Base.unsafe_convert(Ptr{UInt8}, record) + sizeof(FreeListRecord) end +# Takes a pointer to the first byte of data managed by an allocation record +# and produces a pointer to the record itself. +function record_pointer(data::Ptr{UInt8})::Ptr{FreeListRecord} + Base.unsafe_convert(Ptr{FreeListRecord}, record) - sizeof(FreeListRecord) +end + # Gets a pointer to the first byte of data no longer managed by an allocation record. function data_end_pointer(record::Ptr{FreeListRecord})::Ptr{UInt8} data_pointer(record) + unsafe_load(@get_field_pointer(record, :size)) @@ -753,7 +759,7 @@ function gc_take_any_list_entry( free_list_item = unsafe_load(free_list_ptr) if free_list_item == C_NULL - break + return C_NULL end result = gc_take_list_entry(free_list_ptr, free_list_item, bytesize) @@ -763,7 +769,6 @@ function gc_take_any_list_entry( free_list_ptr = @get_field_pointer(free_list_item, :next)::Ptr{Ptr{FreeListRecord}} end - return C_NULL end # Tries to allocate a chunk of memory from a free list. 
@@ -948,11 +953,17 @@ function gc_transfer_and_malloc( transfer_bytesize::Csize_t, alloc_bytesize::Csize_t)::Ptr{UInt8} - gc_transfer_and_malloc( + result = gc_transfer_and_malloc( from_arena, get_free_list(to_arena), transfer_bytesize, alloc_bytesize) + + writer_locked(get_lock(to_arena)) do + unsafe_store!(@get_field_pointer(to_arena, :can_restock), true) + end + + return result end """ @@ -1439,10 +1450,9 @@ function iterate_allocated(fun::Function, arena::Ptr{FreeListArena}) iterate_allocation_records(fun, allocation_list_head) end -# Iterates through all active allocation records in a GC arena. -function iterate_allocated(fun::Function, arena::Ptr{BodegaArena}) - # Compose a set that contains all data addresses of chunks that - # are on the shelves. +# Composes a set that contains all data addresses of chunks that +# are on the shelves. +function chunks_on_shelves(arena::Ptr{BodegaArena}) arena_data = unsafe_load(arena) chunks_on_shelves = Set{Ptr{UInt8}}() for i in 1:arena_data.shelf_count @@ -1451,11 +1461,17 @@ function iterate_allocated(fun::Function, arena::Ptr{BodegaArena}) push!(chunks_on_shelves, unsafe_load(shelf.chunks, j)) end end + return chunks_on_shelves +end + +# Iterates through all active allocation records in a GC arena. +function iterate_allocated(fun::Function, arena::Ptr{BodegaArena}) + shelf_chunks = chunks_on_shelves(arena) # Now iterate through the allocation list, ignoring records that have # been placed on the shelves. iterate_allocated(get_free_list(arena)) do record - if !(data_pointer(record) in chunks_on_shelves) + if !(data_pointer(record) in shelf_chunks) fun(record) end end @@ -1504,8 +1520,15 @@ end # Frees all dead blocks in an arena. function gc_free_garbage(arena::Ptr{BodegaArena}, live_blocks::Set{Ptr{FreeListRecord}}) + # Mark chunks on shelves as live. 
+ all_live_blocks = Set{Ptr{FreeListRecord}}(live_blocks) + shelf_chunks = chunks_on_shelves(arena) + for chunk_ptr in shelf_chunks + push!(all_live_blocks, record_pointer(chunk_ptr)) + end + # Free garbage in the free list sub-arena. - gc_free_garbage(get_free_list(arena), live_blocks) + gc_free_garbage(get_free_list(arena), all_live_blocks) # Mark the arena as ready for restocking. unsafe_store!(@get_field_pointer(arena, :can_restock), true) @@ -1850,7 +1873,7 @@ macro cuda_gc(ex...) # Standard kernel setup logic. local kernel_args = CUDAnative.cudaconvert.(($(var_exprs...),)) local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} - local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; gc = true, $(map(esc, compiler_kwargs)...)) + local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; gc = true, malloc="ptx_gc_malloc", $(map(esc, compiler_kwargs)...)) CUDAnative.prepare_kernel(kernel; init=kernel_init, $(map(esc, env_kwargs)...)) gc_report.elapsed_time = Base.@elapsed begin kernel(kernel_args...; $(map(esc, call_kwargs)...)) From e14b9b336f5016bed02f49474338c7c5cc482d5e Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 19 Apr 2019 17:32:52 +0200 Subject: [PATCH 104/146] Implement array expansion method --- src/compiler/optim.jl | 50 ++++++--------- src/device/runtime.jl | 146 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 158 insertions(+), 38 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 0ca0cbfb..2ce36d1f 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -871,6 +871,12 @@ function new_array!(builder::LLVM.Builder, malloc, array_type::Type, dims::Tuple LLVM.ConstantInt(convert(LLVMType, Csize_t), element_size), element_count) + if element_size == Csize_t(1) && length(dims) == 1 + # If we're allocating an array of bytes, we will throw in an extra + # byte at the end for compatibility with Julia's ABI. 
+ data_bytesize = add!(builder, data_bytesize, LLVM.ConstantInt(convert(LLVMType, Csize_t), 1)) + end + # Actually allocate the array's contents. We will just always # use a separate buffer. Inline data storage is wasteful and # harder to implement. @@ -905,6 +911,9 @@ function new_array!(builder::LLVM.Builder, malloc, array_type::Type, dims::Tuple # Add the flags to the `jl_array_t` struct. push!(fields, LLVM.ConstantInt(convert(LLVMType, Int16), flags)) + # Set the 'elsize' field. + push!(fields, LLVM.ConstantInt(convert(LLVMType, Int16), Int16(element_size))) + # Set the 'offset' field to zero (the array is not a slice). push!(fields, LLVM.ConstantInt(convert(LLVMType, Int16), Int16(0))) @@ -955,8 +964,8 @@ end function lower_array_calls!(fun::LLVM.Function, malloc) changed_any = false visit_literal_pointer_calls(fun) do call, name + args = collect(operands(call))[1:end - 1] if name == :jl_alloc_array_1d - args = collect(operands(call))[1:end - 1] is_ptr, array_type_ptr = to_literal_pointer(args[1]) if is_ptr # We can lower array creation calls if we know the type @@ -971,6 +980,14 @@ function lower_array_calls!(fun::LLVM.Function, malloc) end end changed_any = true + elseif name == :jl_array_grow_end + let builder = Builder(JuliaContext()) + position!(builder, call) + new_call = call!(builder, Runtime.get(name), args) + replace_uses!(call, new_call) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end end end return changed_any @@ -986,7 +1003,7 @@ end # Replaces all uses of a function in a particular module with # a compatible function. -function replace_function!(mod::LLVM.Module, old_name::String, new_name::String; include_oom_check=false) +function replace_function!(mod::LLVM.Module, old_name::String, new_name::String) if new_name == old_name # There's nothing to replace if the new function is the same as # the old function. 
@@ -1012,26 +1029,6 @@ function replace_function!(mod::LLVM.Module, old_name::String, new_name::String; eltype(llvmtype(old_function)::LLVM.PointerType)::LLVM.FunctionType) end - if include_oom_check - wrapper = LLVM.Function( - mod, - new_name * "_checked", - eltype(llvmtype(new_function)::LLVM.PointerType)::LLVM.FunctionType) - - Builder(JuliaContext()) do builder - entry = BasicBlock(wrapper, "entry", JuliaContext()) - position!(builder, entry) - - result = call!(builder, new_function, collect(parameters(wrapper))) - check_args = LLVM.Value[result] - append!(check_args, parameters(wrapper)) - call!(builder, Runtime.get(:check_out_of_memory), check_args) - ret!(builder, result) - end - - new_function = wrapper - end - # Replace all uses of the old function with the new function. replace_uses!(old_function, new_function) @@ -1041,14 +1038,7 @@ end # Replaces all uses of the malloc function in a particular module with # a compatible function with the specified name. function replace_malloc!(mod::LLVM.Module, malloc_name::String) - if malloc_name == "malloc" - # There's nothing to replace if the new malloc is the same as - # the old malloc. - return false - end - - return replace_function!(mod, "malloc", malloc_name) || - replace_function!(mod, "ptx_gc_pool_alloc", malloc_name; include_oom_check=true) + return replace_function!(mod, "malloc", malloc_name) end # lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible. 
diff --git a/src/device/runtime.jl b/src/device/runtime.jl index db8c7a6d..d5f6f693 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -129,20 +129,14 @@ end function gc_pool_alloc(sz::Csize_t) ptr = malloc(sz) - check_out_of_memory(ptr, sz) - return unsafe_pointer_to_objref(ptr) -end - -function check_out_of_memory(ptr::Ptr{Cvoid}, sz::Csize_t) if ptr == C_NULL @cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz) throw(OutOfMemoryError()) end - return + return unsafe_pointer_to_objref(ptr) end compile(gc_pool_alloc, Any, (Csize_t,), T_prjlvalue) -compile(check_out_of_memory, Cvoid, (Ptr{Cvoid}, Csize_t)) ## boxing and unboxing @@ -230,6 +224,8 @@ for (T, t) in [Int8 => :int8, Int16 => :int16, Int32 => :int32, Int64 => end end +## Garbage collection + # LLVM type of a pointer to a tracked pointer function T_pprjlvalue() T_pjlvalue = convert(LLVMType, Any, true) @@ -237,7 +233,8 @@ function T_pprjlvalue() LLVM.PointerType(eltype(T_pjlvalue), Tracked)) end -# Include the GC memory allocation function into the runtime. +# Include GC memory allocation functions into the runtime. +compile(CUDAnative.gc_malloc, Ptr{UInt8}, (Csize_t,)) compile(CUDAnative.gc_malloc_object, Any, (Csize_t,), T_prjlvalue) # Include GC frame management functions into the runtime. @@ -261,4 +258,137 @@ compile( compile(CUDAnative.gc_safepoint, Cvoid, ()) compile(CUDAnative.gc_perma_safepoint, Cvoid, ()) +## Arrays + +# A data structure that carefully mirrors an in-memory array control +# structure for Julia arrays, as laid out by the compiler. +mutable struct Array1D + # This is the data layout for Julia arrays, which we adhere to here. + # + # JL_EXTENSION typedef struct { + # JL_DATA_TYPE + # void *data; + # #ifdef STORE_ARRAY_LEN + # size_t length; + # #endif + # jl_array_flags_t flags; + # uint16_t elsize; + # uint32_t offset; // for 1-d only. does not need to get big. 
+ # size_t nrows; + # union { + # // 1d + # size_t maxsize; + # // Nd + # size_t ncols; + # }; + # // other dim sizes go here for ndims > 2 + # + # // followed by alignment padding and inline data, or owner pointer + # } jl_array_t; + + data::Ptr{UInt8} + length::Csize_t + flags::UInt16 + elsize::UInt16 + offset::UInt32 + nrows::Csize_t + maxsize::Csize_t +end + +function zero_fill!(ptr::Ptr{UInt8}, count::Integer) + for i in 1:count + unsafe_store!(ptr, UInt8(0), count) + end + return +end + +function memmove!(dst::Ptr{UInt8}, src::Ptr{UInt8}, sz::Integer) + if src < dst + for i in 1:sz + unsafe_store!(dst, unsafe_load(src, i), i) + end + else + for i in sz:-1:1 + unsafe_store!(dst, unsafe_load(src, i), i) + end + end +end + +# Resize the buffer to a max size of `newlen` +# The buffer can either be newly allocated or realloc'd, the return +# value is true if a new buffer is allocated and false if it is realloc'd. +# the caller needs to take care of moving the data from the old buffer +# to the new one if necessary. +# When this function returns, the `.data` pointer always points to +# the **beginning** of the new buffer. +function array_resize_buffer(a::Array1D, newlen::Csize_t)::Bool + elsz = Csize_t(a.elsize) + nbytes = newlen * elsz + oldnbytes = a.maxsize * elsz + + if elsz == 1 + nbytes += 1 + oldnbytes += 1 + end + + # Allocate a new buffer. Note that 'malloc' will get replaced with + # the "right" allocation function for the environment in which this + # function is compiled. So if the GC is enabled, then 'malloc' will + # actually call 'gc_malloc'. 
+ a.data = malloc(nbytes) + zero_fill!(a.data + oldnbytes, nbytes - oldnbytes) + a.maxsize = newlen + return true +end + +function jl_array_grow_at_end(a::Array1D, idx::Csize_t, inc::Csize_t, n::Csize_t) + data = a.data + elsz = Csize_t(a.elsize) + reqmaxsize = a.offset + n + inc + has_gap = n > idx + if reqmaxsize > a.maxsize + nb1 = idx * elsz + nbinc = inc * elsz + + if reqmaxsize < 4 + newmaxsize = Csize_t(4) + elseif reqmaxsize >= a.maxsize * 2 + newmaxsize = reqmaxsize + else + newmaxsize = a.maxsize * 2 + end + + newbuf = array_resize_buffer(a, newmaxsize) + newdata = a.data + a.offset * elsz + if newbuf + memmove!(newdata, data, nb1) + if has_gap + memmove!(newdata + nb1 + nbinc, data + nb1, n * elsz - nb1) + end + elseif has_gap + memmove!(newdata + nb1 + nbinc, newdata + nb1, n * elsz - nb1) + end + a.data = data = newdata + end + + newnrows = n + inc + a.length = newnrows + a.nrows = newnrows + zero_fill!(data + idx * elsz, inc * elsz) + return +end + +function jl_array_grow_end(a::Array1D, inc::Csize_t) + n = a.nrows + jl_array_grow_at_end(a, n, inc, n) + return +end + +compile( + jl_array_grow_end, + Cvoid, + (Array1D, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + end From 68f474727c2f8a3ffbbe5deaa8037db8df4ca322 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 19 Apr 2019 17:33:32 +0200 Subject: [PATCH 105/146] Create an array expansion benchmark --- gc-benchmarks/array-expansion.jl | 46 ++++++++++++++++++++++++++++++++ gc-benchmarks/run-all.jl | 1 + 2 files changed, 47 insertions(+) create mode 100644 gc-benchmarks/array-expansion.jl diff --git a/gc-benchmarks/array-expansion.jl b/gc-benchmarks/array-expansion.jl new file mode 100644 index 00000000..95dc9bb0 --- /dev/null +++ b/gc-benchmarks/array-expansion.jl @@ -0,0 +1,46 @@ +module ArrayExpansion + +using CUDAdrv, CUDAnative + +# This benchmark has every thread create arrays and repeatedly +# append elements to those arrays. 
+ +const thread_count = 256 +const array_length = 200 +const runs = 10 + +function iterative_sum(elements::Array{T})::T where T + result = zero(T) + for i in elements + result += i + end + return result +end + +function kernel(destination) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + result = 0 + for j in 1:runs + array = Int[] + for k in 1:array_length + push!(array, k) + end + result += iterative_sum(array) + end + unsafe_store!(destination, result, i) + return +end + +end + +function array_expansion_benchmark() + destination_array = Mem.alloc(Int, ArrayExpansion.thread_count) + destination_pointer = Base.unsafe_convert(CuPtr{Int}, destination_array) + + # Run the kernel. + @cuda_sync threads=ArrayExpansion.thread_count ArrayExpansion.kernel(destination_pointer) + + @test Mem.download(Int, destination_array, ArrayExpansion.thread_count) == fill(ArrayExpansion.runs * sum(1:ArrayExpansion.array_length), ArrayExpansion.thread_count) +end + +@cuda_benchmark "array expansion" array_expansion_benchmark() diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 7a0ff8f4..c7bb7083 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -2,6 +2,7 @@ using CUDAdrv, CUDAnative, Test include("utils.jl") +include("array-expansion.jl") include("array-reduction.jl") include("arrays.jl") include("binary-tree.jl") From a6b49dc3729e729e42394b53835c016a6f6877a9 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 19 Apr 2019 19:05:50 +0200 Subject: [PATCH 106/146] Introduce a special 'managed_malloc' runtime function --- src/compiler/common.jl | 8 ++++---- src/compiler/optim.jl | 6 +++--- src/device/runtime.jl | 21 ++++++++++++++++----- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/src/compiler/common.jl b/src/compiler/common.jl index e3281e90..cdecb6b1 100644 --- a/src/compiler/common.jl +++ b/src/compiler/common.jl @@ -12,10 +12,10 @@ struct CompilerJob maxthreads::Union{Nothing,CuDim} 
blocks_per_sm::Union{Nothing,Integer} maxregs::Union{Nothing,Integer} - # The name of the 'malloc' function to use when allocating memory. - # A transform will rewrite all calls to 'malloc' to use this function - # instead. The 'malloc' signature must be 'void* malloc(size_t)' or - # compatible. + # The name of the memory allocation function to use when allocating + # managed memory. A transform will rewrite all managed memory allocations + # to use this function instead. The 'malloc' signature must be + # 'void* malloc(size_t)' or compatible. malloc::String # Indicates whether the GPU GC or the "malloc never free" # GC intrinsic lowering strategy is to be used. The former diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 2ce36d1f..d28176a2 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -1035,10 +1035,10 @@ function replace_function!(mod::LLVM.Module, old_name::String, new_name::String) return true end -# Replaces all uses of the malloc function in a particular module with -# a compatible function with the specified name. +# Replaces all uses of the managed memory allocation function in a +# particular module with a compatible function with the specified name. function replace_malloc!(mod::LLVM.Module, malloc_name::String) - return replace_function!(mod, "malloc", malloc_name) + return replace_function!(mod, "ptx_managed_malloc", malloc_name) end # lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible. diff --git a/src/device/runtime.jl b/src/device/runtime.jl index d5f6f693..278aee28 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -127,8 +127,18 @@ function T_prjlvalue() LLVM.PointerType(eltype(T_pjlvalue), Tracked) end +# A function that gets replaced by the proper 'malloc' implementation +# for the context it executes in. 
When the GC is used, calls to this +# function are replaced with 'gc_malloc'; otherwise, this function gets +# rewritten as a call to the allocator, probably 'malloc'. +@noinline function managed_malloc(sz::Csize_t) + malloc(sz) +end + +compile(managed_malloc, Ptr{UInt8}, (Csize_t,)) + function gc_pool_alloc(sz::Csize_t) - ptr = malloc(sz) + ptr = managed_malloc(sz) if ptr == C_NULL @cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz) throw(OutOfMemoryError()) @@ -331,11 +341,12 @@ function array_resize_buffer(a::Array1D, newlen::Csize_t)::Bool oldnbytes += 1 end - # Allocate a new buffer. Note that 'malloc' will get replaced with + # Allocate a new buffer. 'managed_malloc' will get replaced with # the "right" allocation function for the environment in which this - # function is compiled. So if the GC is enabled, then 'malloc' will - # actually call 'gc_malloc'. - a.data = malloc(nbytes) + # function is compiled. So if the GC is enabled, then 'managed_malloc' + # will actually call 'gc_malloc'; otherwise, it's probably going to + # be 'malloc'. + a.data = managed_malloc(nbytes) zero_fill!(a.data + oldnbytes, nbytes - oldnbytes) a.maxsize = newlen return true From 133101f9dbb7cc101ae958b81929ef2cb18a3768 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Sun, 21 Apr 2019 15:08:39 +0200 Subject: [PATCH 107/146] Implement 'managed_malloc' differently --- src/compiler/optim.jl | 4 ++-- src/device/runtime.jl | 27 ++++++++++++++++++++++----- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index d28176a2..7b426152 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -1010,7 +1010,7 @@ function replace_function!(mod::LLVM.Module, old_name::String, new_name::String) return false end - # Otherwise, we'll try and find the malloc function. + # Otherwise, we'll try and find the old function. 
if !haskey(functions(mod), old_name) # If the old function doesn't even appear in the module, then it's not in # use and we can stop right here. @@ -1038,7 +1038,7 @@ end # Replaces all uses of the managed memory allocation function in a # particular module with a compatible function with the specified name. function replace_malloc!(mod::LLVM.Module, malloc_name::String) - return replace_function!(mod, "ptx_managed_malloc", malloc_name) + return replace_function!(mod, "julia.managed_malloc", malloc_name) end # lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible. diff --git a/src/device/runtime.jl b/src/device/runtime.jl index 278aee28..d081686d 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -131,14 +131,31 @@ end # for the context it executes in. When the GC is used, calls to this # function are replaced with 'gc_malloc'; otherwise, this function gets # rewritten as a call to the allocator, probably 'malloc'. -@noinline function managed_malloc(sz::Csize_t) - malloc(sz) -end +@generated function managed_malloc(sz::Csize_t) + T_pint8 = LLVM.PointerType(LLVM.Int8Type(JuliaContext())) + T_size = convert(LLVMType, Csize_t) + T_ptr = convert(LLVMType, Ptr{UInt8}) + + # create function + llvm_f, _ = create_function(T_ptr, [T_size]) + mod = LLVM.parent(llvm_f) + + intr = LLVM.Function(mod, "julia.managed_malloc", LLVM.FunctionType(T_pint8, [T_size])) + + # generate IR + Builder(JuliaContext()) do builder + entry = BasicBlock(llvm_f, "entry", JuliaContext()) + position!(builder, entry) + ptr = call!(builder, intr, [parameters(llvm_f)[1]]) + jlptr = ptrtoint!(builder, ptr, T_ptr) + ret!(builder, jlptr) + end -compile(managed_malloc, Ptr{UInt8}, (Csize_t,)) + call_function(llvm_f, Ptr{UInt8}, Tuple{Csize_t}, :((sz,))) +end function gc_pool_alloc(sz::Csize_t) - ptr = managed_malloc(sz) + ptr = malloc(sz) if ptr == C_NULL @cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz) 
throw(OutOfMemoryError()) From 73018f5a64685a5b9168c719475e15f4d1c32af8 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 25 Apr 2019 11:20:01 +0200 Subject: [PATCH 108/146] Consider custom malloc during IR checking --- src/compiler/validation.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compiler/validation.jl b/src/compiler/validation.jl index e405b8d9..0fbae515 100644 --- a/src/compiler/validation.jl +++ b/src/compiler/validation.jl @@ -118,7 +118,7 @@ function check_ir!(job, errors::Vector{IRError}, inst::LLVM.CallInst) fn = LLVM.name(dest) # detect calls to undefined functions - if isdeclaration(dest) && intrinsic_id(dest) == 0 && !(fn in special_fns) + if isdeclaration(dest) && intrinsic_id(dest) == 0 && !(fn in special_fns) && fn != job.malloc # figure out if the function lives in the Julia runtime library if libjulia[] == C_NULL paths = filter(Libdl.dllist()) do path From d505dad208a6eb0c21683783d86b15eee90fa8d4 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Sun, 5 May 2019 16:18:39 +0200 Subject: [PATCH 109/146] Switch to acquire-release semantics for atomics --- src/device/threading.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/device/threading.jl b/src/device/threading.jl index 846db990..96e58f72 100644 --- a/src/device/threading.jl +++ b/src/device/threading.jl @@ -10,7 +10,7 @@ export ReaderWriterLock, reader_locked, writer_locked, Mutex, try_lock, unlock lt = string(convert(LLVMType, T)) ir = """ %ptr = inttoptr $ptr_type %0 to $lt* - %result = cmpxchg volatile $lt* %ptr, $lt %1, $lt %2 seq_cst seq_cst + %result = cmpxchg volatile $lt* %ptr, $lt %1, $lt %2 acq_rel acquire %rv = extractvalue { $lt, i1 } %result, 0 ret $lt %rv """ @@ -22,7 +22,7 @@ end lt = string(convert(LLVMType, T)) ir = """ %ptr = inttoptr $ptr_type %0 to $lt* - %rv = atomicrmw volatile $(String(op)) $lt* %ptr, $lt %1 seq_cst + %rv = atomicrmw volatile $(String(op)) $lt* %ptr, $lt %1 acq_rel ret $lt %rv """ 
:(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T}, lhs, rhs)) From 1aad738a46f8211465e142330c122de37526a997 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 6 May 2019 16:07:19 +0200 Subject: [PATCH 110/146] Expose GC configuration options --- src/execution.jl | 4 +- src/gc.jl | 134 +++++++++++++++++++++++++++++++---------------- 2 files changed, 90 insertions(+), 48 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index cf61cb74..fc930c7c 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -8,7 +8,7 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nearest_warpsize # split keyword arguments to `@cuda` into ones affecting the macro itself, the compiler and # the code it generates, or the execution function split_kwargs(kwargs) - macro_kws = [:dynamic, :init] + macro_kws = [:dynamic, :init, :gc_config] compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :malloc] call_kws = [:cooperative, :blocks, :threads, :shmem, :stream] macro_kwargs = [] @@ -450,7 +450,7 @@ functionality is included in [`@cuda`](@ref). The 'init' keyword argument is a function that takes a kernel as argument and sets up an environment for the kernel. """ -function prepare_kernel(kernel::AbstractKernel{F,TT}; init::Function=nop_init_kernel) where {F,TT} +function prepare_kernel(kernel::AbstractKernel{F,TT}; init::Function=nop_init_kernel, kw...) where {F,TT} # Just call the 'init' function for now. init(kernel) end diff --git a/src/gc.jl b/src/gc.jl index 00805de7..a116a687 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -43,7 +43,7 @@ # * When the device runs out of GC memory, it requests an interrupt # to mark and sweep. -export @cuda_gc, gc_malloc, gc_malloc_object, gc_collect, gc_safepoint +export @cuda_gc, gc_malloc, gc_malloc_object, gc_collect, gc_safepoint, GCConfiguration import Base: length, show import Printf: @sprintf @@ -1126,30 +1126,6 @@ end # One megabyte. const MiB = 1 << 20 -# The initial size of the GC heap, currently 10 MiB. 
-const initial_gc_heap_size = 10 * MiB - -# The default capacity of a root buffer, i.e., the max number of -# roots that can be stored per thread. Currently set to -# 256 roots. That's 2 KiB of roots per thread. -const default_root_buffer_capacity = 256 - -# The point at which the global arena is deemed to be starving, i.e., -# it no longer contains enough memory to perform basic allocations. -# If the global arena's free byte count stays below the arena starvation -# threshold after a collection phase, the collector will allocate -# additional memory to the arena such that it is no longer starving. -# The arena starvation threshold is currently set to 4 MiB. -const global_arena_starvation_threshold = 4 * MiB - -# The point at which a local arena is deemed to be starving, i.e., -# it no longer contains enough memory to perform basic allocations. -# If a local arena's free byte count stays below the arena starvation -# threshold after a collection phase, the collector will allocate -# additional memory to the arena such that it is no longer starving. -# The arena starvation threshold is currently set to 1 MiB. -const local_arena_starvation_threshold = 1 * MiB - # The point at which a tiny arena is deemed to be starving, i.e., # it no longer contains enough memory to perform basic allocations. # If a tiny arena's free byte count stays below the arena starvation @@ -1178,17 +1154,77 @@ end GCHeapDescription() = GCHeapDescription([]) +# A data structure that contains GC configuration parameters. +struct GCConfiguration + # The number of local arenas to create. + local_arena_count::Int + + # The max number of roots that can be stored per thread. + root_buffer_capacity::Int + + # The point at which the global arena is deemed to be starving, i.e., + # it no longer contains enough memory to perform basic allocations. 
+ # If the global arena's free byte count stays below the arena starvation + # threshold after a collection phase, the collector will allocate + # additional memory to the arena such that it is no longer starving. + global_arena_starvation_threshold::Int + + # The initial size of the global arena, in bytes. + global_arena_initial_size::Int + + # The point at which a local arena is deemed to be starving, i.e., + # it no longer contains enough memory to perform basic allocations. + # If a local arena's free byte count stays below the arena starvation + # threshold after a collection phase, the collector will allocate + # additional memory to the arena such that it is no longer starving. + local_arena_starvation_threshold::Int + + # The initial size of a local arena, in bytes. + local_arena_initial_size::Int +end + +# Creates a GC configuration. +function GCConfiguration(; + local_arena_count::Integer = 8, + root_buffer_capacity::Integer = 256, + global_arena_starvation_threshold::Integer = 4 * MiB, + global_arena_initial_size::Integer = 2 * MiB, + local_arena_starvation_threshold::Integer = 1 * MiB, + local_arena_initial_size::Integer = 1 * MiB) + + GCConfiguration( + local_arena_count, + root_buffer_capacity, + global_arena_starvation_threshold, + global_arena_initial_size, + local_arena_starvation_threshold, + local_arena_initial_size) +end + +function initial_heap_size(config::GCConfiguration, thread_count::Integer) + warp_count = Base.ceil(UInt32, thread_count / CUDAdrv.warpsize(device())) + local_arenas_bytesize = sizeof(Ptr{LocalArena}) * config.local_arena_count + safepoint_bytesize = sizeof(SafepointState) * warp_count + fingerbuf_bytesize = sizeof(Ptr{ObjectRef}) * thread_count + rootbuf_bytesize = sizeof(ObjectRef) * config.root_buffer_capacity * thread_count + + result = 0 + result += local_arenas_bytesize + result += safepoint_bytesize + result += fingerbuf_bytesize + result += rootbuf_bytesize + result += config.local_arena_count * 
config.local_arena_initial_size + result += config.global_arena_initial_size + return result +end + # Initializes a GC heap and produces a master record. function gc_init!( heap::GCHeapDescription, - thread_count::Integer; - warp_count::Union{Integer, Nothing} = nothing, - root_buffer_capacity::Integer = default_root_buffer_capacity, - local_arena_count::Integer = 8)::GCMasterRecord + config::GCConfiguration, + thread_count::Integer)::GCMasterRecord - if warp_count == nothing - warp_count = Base.ceil(UInt32, thread_count / CUDAdrv.warpsize(device())) - end + warp_count = Base.ceil(UInt32, thread_count / CUDAdrv.warpsize(device())) master_region = heap.regions[1] @@ -1196,7 +1232,7 @@ function gc_init!( gc_memory_end_ptr = master_region.start + master_region.size # Allocate a local arena pointer buffer. - local_arenas_bytesize = sizeof(Ptr{LocalArena}) * local_arena_count + local_arenas_bytesize = sizeof(Ptr{LocalArena}) * config.local_arena_count local_arenas_ptr = Base.unsafe_convert(Ptr{Ptr{LocalArena}}, gc_memory_start_ptr) # Allocate the safepoint flag buffer. @@ -1206,12 +1242,12 @@ function gc_init!( # Allocate root buffers. fingerbuf_bytesize = sizeof(Ptr{ObjectRef}) * thread_count fingerbuf_ptr = Base.unsafe_convert(Ptr{Ptr{ObjectRef}}, safepoint_ptr + fingerbuf_bytesize) - rootbuf_bytesize = sizeof(ObjectRef) * root_buffer_capacity * thread_count + rootbuf_bytesize = sizeof(ObjectRef) * config.root_buffer_capacity * thread_count rootbuf_ptr = Base.unsafe_convert(Ptr{ObjectRef}, fingerbuf_ptr + fingerbuf_bytesize) # Populate the root buffer fingers. for i in 1:thread_count - unsafe_store!(fingerbuf_ptr, rootbuf_ptr + (i - 1) * sizeof(ObjectRef) * root_buffer_capacity, i) + unsafe_store!(fingerbuf_ptr, rootbuf_ptr + (i - 1) * sizeof(ObjectRef) * config.root_buffer_capacity, i) end # Compute a pointer to the start of the tiny arena. @@ -1226,10 +1262,10 @@ function gc_init!( end # Set up local arenas. 
- for i in 1:local_arena_count - local_arena = make_gc_arena!(LocalArena, arena_start_ptr, Csize_t(local_arena_starvation_threshold)) + for i in 1:config.local_arena_count + local_arena = make_gc_arena!(LocalArena, arena_start_ptr, Csize_t(config.local_arena_initial_size)) unsafe_store!(local_arenas_ptr, local_arena, i) - arena_start_ptr += local_arena_starvation_threshold + arena_start_ptr += config.local_arena_initial_size end # Set up the global arena. @@ -1238,8 +1274,8 @@ function gc_init!( return GCMasterRecord( warp_count, UInt32(thread_count), - root_buffer_capacity, - UInt32(local_arena_count), + UInt32(config.root_buffer_capacity), + UInt32(config.local_arena_count), arena_for_ants, local_arenas_ptr, global_arena, @@ -1659,7 +1695,7 @@ end # Collects garbage. This function is designed to be called by the host, # not by the device. -function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, report::GCReport) +function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, config::GCConfiguration, report::GCReport) poll_time = Base.@elapsed begin # First off, we have to wait for all warps to reach a safepoint. Clear # safepoint flags and wait for warps to set them again. @@ -1748,9 +1784,9 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, # limit then we'll expand the GC heap and add the additional memory # to the arena's free list. threshold = if arena == master_record.global_arena - global_arena_starvation_threshold + config.global_arena_starvation_threshold else - local_arena_starvation_threshold + config.local_arena_starvation_threshold end if free_memory < threshold @@ -1822,6 +1858,9 @@ macro cuda_gc(ex...) # Get the total number of threads. thread_count = get_kwarg_or_default(call_kwargs, :threads, 1) + # Get the GC configuration. 
+ config = get_kwarg_or_default(env_kwargs, :gc_config, GCConfiguration()) + # convert the arguments, call the compiler and launch the kernel # while keeping the original arguments alive push!(code.args, @@ -1831,11 +1870,14 @@ macro cuda_gc(ex...) local host_interrupt_array = alloc_shared_array((1,), ready) local device_interrupt_buffer = get_shared_device_buffer(host_interrupt_array) + # Evaluate the GC configuration. + local gc_config = $(esc(config)) + # Allocate a shared buffer for GC memory. - local gc_memory_size = initial_gc_heap_size + sizeof(ObjectRef) * default_root_buffer_capacity * $(esc(thread_count)) + local gc_memory_size = initial_heap_size(gc_config, $(esc(thread_count))) local gc_heap = GCHeapDescription() expand!(gc_heap, gc_memory_size) - local master_record = gc_init!(gc_heap, $(esc(thread_count))) + local master_record = gc_init!(gc_heap, gc_config, $(esc(thread_count))) # Define a kernel initialization function. local function kernel_init(kernel) @@ -1866,7 +1908,7 @@ macro cuda_gc(ex...) local gc_report = GCReport() local function handle_interrupt() - gc_collect_impl(master_record, gc_heap, gc_report) + gc_collect_impl(master_record, gc_heap, gc_config, gc_report) end try From 757520451924a20de3f8c98bfc820167462c3e0c Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 10 May 2019 11:04:17 +0200 Subject: [PATCH 111/146] Make genetic algo, ssa opt benchmarks quicker --- gc-benchmarks/genetic-algorithm.jl | 2 +- gc-benchmarks/ssa-opt.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gc-benchmarks/genetic-algorithm.jl b/gc-benchmarks/genetic-algorithm.jl index b4e3aa8a..6484226d 100644 --- a/gc-benchmarks/genetic-algorithm.jl +++ b/gc-benchmarks/genetic-algorithm.jl @@ -131,7 +131,7 @@ function genetic_algo(seed::Int)::Character end # Run the genetic algorithm for a few iterations. 
- for j in 1:10 + for j in 1:2 individuals = step(individuals, generator) end diff --git a/gc-benchmarks/ssa-opt.jl b/gc-benchmarks/ssa-opt.jl index b7c10238..a9a83acd 100644 --- a/gc-benchmarks/ssa-opt.jl +++ b/gc-benchmarks/ssa-opt.jl @@ -86,7 +86,7 @@ end const thread_count = 256 function kernel() - block = create_range_sum_block(50) + block = create_range_sum_block(25) fold_constants(block) return end From fc7273729bbf874cff80f8bd5aa003bbb83e7d7d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 10 May 2019 11:04:58 +0200 Subject: [PATCH 112/146] Try two GC configs when running benchmarks --- gc-benchmarks/run-all.jl | 8 +++++--- gc-benchmarks/utils.jl | 12 ++++++++++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index c7bb7083..10d99f09 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -20,13 +20,15 @@ println(results) # Also write them to a CSV for further analysis. open("results.csv", "w") do file - write(file, "benchmark,nogc,gc,ratio\n") + write(file, "benchmark,nogc,gc,gc-shared,nogc-ratio,gc-ratio,gc-shared-ratio\n") for key in sort([k for k in keys(results)]) runs = results[key] median_times = BenchmarkTools.median(runs) gc_time = median_times["gc"].time / 1e6 + gc_shared_time = median_times["gc-shared"].time / 1e6 nogc_time = median_times["nogc"].time / 1e6 - ratio = gc_time / nogc_time - write(file, "$key,$nogc_time,$gc_time,$ratio\n") + gc_ratio = gc_time / nogc_time + gc_shared_ratio = gc_shared_time / nogc_time + write(file, "$key,$nogc_time,$gc_time,$gc_shared_time,1,$gc_ratio,$gc_shared_ratio\n") end end diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index aa0df174..701ae891 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -42,7 +42,7 @@ end macro cuda_sync(args...) esc(quote if should_use_gc() - CUDAnative.@cuda_gc $(args...) + CUDAnative.@cuda_gc gc_config=gc_config $(args...) else @sync CUDAnative.@cuda $(args...) 
end @@ -55,11 +55,19 @@ function register_cuda_benchmark(f, name, config) suite[name][config] = BenchmarkTools.@benchmarkable $f() setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $f()) teardown=(device_reset!()) evals=1 seconds=90 end +const MiB = 1 << 20 + macro cuda_benchmark(name, ex) esc(quote - suite[$name] = BenchmarkTools.BenchmarkGroup(["gc", "nogc"]) + suite[$name] = BenchmarkTools.BenchmarkGroup(["gc", "gc-shared", "nogc"]) register_cuda_benchmark($name, "gc") do global use_gc = true + global gc_config = GCConfiguration(local_arena_count=8, local_arena_initial_size=MiB, global_arena_initial_size=2 * MiB) + $(ex) + end + register_cuda_benchmark($name, "gc-shared") do + global use_gc = true + global gc_config = GCConfiguration(local_arena_count=0, global_arena_initial_size=10 * MiB) $(ex) end register_cuda_benchmark($name, "nogc") do From 573d580adc5cf511e5041b078de23f0aaedc1773 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 10 May 2019 11:23:20 +0200 Subject: [PATCH 113/146] Fold '@cuda_gc' into '@cuda' --- examples/binary-tree.jl | 12 ++-- examples/gc.jl | 2 +- examples/linked-list.jl | 6 +- examples/matrix.jl | 4 +- examples/stdlib-array.jl | 2 +- gc-benchmarks/utils.jl | 2 +- src/execution.jl | 104 +++++++++++++++++++++++++++++----- src/gc.jl | 117 +-------------------------------------- test/device/gc.jl | 4 +- 9 files changed, 109 insertions(+), 144 deletions(-) diff --git a/examples/binary-tree.jl b/examples/binary-tree.jl index 46db7d38..812af535 100644 --- a/examples/binary-tree.jl +++ b/examples/binary-tree.jl @@ -9,7 +9,7 @@ import Base: haskey, insert! # The main point of this example is to demonstrate that even # naive, pointer-chasing programs can be compiled to GPU kernels. 
-const use_gc = true +const use_gc = false """A binary search tree node.""" abstract type BinarySearchTreeNode{T} end @@ -136,6 +136,8 @@ function kernel(a::CUDAnative.DevicePtr{Int64}, b::CUDAnative.DevicePtr{Int64}) return end +ccall((:ha_init_bytes, "/media/jonathan/Quark/School/CUDAnative.jl/libhalloc"), Cvoid, (Csize_t,), Csize_t(256 * 1024 * 1024)) + # Generate a sequence of 64-bit truncated Fibonacci numbers. number_set = fibonacci(Int64, number_count) # Randomize the sequence's order. @@ -156,18 +158,18 @@ Mem.upload!(destination_array, test_sequence) if use_gc # Run the kernel. - @cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) + @cuda gc=true malloc="_Z8hamallocm" threads=thread_count kernel(source_pointer, destination_pointer) # Run it again. Mem.upload!(destination_array, test_sequence) - stats = @cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) + stats = @cuda gc=true malloc="_Z8hamallocm" threads=thread_count kernel(source_pointer, destination_pointer) else # Run the kernel. - @cuda threads=thread_count kernel(source_pointer, destination_pointer) + @cuda malloc="_Z8hamallocm" threads=thread_count kernel(source_pointer, destination_pointer) # Run it again and time it this time. Mem.upload!(destination_array, test_sequence) - stats = CUDAdrv.@elapsed @cuda threads=thread_count kernel(source_pointer, destination_pointer) + stats = CUDAdrv.@elapsed @cuda malloc="_Z8hamallocm" threads=thread_count kernel(source_pointer, destination_pointer) end println(stats) diff --git a/examples/gc.jl b/examples/gc.jl index 51fe758e..6e81bfb2 100644 --- a/examples/gc.jl +++ b/examples/gc.jl @@ -42,6 +42,6 @@ Mem.upload!(source_array, fill(42.f0, thread_count)) Mem.upload!(destination_array, zeros(Float32, thread_count)) # Run the kernel. 
-@cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) +@cuda gc=true threads=thread_count kernel(source_pointer, destination_pointer) @test Mem.download(Float32, destination_array, thread_count) == fill(42.f0, thread_count) diff --git a/examples/linked-list.jl b/examples/linked-list.jl index 2c7e949c..8e2c7f3a 100644 --- a/examples/linked-list.jl +++ b/examples/linked-list.jl @@ -45,7 +45,7 @@ function sum(list::List{T}) where T reduce(+, list; init=zero(T)) end -const element_count = 1000 +const element_count = 2000 const thread_count = 32 function kernel(elements::CUDAnative.DevicePtr{Int64}, results::CUDAnative.DevicePtr{Int64}) @@ -67,8 +67,8 @@ Mem.upload!(destination_array, zeros(Int64, thread_count)) # Run the kernel. if use_gc - @cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) - stats = @cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) + @cuda gc=true threads=thread_count gc_config=GCConfiguration(; global_arena_initial_size=1024, global_arena_starvation_threshold=1024) kernel(source_pointer, destination_pointer) + stats = @cuda gc=true threads=thread_count kernel(source_pointer, destination_pointer) else @cuda threads=thread_count kernel(source_pointer, destination_pointer) stats = CUDAdrv.@elapsed @cuda threads=thread_count kernel(source_pointer, destination_pointer) diff --git a/examples/matrix.jl b/examples/matrix.jl index 277aacd1..69fa73d8 100644 --- a/examples/matrix.jl +++ b/examples/matrix.jl @@ -121,9 +121,9 @@ destination_array = Mem.alloc(Int64, thread_count) destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) if use_gc - time = @cuda_gc threads=thread_count kernel(destination_pointer) + time = @cuda gc=true threads=thread_count kernel(destination_pointer) println(time) - time = @cuda_gc threads=thread_count kernel(destination_pointer) + time = @cuda gc=true threads=thread_count kernel(destination_pointer) println(time) else time = CUDAdrv.@elapsed @cuda 
threads=thread_count kernel(destination_pointer) diff --git a/examples/stdlib-array.jl b/examples/stdlib-array.jl index b5b17cc2..157a468f 100644 --- a/examples/stdlib-array.jl +++ b/examples/stdlib-array.jl @@ -17,4 +17,4 @@ function kernel() return end -@cuda_gc threads=thread_count kernel() +@cuda gc=true threads=thread_count kernel() diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index 701ae891..73d359ed 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -42,7 +42,7 @@ end macro cuda_sync(args...) esc(quote if should_use_gc() - CUDAnative.@cuda_gc gc_config=gc_config $(args...) + CUDAnative.@cuda gc=true gc_config=gc_config $(args...) else @sync CUDAnative.@cuda $(args...) end diff --git a/src/execution.jl b/src/execution.jl index fc930c7c..4c1337f6 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -9,7 +9,7 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nearest_warpsize # the code it generates, or the execution function split_kwargs(kwargs) macro_kws = [:dynamic, :init, :gc_config] - compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :malloc] + compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :malloc, :gc] call_kws = [:cooperative, :blocks, :threads, :shmem, :stream] macro_kwargs = [] compiler_kwargs = [] @@ -90,6 +90,9 @@ performed, scheduling a kernel launch on the current CUDA context. Several keyword arguments are supported that influence the behavior of `@cuda`. 
- `dynamic`: use dynamic parallelism to launch device-side kernels +- `gc`: set up a GC and use it to allocate memory; cannot be combined with `dynamic` +- `gc_config`: the GC configuration to use if `gc=true`; see [`GCConfiguration`](@ref) +- `malloc`: the name of the allocation function to use, if `gc` is not in use - arguments that influence kernel compilation: see [`cufunction`](@ref) and [`dynamic_cufunction`](@ref) - arguments that influence kernel launch: see [`CUDAnative.HostKernel`](@ref) and @@ -133,21 +136,15 @@ macro cuda(ex...) args = call.args[2:end] code = quote end - macro_kwargs, compiler_kwargs, call_kwargs = split_kwargs(kwargs) + env_kwargs, compiler_kwargs, call_kwargs = split_kwargs(kwargs) vars, var_exprs = assign_args!(code, args) # handle keyword arguments that influence the macro's behavior - dynamic = false - env_kwargs = [] - for kwarg in macro_kwargs - key,val = kwarg.args - if key == :dynamic - isa(val, Bool) || throw(ArgumentError("`dynamic` keyword argument to @cuda should be a constant value")) - dynamic = val::Bool - else - push!(env_kwargs, kwarg) - end - end + dynamic = get_kwarg_or_default(env_kwargs, :dynamic, false) + isa(dynamic, Bool) || throw(ArgumentError("`dynamic` keyword argument to @cuda should be a constant Boolean")) + + gc = get_kwarg_or_default(compiler_kwargs, :gc, false) + isa(gc, Bool) || throw(ArgumentError("`gc` keyword argument to @cuda should be a constant Boolean")) if dynamic # FIXME: we could probably somehow support kwargs with constant values by either @@ -155,6 +152,9 @@ macro cuda(ex...) # IR when processing the dynamic parallelism marker isempty(compiler_kwargs) || error("@cuda dynamic parallelism does not support compiler keyword arguments") + # FIXME: update the GC to support dynamic parallelism somehow. + !gc || error("@cuda does not support both `gc=true` and `dynamic=true`") + # dynamic, device-side kernel launch push!(code.args, quote @@ -164,6 +164,84 @@ macro cuda(ex...) 
prepare_kernel(kernel; $(map(esc, env_kwargs)...)) kernel($(var_exprs...); $(map(esc, call_kwargs)...)) end) + elseif gc + # Find the stream on which the kernel is to be scheduled. + stream = get_kwarg_or_default(call_kwargs, :stream, CuDefaultStream()) + + # Get the total number of threads. + thread_count = get_kwarg_or_default(call_kwargs, :threads, 1) + + # Get the GC configuration. + config = get_kwarg_or_default(env_kwargs, :gc_config, GCConfiguration()) + + # GC-enabled host-side launch. + push!(code.args, + quote + GC.@preserve $(vars...) begin + # Define a trivial buffer that contains the interrupt state. + local host_interrupt_array = alloc_shared_array((1,), ready) + local device_interrupt_buffer = get_shared_device_buffer(host_interrupt_array) + + # Evaluate the GC configuration. + local gc_config = $(esc(config)) + + # Allocate a shared buffer for GC memory. + local gc_memory_size = initial_heap_size(gc_config, $(esc(thread_count))) + local gc_heap = GCHeapDescription() + expand!(gc_heap, gc_memory_size) + local master_record = gc_init!(gc_heap, gc_config, $(esc(thread_count))) + + # Define a kernel initialization function. + local function kernel_init(kernel) + # Set the interrupt state pointer. + try + global_handle = CuGlobal{CuPtr{UInt32}}(kernel.mod, "interrupt_pointer") + set(global_handle, CuPtr{UInt32}(device_interrupt_buffer.ptr)) + catch exception + # The interrupt pointer may not have been declared (because it is unused). + # In that case, we should do nothing. + if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end + + # Set the GC master record. + try + global_handle = CuGlobal{GCMasterRecord}(kernel.mod, "gc_master_record") + set(global_handle, master_record) + catch exception + # The GC info pointer may not have been declared (because it is unused). + # In that case, we should do nothing. 
+ if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end + end + + local gc_report = GCReport() + local function handle_interrupt() + gc_collect_impl(master_record, gc_heap, gc_config, gc_report) + end + + try + # Standard kernel setup logic. + local kernel_args = CUDAnative.cudaconvert.(($(var_exprs...),)) + local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} + local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; malloc="ptx_gc_malloc", $(map(esc, compiler_kwargs)...)) + CUDAnative.prepare_kernel(kernel; init=kernel_init, $(map(esc, env_kwargs)...)) + gc_report.elapsed_time = Base.@elapsed begin + kernel(kernel_args...; $(map(esc, call_kwargs)...)) + + # Handle interrupts. + handle_interrupts(handle_interrupt, pointer(host_interrupt_array, 1), $(esc(stream))) + end + finally + free_shared_array(host_interrupt_array) + free!(gc_heap) + end + gc_report + end + end) else # regular, host-side kernel launch # diff --git a/src/gc.jl b/src/gc.jl index a116a687..2c3763f5 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -43,7 +43,7 @@ # * When the device runs out of GC memory, it requests an interrupt # to mark and sweep. -export @cuda_gc, gc_malloc, gc_malloc_object, gc_collect, gc_safepoint, GCConfiguration +export gc_malloc, gc_malloc_object, gc_collect, gc_safepoint, GCConfiguration import Base: length, show import Printf: @sprintf @@ -1817,118 +1817,3 @@ function get_kwarg_or_default(kwarg_list, key::Symbol, default) end return default end - -""" - @cuda_gc [kwargs...] func(args...) - -High-level interface for executing code on a GPU with GC support. -The `@cuda_gc` macro should prefix a call, with `func` a callable function -or object that should return nothing. It will be compiled to a CUDA function upon first -use, and to a certain extent arguments will be converted and managed automatically using -`cudaconvert`. 
Next, a call to `CUDAdrv.cudacall` is performed, scheduling a kernel -launch on the current CUDA context. Finally, `@cuda_gc` waits for the kernel to finish, -performing garbage collection in the meantime if necessary. - -Several keyword arguments are supported that influence kernel compilation and execution. For -more information, refer to the documentation of respectively [`cufunction`](@ref) and -[`CUDAnative.Kernel`](@ref). -""" -macro cuda_gc(ex...) - # destructure the `@cuda_gc` expression - if length(ex) > 0 && ex[1].head == :tuple - error("The tuple argument to @cuda has been replaced by keywords: `@cuda_gc threads=... fun(args...)`") - end - call = ex[end] - kwargs = ex[1:end-1] - - # destructure the kernel call - if call.head != :call - throw(ArgumentError("second argument to @cuda_gc should be a function call")) - end - f = call.args[1] - args = call.args[2:end] - - code = quote end - env_kwargs, compiler_kwargs, call_kwargs = CUDAnative.split_kwargs(kwargs) - vars, var_exprs = CUDAnative.assign_args!(code, args) - - # Find the stream on which the kernel is to be scheduled. - stream = get_kwarg_or_default(call_kwargs, :stream, CuDefaultStream()) - - # Get the total number of threads. - thread_count = get_kwarg_or_default(call_kwargs, :threads, 1) - - # Get the GC configuration. - config = get_kwarg_or_default(env_kwargs, :gc_config, GCConfiguration()) - - # convert the arguments, call the compiler and launch the kernel - # while keeping the original arguments alive - push!(code.args, - quote - GC.@preserve $(vars...) begin - # Define a trivial buffer that contains the interrupt state. - local host_interrupt_array = alloc_shared_array((1,), ready) - local device_interrupt_buffer = get_shared_device_buffer(host_interrupt_array) - - # Evaluate the GC configuration. - local gc_config = $(esc(config)) - - # Allocate a shared buffer for GC memory. 
- local gc_memory_size = initial_heap_size(gc_config, $(esc(thread_count))) - local gc_heap = GCHeapDescription() - expand!(gc_heap, gc_memory_size) - local master_record = gc_init!(gc_heap, gc_config, $(esc(thread_count))) - - # Define a kernel initialization function. - local function kernel_init(kernel) - # Set the interrupt state pointer. - try - global_handle = CuGlobal{CuPtr{UInt32}}(kernel.mod, "interrupt_pointer") - set(global_handle, CuPtr{UInt32}(device_interrupt_buffer.ptr)) - catch exception - # The interrupt pointer may not have been declared (because it is unused). - # In that case, we should do nothing. - if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code - rethrow() - end - end - - # Set the GC master record. - try - global_handle = CuGlobal{GCMasterRecord}(kernel.mod, "gc_master_record") - set(global_handle, master_record) - catch exception - # The GC info pointer may not have been declared (because it is unused). - # In that case, we should do nothing. - if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code - rethrow() - end - end - end - - local gc_report = GCReport() - local function handle_interrupt() - gc_collect_impl(master_record, gc_heap, gc_config, gc_report) - end - - try - # Standard kernel setup logic. - local kernel_args = CUDAnative.cudaconvert.(($(var_exprs...),)) - local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} - local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; gc = true, malloc="ptx_gc_malloc", $(map(esc, compiler_kwargs)...)) - CUDAnative.prepare_kernel(kernel; init=kernel_init, $(map(esc, env_kwargs)...)) - gc_report.elapsed_time = Base.@elapsed begin - kernel(kernel_args...; $(map(esc, call_kwargs)...)) - - # Handle interrupts. 
- handle_interrupts(handle_interrupt, pointer(host_interrupt_array, 1), $(esc(stream))) - end - finally - free_shared_array(host_interrupt_array) - free!(gc_heap) - end - gc_report - end - end) - return code -end diff --git a/test/device/gc.jl b/test/device/gc.jl index 1ec9b0fc..640d5ebf 100644 --- a/test/device/gc.jl +++ b/test/device/gc.jl @@ -6,7 +6,7 @@ dummy() = return dummy_handler(kernel) = return -@testset "@cuda_gc" begin +@testset "@cuda gc=true" begin @testset "allocate and collect" begin # This test allocates many very small and very large objects. Both the small @@ -60,7 +60,7 @@ dummy_handler(kernel) = return Mem.upload!(destination_array, zeros(Float32, thread_count)) # Run the kernel. - @cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) + @cuda gc=true threads=thread_count kernel(source_pointer, destination_pointer) @test Mem.download(Float32, destination_array, thread_count) == fill(42.f0, thread_count) end From 9ac081d40349f1f577d63355e1e4d397e4b5ca41 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 10 May 2019 12:26:08 +0200 Subject: [PATCH 114/146] Reuse pinned memory support from CUDAdrv --- src/execution.jl | 11 +++++----- src/gc.jl | 24 +++++++-------------- src/interrupts.jl | 53 ++++++----------------------------------------- 3 files changed, 19 insertions(+), 69 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index 4c1337f6..374a232c 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -179,8 +179,9 @@ macro cuda(ex...) quote GC.@preserve $(vars...) begin # Define a trivial buffer that contains the interrupt state. 
- local host_interrupt_array = alloc_shared_array((1,), ready) - local device_interrupt_buffer = get_shared_device_buffer(host_interrupt_array) + local interrupt_buffer = CUDAdrv.Mem.alloc(CUDAdrv.Mem.HostBuffer, sizeof(ready), CUDAdrv.Mem.HOSTALLOC_DEVICEMAP) + unsafe_store!(Base.unsafe_convert(Ptr{UInt32}, interrupt_buffer), ready) + local device_interrupt_pointer = Base.unsafe_convert(CuPtr{UInt32}, interrupt_buffer) # Evaluate the GC configuration. local gc_config = $(esc(config)) @@ -196,7 +197,7 @@ macro cuda(ex...) # Set the interrupt state pointer. try global_handle = CuGlobal{CuPtr{UInt32}}(kernel.mod, "interrupt_pointer") - set(global_handle, CuPtr{UInt32}(device_interrupt_buffer.ptr)) + set(global_handle, device_interrupt_pointer) catch exception # The interrupt pointer may not have been declared (because it is unused). # In that case, we should do nothing. @@ -233,10 +234,10 @@ macro cuda(ex...) kernel(kernel_args...; $(map(esc, call_kwargs)...)) # Handle interrupts. - handle_interrupts(handle_interrupt, pointer(host_interrupt_array, 1), $(esc(stream))) + handle_interrupts(handle_interrupt, pointer(interrupt_buffer), $(esc(stream))) end finally - free_shared_array(host_interrupt_array) + CUDAdrv.Mem.free(interrupt_buffer) free!(gc_heap) end gc_report diff --git a/src/gc.jl b/src/gc.jl index 2c3763f5..7bc4f0b8 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -1135,16 +1135,7 @@ const MiB = 1 << 20 const tiny_arena_starvation_threshold = 0 # 2 * MiB # A description of a region of memory that has been allocated to the GC heap. -struct GCHeapRegion - # A buffer that contains the GC region's bytes. - buffer::Array{UInt8, 1} - # A pointer to the first element in the region. - start::Ptr{UInt8} - # The region's size in bytes. - size::Csize_t -end - -GCHeapRegion(buffer::Array{UInt8, 1}) = GCHeapRegion(buffer, pointer(buffer, 1), Csize_t(length(buffer))) +const GCHeapRegion = CUDAdrv.Mem.HostBuffer # A description of all memory that has been allocated to the GC heap. 
struct GCHeapDescription @@ -1228,8 +1219,8 @@ function gc_init!( master_region = heap.regions[1] - gc_memory_start_ptr = master_region.start - gc_memory_end_ptr = master_region.start + master_region.size + gc_memory_start_ptr = pointer(master_region) + gc_memory_end_ptr = pointer(master_region) + sizeof(master_region) # Allocate a local arena pointer buffer. local_arenas_bytesize = sizeof(Ptr{LocalArena}) * config.local_arena_count @@ -1397,7 +1388,7 @@ end # Tells if a GC heap contains a particular pointer. function contains(heap::GCHeapDescription, pointer::Ptr{T}) where T for region in heap.regions - if pointer >= region.start && pointer < region.start + region.size + if pointer >= pointer(region) && pointer < pointer(region) + sizeof(region) return true end end @@ -1408,8 +1399,7 @@ end # the list of allocated regions. `size` describes the amount of bytes to # allocate. Returns the allocated region. function expand!(heap::GCHeapDescription, size::Integer)::GCHeapRegion - buffer = alloc_shared_array((size,), UInt8(0)) - region = GCHeapRegion(buffer) + region = CUDAdrv.Mem.alloc(CUDAdrv.Mem.HostBuffer, size, CUDAdrv.Mem.HOSTALLOC_DEVICEMAP) push!(heap.regions, region) return region end @@ -1417,7 +1407,7 @@ end # Frees all memory allocated by a GC heap. function free!(heap::GCHeapDescription) for region in heap.regions - free_shared_array(region.buffer) + CUDAdrv.Mem.free(region) end end @@ -1646,7 +1636,7 @@ end # Expands a GC arena by assigning it an additional heap region. 
function gc_expand(arena::Ptr{FreeListArena}, region::GCHeapRegion) - extra_record = make_gc_block!(region.start, region.size) + extra_record = make_gc_block!(pointer(region), sizeof(region)) last_free_list_ptr = @get_field_pointer(arena, :free_list_head) iterate_free(arena) do record last_free_list_ptr = @get_field_pointer(record, :next) diff --git a/src/interrupts.jl b/src/interrupts.jl index de7cc7cb..fb3076dd 100644 --- a/src/interrupts.jl +++ b/src/interrupts.jl @@ -9,48 +9,6 @@ import CUDAdrv: @apicall export @cuda_interruptible, interrupt, interrupt_or_wait, wait_for_interrupt -# Allocates an array of host memory that is page-locked and accessible -# to the device. Maps the allocation into the CUDA address space. -# Returns a host array that can be turned into a device array by calling -# the `get_shared_device_buffer` function. -function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T)::Array{T, N} where {T, N} - # Allocate memory that is accessible to both the host and the device. - bytesize = prod(dims) * sizeof(T) - ptr_ref = Ref{Ptr{Cvoid}}() - @apicall( - :cuMemAllocHost, - (Ptr{Ptr{Cvoid}}, Csize_t), - ptr_ref, bytesize) - - # Wrap the memory in an array. - host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(ptr_ref[]), dims; own = false) - - # Initialize the array's contents. - fill!(host_array, init) - - return host_array -end - -# Gets the device array that corresponds to a shared host array. -# NOTE: this function only works for arrays that were allocated by -# `alloc_shared_array`. It has undefined behavior for all other arrays. -function get_shared_device_buffer(shared_array::Array{T, N})::Mem.Buffer where {T, N} - bytesize = length(shared_array) * sizeof(T) - CUDAdrv.Mem.Buffer( - convert(CuPtr{T}, convert(Csize_t, pointer(shared_array, 1))), - bytesize, - CuCurrentContext()) -end - -# Frees an array of host memory. 
-function free_shared_array(shared_array::Array{T, N}) where {T, N} - ptr = pointer(shared_array, 1) - @apicall( - :cuMemFreeHost, - (Ptr{Cvoid},), - ptr) -end - # Queries a stream for its status. function query_stream(stream::CUDAdrv.CuStream = CuDefaultStream())::Cint return ccall( @@ -255,8 +213,9 @@ macro cuda_interruptible(handler, ex...) quote GC.@preserve $(vars...) begin # Define a trivial buffer that contains the interrupt state. - local host_array = alloc_shared_array((1,), ready) - local device_buffer = get_shared_device_buffer(host_array) + local interrupt_buffer = CUDAdrv.Mem.alloc(CUDAdrv.Mem.HostBuffer, sizeof(ready), CUDAdrv.Mem.HOSTALLOC_DEVICEMAP) + unsafe_store!(Base.unsafe_convert(Ptr{UInt32}, interrupt_buffer), ready) + local device_interrupt_pointer = Base.unsafe_convert(CuPtr{UInt32}, interrupt_buffer) try # Define a kernel initialization function that sets the @@ -264,7 +223,7 @@ macro cuda_interruptible(handler, ex...) local function interrupt_kernel_init(kernel) try global_handle = CuGlobal{CuPtr{UInt32}}(kernel.mod, "interrupt_pointer") - set(global_handle, CuPtr{UInt32}(device_buffer.ptr)) + set(global_handle, device_interrupt_pointer) catch exception # The interrupt pointer may not have been declared (because it is unused). # In that case, we should do nothing. @@ -282,9 +241,9 @@ macro cuda_interruptible(handler, ex...) kernel(kernel_args...; $(map(esc, call_kwargs)...)) # Handle interrupts. 
- handle_interrupts($(esc(handler)), pointer(host_array, 1), $(esc(stream))) + handle_interrupts($(esc(handler)), pointer(interrupt_buffer), $(esc(stream))) finally - free_shared_array(host_array) + CUDAdrv.Mem.free(interrupt_buffer) end end end) From 61818e8b1fd9606d21e6ab91b04faad395806520 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 10 May 2019 15:18:40 +0200 Subject: [PATCH 115/146] Handle multi-dimensional 'thread' args gracefully --- src/execution.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index 8f3816c0..bb66f4c5 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -188,10 +188,10 @@ macro cuda(ex...) local gc_config = $(esc(config)) # Allocate a shared buffer for GC memory. - local gc_memory_size = initial_heap_size(gc_config, $(esc(thread_count))) + local gc_memory_size = initial_heap_size(gc_config, prod($(esc(thread_count)))) local gc_heap = GCHeapDescription() expand!(gc_heap, gc_memory_size) - local master_record = gc_init!(gc_heap, gc_config, $(esc(thread_count))) + local master_record = gc_init!(gc_heap, gc_config, prod($(esc(thread_count)))) # Define a kernel initialization function. 
local function kernel_init(kernel) From 35a3652e19475129c75382747ded9c02099ea5da Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 23 May 2019 11:57:03 +0200 Subject: [PATCH 116/146] Define 'upload!', 'download' benchmark utils --- gc-benchmarks/utils.jl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index 73d359ed..83cfacef 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -106,3 +106,13 @@ function next(generator::LinearCongruentialGenerator, lower::Int, upper::Int)::I end end + +function upload!(destination, source) + Mem.copy!(destination, pointer(source), sizeof(source)) +end + +function download(::Type{T}, source, dims) where T + result = Array{T}(undef, dims) + Mem.copy!(pointer(result), source, sizeof(result)) + result +end From 09edf89d94ac314155096ef2f5f8fa51b9ccd5a7 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 23 May 2019 12:47:49 +0200 Subject: [PATCH 117/146] Implement a bump allocator for kernels --- src/device/runtime.jl | 44 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/device/runtime.jl b/src/device/runtime.jl index df3b3d13..94b1abcd 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -12,6 +12,7 @@ module Runtime using ..CUDAnative using LLVM using LLVM.Interop +using CUDAdrv import ..CUDAnative: GCFrame ## representation of a runtime method instance @@ -286,6 +287,49 @@ compile( compile(CUDAnative.gc_safepoint, Cvoid, ()) compile(CUDAnative.gc_perma_safepoint, Cvoid, ()) +## Bump allocator. + +# Allocates `bytesize` bytes of storage by bumping the global bump +# allocator pointer. 
+function bump_alloc(bytesize::Csize_t)::Ptr{UInt8} + ptr = CUDAnative.@cuda_global_ptr("bump_alloc_ptr", Csize_t) + chunk_address = CUDAnative.atomic_add!(ptr, bytesize) + end_ptr = unsafe_load(CUDAnative.@cuda_global_ptr("bump_alloc_end", Csize_t)) + if chunk_address < end_ptr + return Ptr{UInt8}(chunk_address) + else + return C_NULL + end +end + +compile(bump_alloc, Ptr{UInt8}, (Csize_t,)) + +function maybe_set_global(kernel, name, value::T) where T + try + global_handle = CuGlobal{T}(kernel.mod, name) + set(global_handle, value) + catch exception + # The interrupt pointer may not have been declared (because it is unused). + # In that case, we should do nothing. + if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end +end + +function bump_alloc_init!(kernel, capacity) + buf = Mem.alloc(Mem.DeviceBuffer, capacity) + start_address = pointer(buf) + end_address = start_address + capacity + maybe_set_global(kernel, "bump_alloc_ptr", start_address) + maybe_set_global(kernel, "bump_alloc_end", end_address) + return start_address +end + +function bump_alloc_finalize!(kernel, ptr) + Mem.free(ptr) +end + ## Arrays # A data structure that carefully mirrors an in-memory array control From 60d6fc6de211cedce7ff18ace5b0305d0e4431b7 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 23 May 2019 12:49:34 +0200 Subject: [PATCH 118/146] Add a bump allocator to the GC benchmark configs --- gc-benchmarks/utils.jl | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index 83cfacef..f3db300e 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -1,10 +1,10 @@ import BenchmarkTools -function should_use_gc() +function get_gc_mode() try - return use_gc + return gc_mode catch ex - return true + return "gc" end end @@ -41,8 +41,11 @@ end macro cuda_sync(args...) 
esc(quote - if should_use_gc() + local mode = get_gc_mode() + if mode == "gc" CUDAnative.@cuda gc=true gc_config=gc_config $(args...) + elseif mode == "bump" + @sync CUDAnative.@cuda init=(k -> CUDAnative.Runtime.bump_alloc_init!(k, 60 * MiB)) malloc="ptx_bump_alloc" $(args...) else @sync CUDAnative.@cuda $(args...) end @@ -52,26 +55,30 @@ end suite = BenchmarkTools.BenchmarkGroup() function register_cuda_benchmark(f, name, config) - suite[name][config] = BenchmarkTools.@benchmarkable $f() setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $f()) teardown=(device_reset!()) evals=1 seconds=90 + suite[name][config] = BenchmarkTools.@benchmarkable $f() setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $f()) teardown=(device_reset!()) evals=1 end const MiB = 1 << 20 macro cuda_benchmark(name, ex) esc(quote - suite[$name] = BenchmarkTools.BenchmarkGroup(["gc", "gc-shared", "nogc"]) + suite[$name] = BenchmarkTools.BenchmarkGroup(["gc", "gc-shared", "nogc", "bump"]) register_cuda_benchmark($name, "gc") do - global use_gc = true + global gc_mode = "gc" global gc_config = GCConfiguration(local_arena_count=8, local_arena_initial_size=MiB, global_arena_initial_size=2 * MiB) $(ex) end register_cuda_benchmark($name, "gc-shared") do - global use_gc = true + global gc_mode = "gc" global gc_config = GCConfiguration(local_arena_count=0, global_arena_initial_size=10 * MiB) $(ex) end register_cuda_benchmark($name, "nogc") do - global use_gc = false + global gc_mode = "nogc" + $(ex) + end + register_cuda_benchmark($name, "bump") do + global gc_mode = "bump" $(ex) end end) From 1805d7fa6aeb2025ffa020e8f80675ffdb422d2e Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 23 May 2019 14:13:25 +0200 Subject: [PATCH 119/146] Use 'managed_malloc' to implement 'gc_pool_alloc' --- src/device/runtime.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/device/runtime.jl b/src/device/runtime.jl index 94b1abcd..8df26f9c 100644 --- a/src/device/runtime.jl +++ 
b/src/device/runtime.jl @@ -156,7 +156,7 @@ end end function gc_pool_alloc(sz::Csize_t) - ptr = malloc(sz) + ptr = managed_malloc(sz) if ptr == C_NULL @cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz) throw(OutOfMemoryError()) From d99ed4a87e7b088324dedfdf1e78d0e40e520cd6 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 23 May 2019 14:21:49 +0200 Subject: [PATCH 120/146] Update test runner to write bump allocator results --- gc-benchmarks/run-all.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 10d99f09..5a12b676 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -20,15 +20,17 @@ println(results) # Also write them to a CSV for further analysis. open("results.csv", "w") do file - write(file, "benchmark,nogc,gc,gc-shared,nogc-ratio,gc-ratio,gc-shared-ratio\n") + write(file, "benchmark,nogc,gc,gc-shared,bump,nogc-ratio,gc-ratio,gc-shared-ratio,bump-ratio\n") for key in sort([k for k in keys(results)]) runs = results[key] median_times = BenchmarkTools.median(runs) gc_time = median_times["gc"].time / 1e6 gc_shared_time = median_times["gc-shared"].time / 1e6 nogc_time = median_times["nogc"].time / 1e6 + bump_time = median_times["bump"].time / 1e6 gc_ratio = gc_time / nogc_time gc_shared_ratio = gc_shared_time / nogc_time - write(file, "$key,$nogc_time,$gc_time,$gc_shared_time,1,$gc_ratio,$gc_shared_ratio\n") + bump_ratio = bump_time / nogc_time + write(file, "$key,$nogc_time,$gc_time,$gc_shared_time,$bump_time,1,$gc_ratio,$gc_shared_ratio,$bump_ratio\n") end end From c1356b1a16e35a668a58562622210505b87ff140 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 23 May 2019 14:54:50 +0200 Subject: [PATCH 121/146] Change how bump allocators are initialized --- gc-benchmarks/utils.jl | 9 ++++++++- src/device/runtime.jl | 14 +++----------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/gc-benchmarks/utils.jl 
b/gc-benchmarks/utils.jl index f3db300e..eca5b0cf 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -45,7 +45,14 @@ macro cuda_sync(args...) if mode == "gc" CUDAnative.@cuda gc=true gc_config=gc_config $(args...) elseif mode == "bump" - @sync CUDAnative.@cuda init=(k -> CUDAnative.Runtime.bump_alloc_init!(k, 60 * MiB)) malloc="ptx_bump_alloc" $(args...) + local capacity = 60 * MiB + local buf = Mem.alloc(Mem.DeviceBuffer, capacity) + local start_address = pointer(buf) + local function init(kernel) + CUDAnative.Runtime.bump_alloc_init!(kernel, start_address, capacity) + end + @sync CUDAnative.@cuda init=init malloc="ptx_bump_alloc" $(args...) + Mem.free(buf) else @sync CUDAnative.@cuda $(args...) end diff --git a/src/device/runtime.jl b/src/device/runtime.jl index 8df26f9c..62899ab1 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -317,17 +317,9 @@ function maybe_set_global(kernel, name, value::T) where T end end -function bump_alloc_init!(kernel, capacity) - buf = Mem.alloc(Mem.DeviceBuffer, capacity) - start_address = pointer(buf) - end_address = start_address + capacity - maybe_set_global(kernel, "bump_alloc_ptr", start_address) - maybe_set_global(kernel, "bump_alloc_end", end_address) - return start_address -end - -function bump_alloc_finalize!(kernel, ptr) - Mem.free(ptr) +function bump_alloc_init!(kernel, buffer_start, buffer_size) + maybe_set_global(kernel, "bump_alloc_ptr", buffer_start) + maybe_set_global(kernel, "bump_alloc_end", buffer_start + buffer_size) end ## Arrays From 2cdaf68eee52f86fc0d6e51110643934c46abe2c Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 6 Jun 2019 13:54:19 +0200 Subject: [PATCH 122/146] Implement jl_array_sizehint --- src/compiler/optim.jl | 2 +- src/device/runtime.jl | 37 +++++++++++++++++++++++++++++++++++-- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index d7691907..0f42a2af 100644 --- a/src/compiler/optim.jl +++ 
b/src/compiler/optim.jl @@ -959,7 +959,7 @@ function lower_array_calls!(fun::LLVM.Function, malloc) end end changed_any = true - elseif name == :jl_array_grow_end + elseif name in [:jl_array_grow_end, :jl_array_sizehint] let builder = Builder(JuliaContext()) position!(builder, call) new_call = call!(builder, Runtime.get(name), args) diff --git a/src/device/runtime.jl b/src/device/runtime.jl index 62899ab1..8f289e6d 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -406,7 +406,12 @@ function array_resize_buffer(a::Array1D, newlen::Csize_t)::Bool return true end -function jl_array_grow_at_end(a::Array1D, idx::Csize_t, inc::Csize_t, n::Csize_t) +""" + jl_array_grow_at(a, idx, inc, n) + +Grows array `a` containing `n` elements by `inc` elements at index `idx`. +""" +function jl_array_grow_at(a::Array1D, idx::Csize_t, inc::Csize_t, n::Csize_t) data = a.data elsz = Csize_t(a.elsize) reqmaxsize = a.offset + n + inc @@ -445,7 +450,7 @@ end function jl_array_grow_end(a::Array1D, inc::Csize_t) n = a.nrows - jl_array_grow_at_end(a, n, inc, n) + jl_array_grow_at(a, n, inc, n) return end @@ -456,4 +461,32 @@ compile( () -> convert(LLVMType, Cvoid), () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) +""" + jl_array_sizehint(a, sz) + +Suggest that collection `a` reserve capacity for at least `sz` elements. 
+""" +function jl_array_sizehint(a::Array1D, sz::Csize_t) + n = a.length + data = a.data + elsz = Csize_t(a.elsize) + reqmaxsize = a.offset + sz + if reqmaxsize > a.maxsize + newbuf = array_resize_buffer(a, reqmaxsize) + newdata = a.data + a.offset * elsz + if newbuf + memmove!(newdata, data, n * elsz) + end + a.data = data = newdata + end + return +end + +compile( + jl_array_sizehint, + Cvoid, + (Array1D, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + end From fc809757e502992aff5dbe47327c84513b0fb6c9 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 6 Jun 2019 14:36:57 +0200 Subject: [PATCH 123/146] Implement jl_array_grow_at --- src/compiler/optim.jl | 2 +- src/device/runtime.jl | 37 ++++++++++++++++++++++++++++++------- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 0f42a2af..788f3912 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -959,7 +959,7 @@ function lower_array_calls!(fun::LLVM.Function, malloc) end end changed_any = true - elseif name in [:jl_array_grow_end, :jl_array_sizehint] + elseif name in [:jl_array_grow_end, :jl_array_grow_at, :jl_array_sizehint] let builder = Builder(JuliaContext()) position!(builder, call) new_call = call!(builder, Runtime.get(name), args) diff --git a/src/device/runtime.jl b/src/device/runtime.jl index 8f289e6d..f0130968 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -407,19 +407,18 @@ function array_resize_buffer(a::Array1D, newlen::Csize_t)::Bool end """ - jl_array_grow_at(a, idx, inc, n) + jl_array_grow_at_impl(a, idx, inc, n) Grows array `a` containing `n` elements by `inc` elements at index `idx`. 
""" -function jl_array_grow_at(a::Array1D, idx::Csize_t, inc::Csize_t, n::Csize_t) +function jl_array_grow_at_impl(a::Array1D, idx::Csize_t, inc::Csize_t, n::Csize_t) data = a.data elsz = Csize_t(a.elsize) reqmaxsize = a.offset + n + inc has_gap = n > idx + nb1 = idx * elsz + nbinc = inc * elsz if reqmaxsize > a.maxsize - nb1 = idx * elsz - nbinc = inc * elsz - if reqmaxsize < 4 newmaxsize = Csize_t(4) elseif reqmaxsize >= a.maxsize * 2 @@ -439,18 +438,42 @@ function jl_array_grow_at(a::Array1D, idx::Csize_t, inc::Csize_t, n::Csize_t) memmove!(newdata + nb1 + nbinc, newdata + nb1, n * elsz - nb1) end a.data = data = newdata + elseif has_gap + memmove!(data + nb1 + nbinc, data + nb1, n * elsz - nb1) end newnrows = n + inc a.length = newnrows a.nrows = newnrows - zero_fill!(data + idx * elsz, inc * elsz) + zero_fill!(data + nb1, nbinc) + return +end + +""" + jl_array_grow_at(a, idx, inc) + +Grows array `a` by `inc` elements at index `idx`. +""" +function jl_array_grow_at(a::Array1D, idx::Cssize_t, inc::Csize_t) + jl_array_grow_at_impl(a, Csize_t(idx), inc, a.nrows) return end +compile( + jl_array_grow_at, + Cvoid, + (Array1D, Cssize_t, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Cssize_t), convert(LLVMType, Csize_t)]) + +""" + jl_array_grow_end(a, inc) + +Grows array `a` by `inc` elements at the end. 
+""" function jl_array_grow_end(a::Array1D, inc::Csize_t) n = a.nrows - jl_array_grow_at(a, n, inc, n) + jl_array_grow_at_impl(a, n, inc, n) return end From f6d7b83422faf65e9f16dbe25c584608f883595a Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 6 Jun 2019 14:49:30 +0200 Subject: [PATCH 124/146] Implement 'jl_array_grow_beg' --- src/compiler/optim.jl | 2 +- src/device/runtime.jl | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 788f3912..db230eae 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -959,7 +959,7 @@ function lower_array_calls!(fun::LLVM.Function, malloc) end end changed_any = true - elseif name in [:jl_array_grow_end, :jl_array_grow_at, :jl_array_sizehint] + elseif name in [:jl_array_grow_at, :jl_array_grow_beg, :jl_array_grow_end, :jl_array_sizehint] let builder = Builder(JuliaContext()) position!(builder, call) new_call = call!(builder, Runtime.get(name), args) diff --git a/src/device/runtime.jl b/src/device/runtime.jl index f0130968..4a167c2d 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -367,7 +367,7 @@ function zero_fill!(ptr::Ptr{UInt8}, count::Integer) end function memmove!(dst::Ptr{UInt8}, src::Ptr{UInt8}, sz::Integer) - if src < dst + if dst < src for i in 1:sz unsafe_store!(dst, unsafe_load(src, i), i) end @@ -376,6 +376,7 @@ function memmove!(dst::Ptr{UInt8}, src::Ptr{UInt8}, sz::Integer) unsafe_store!(dst, unsafe_load(src, i), i) end end + return end # Resize the buffer to a max size of `newlen` @@ -484,6 +485,23 @@ compile( () -> convert(LLVMType, Cvoid), () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) +""" + jl_array_grow_beg(a, inc) + +Grows array `a` by `inc` elements at the beginning of the array. 
+""" +function jl_array_grow_beg(a::Array1D, inc::Csize_t) + jl_array_grow_at_impl(a, Csize_t(0), inc, a.nrows) + return +end + +compile( + jl_array_grow_beg, + Cvoid, + (Array1D, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + """ jl_array_sizehint(a, sz) From 272e77e8b6e6f906a2982c42e68c5a6709f974c9 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 6 Jun 2019 15:09:44 +0200 Subject: [PATCH 125/146] Implement array deletion methods --- src/compiler/optim.jl | 11 +++++++- src/device/runtime.jl | 58 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index db230eae..69aa9174 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -942,6 +942,15 @@ end # Lowers function calls that pertain to array operations. function lower_array_calls!(fun::LLVM.Function, malloc) changed_any = false + runtime_methods = [ + :jl_array_grow_at, + :jl_array_grow_beg, + :jl_array_grow_end, + :jl_array_del_at, + :jl_array_del_beg, + :jl_array_del_end, + :jl_array_sizehint + ] visit_literal_pointer_calls(fun) do call, name args = collect(operands(call))[1:end - 1] if name == :jl_alloc_array_1d @@ -959,7 +968,7 @@ function lower_array_calls!(fun::LLVM.Function, malloc) end end changed_any = true - elseif name in [:jl_array_grow_at, :jl_array_grow_beg, :jl_array_grow_end, :jl_array_sizehint] + elseif name in runtime_methods let builder = Builder(JuliaContext()) position!(builder, call) new_call = call!(builder, Runtime.get(name), args) diff --git a/src/device/runtime.jl b/src/device/runtime.jl index 4a167c2d..a1a3f4ff 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -530,4 +530,62 @@ compile( () -> convert(LLVMType, Cvoid), () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) +""" + jl_array_del_at_impl(a, idx, dec, n) + +Removes a range of elements from array `a`. 
+""" +function jl_array_del_at_impl(a::Array1D, idx::Csize_t, dec::Csize_t, n::Csize_t) + data = a.data + elsz = a.elsize + last = idx + dec + if n > last + memmove!(data + idx * elsz, data + last * elsz, (n - last) * elsz) + end + n -= dec + if elsz == 1 + Base.unsafe_store!(data, n + 1, UInt8(0)) + end + a.nrows = n + a.length = n + return +end + +function jl_array_del_beg(a::Array1D, dec::Csize_t) + jl_array_del_at_impl(a, Csize_t(0), dec, a.nrows) + return +end + +compile( + jl_array_del_beg, + Cvoid, + (Array1D, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + +function jl_array_del_end(a::Array1D, dec::Csize_t) + n = a.nrows + jl_array_del_at_impl(a, n, dec, n) + return +end + +compile( + jl_array_del_end, + Cvoid, + (Array1D, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + +function jl_array_del_at(a::Array1D, idx::Cssize_t, dec::Csize_t) + jl_array_del_at_impl(a, Csize_t(idx), dec, a.nrows) + return +end + +compile( + jl_array_del_at, + Cvoid, + (Array1D, Cssize_t, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Cssize_t), convert(LLVMType, Csize_t)]) + end From 23fa152201e6f0b3efe9122f82d903dd8d236acd Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 6 Jun 2019 15:10:22 +0200 Subject: [PATCH 126/146] Create an array feature-testing benchmark --- gc-benchmarks/array-features.jl | 77 +++++++++++++++++++++++++++++++++ gc-benchmarks/run-all.jl | 1 + 2 files changed, 78 insertions(+) create mode 100644 gc-benchmarks/array-features.jl diff --git a/gc-benchmarks/array-features.jl b/gc-benchmarks/array-features.jl new file mode 100644 index 00000000..078a0f8d --- /dev/null +++ b/gc-benchmarks/array-features.jl @@ -0,0 +1,77 @@ +module ArrayFeatures + +using CUDAdrv, CUDAnative + +# This benchmark has every thread exercise the entire low-level +# array API. 
+ +const thread_count = 256 + +# Creates an array of Fibonacci numbers. +function fib_array(count::Integer) + result = [1, 1] + # Calls `jl_array_sizehint`. + sizehint!(result, count + 2) + for i in 1:count + # Calls `jl_array_grow_end`. + push!(result, result[i] + result[i + 1]) + end + return result +end + +function intersperse_with!(vec::Vector{T}, value::T) where T + for i in 1:length(vec) + # Calls `jl_array_grow_at`. + insert!(vec, i * 2, value) + end + return vec +end + +function manipulate_array() + # Initialize the array as a Fibonacci sequence. + arr = fib_array(20) + + # Intersperse the array with constants. + intersperse_with!(arr, 2) + + # Prepend a constant to the array (calls `jl_array_grow_beg`). + pushfirst!(arr, 2) + + # Intersperse again. + intersperse_with!(arr, 4) + + # Delete the first element (calls `jl_array_del_beg`). + popfirst!(arr) + + # Delete the last element (calls `jl_array_del_end`). + pop!(arr) + + # Delete some other element (calls `jl_array_del_at`). + deleteat!(arr, 8) + + result = 0 + for i in arr + result += i + end + return result +end + +function kernel(destination) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + unsafe_store!(destination, manipulate_array(), i) + return +end + +end + +function array_features_benchmark() + destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int) * ArrayFeatures.thread_count) + destination_pointer = Base.unsafe_convert(CuPtr{Int}, destination_array) + + # Run the kernel. 
+ @cuda_sync threads=ArrayFeatures.thread_count ArrayFeatures.kernel(destination_pointer) + + @test download(Int, destination_array, ArrayFeatures.thread_count) == fill(ArrayFeatures.manipulate_array(), ArrayFeatures.thread_count) +end + +@cuda_benchmark "array features" array_features_benchmark() diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 5a12b676..97e3582c 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -3,6 +3,7 @@ using CUDAdrv, CUDAnative, Test include("utils.jl") include("array-expansion.jl") +include("array-features.jl") include("array-reduction.jl") include("arrays.jl") include("binary-tree.jl") From 69a9dd5e4b19b9f3766fecd596af34f7fa71b98d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 6 Jun 2019 15:13:34 +0200 Subject: [PATCH 127/146] Tweak a comment --- gc-benchmarks/array-features.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gc-benchmarks/array-features.jl b/gc-benchmarks/array-features.jl index 078a0f8d..62441dc2 100644 --- a/gc-benchmarks/array-features.jl +++ b/gc-benchmarks/array-features.jl @@ -2,7 +2,7 @@ module ArrayFeatures using CUDAdrv, CUDAnative -# This benchmark has every thread exercise the entire low-level +# This benchmark has every thread exercise the core low-level # array API. const thread_count = 256 From 5a939f8f1f92a2825368b7fdb9a00d6052d66667 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 7 Jun 2019 18:48:14 +0200 Subject: [PATCH 128/146] Implement jl_alloc_array_2d and jl_alloc_array_3d --- gc-benchmarks/array-features.jl | 21 ++++++++++++++++----- src/compiler/optim.jl | 9 +++++++-- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/gc-benchmarks/array-features.jl b/gc-benchmarks/array-features.jl index 62441dc2..d4c1dc31 100644 --- a/gc-benchmarks/array-features.jl +++ b/gc-benchmarks/array-features.jl @@ -9,6 +9,7 @@ const thread_count = 256 # Creates an array of Fibonacci numbers. 
function fib_array(count::Integer) + # Calls `jl_alloc_array_1d`. result = [1, 1] # Calls `jl_array_sizehint`. sizehint!(result, count + 2) @@ -27,6 +28,14 @@ function intersperse_with!(vec::Vector{T}, value::T) where T return vec end +function iterative_sum(array) + result = 0 + for i in array + result += i + end + return result +end + function manipulate_array() # Initialize the array as a Fibonacci sequence. arr = fib_array(20) @@ -49,11 +58,13 @@ function manipulate_array() # Delete some other element (calls `jl_array_del_at`). deleteat!(arr, 8) - result = 0 - for i in arr - result += i - end - return result + # Create a two-dimensional array (calls `jl_alloc_array_2d`). + arr_2d = fill(2, (2, 2)) + + # Create a three-dimensional array (calls `jl_alloc_array_3d`). + arr_3d = fill(2, (2, 2, 2)) + + return iterative_sum(arr) + iterative_sum(arr_2d) + iterative_sum(arr_3d) end function kernel(destination) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 69aa9174..d212d69f 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -942,6 +942,11 @@ end # Lowers function calls that pertain to array operations. 
function lower_array_calls!(fun::LLVM.Function, malloc) changed_any = false + alloc_methods = [ + :jl_alloc_array_1d, + :jl_alloc_array_2d, + :jl_alloc_array_3d + ] runtime_methods = [ :jl_array_grow_at, :jl_array_grow_beg, @@ -953,7 +958,7 @@ function lower_array_calls!(fun::LLVM.Function, malloc) ] visit_literal_pointer_calls(fun) do call, name args = collect(operands(call))[1:end - 1] - if name == :jl_alloc_array_1d + if name in alloc_methods is_ptr, array_type_ptr = to_literal_pointer(args[1]) if is_ptr # We can lower array creation calls if we know the type @@ -961,7 +966,7 @@ function lower_array_calls!(fun::LLVM.Function, malloc) array_type = unsafe_pointer_to_objref(array_type_ptr) let builder = Builder(JuliaContext()) position!(builder, call) - new_array = new_array!(builder, malloc, array_type, (args[2],)) + new_array = new_array!(builder, malloc, array_type, Tuple(args[2:end])) replace_uses!(call, new_array) unsafe_delete!(LLVM.parent(call), call) dispose(builder) From 338277234fa7b5b1f3b636583ffe0c717bace48e Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 7 Jun 2019 18:56:16 +0200 Subject: [PATCH 129/146] Better document array functions --- src/device/runtime.jl | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/src/device/runtime.jl b/src/device/runtime.jl index a1a3f4ff..dbba689a 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -410,7 +410,8 @@ end """ jl_array_grow_at_impl(a, idx, inc, n) -Grows array `a` containing `n` elements by `inc` elements at index `idx`. +Grows one-dimensional array `a` containing `n` elements by `inc` elements at +zero-based index `idx`. """ function jl_array_grow_at_impl(a::Array1D, idx::Csize_t, inc::Csize_t, n::Csize_t) data = a.data @@ -453,7 +454,7 @@ end """ jl_array_grow_at(a, idx, inc) -Grows array `a` by `inc` elements at index `idx`. +Grows one-dimensional array `a` by `inc` elements at zero-based index `idx`. 
""" function jl_array_grow_at(a::Array1D, idx::Cssize_t, inc::Csize_t) jl_array_grow_at_impl(a, Csize_t(idx), inc, a.nrows) @@ -470,7 +471,7 @@ compile( """ jl_array_grow_end(a, inc) -Grows array `a` by `inc` elements at the end. +Grows one-dimensional array `a` by `inc` elements at the end. """ function jl_array_grow_end(a::Array1D, inc::Csize_t) n = a.nrows @@ -488,7 +489,7 @@ compile( """ jl_array_grow_beg(a, inc) -Grows array `a` by `inc` elements at the beginning of the array. +Grows one-dimensional array `a` by `inc` elements at the beginning of the array. """ function jl_array_grow_beg(a::Array1D, inc::Csize_t) jl_array_grow_at_impl(a, Csize_t(0), inc, a.nrows) @@ -505,7 +506,7 @@ compile( """ jl_array_sizehint(a, sz) -Suggest that collection `a` reserve capacity for at least `sz` elements. +Suggest that one-dimensional array `a` reserve capacity for at least `sz` elements. """ function jl_array_sizehint(a::Array1D, sz::Csize_t) n = a.length @@ -533,7 +534,8 @@ compile( """ jl_array_del_at_impl(a, idx, dec, n) -Removes a range of elements from array `a`. +Removes `dec` elements from one-dimensional array `a`, starting at zero-based index `idx`. +`n` is the number of elements in `a`. """ function jl_array_del_at_impl(a::Array1D, idx::Csize_t, dec::Csize_t, n::Csize_t) data = a.data @@ -551,6 +553,11 @@ function jl_array_del_at_impl(a::Array1D, idx::Csize_t, dec::Csize_t, n::Csize_t return end +""" + jl_array_del_beg(a, dec) + +Removes `dec` elements from the beginning of one-dimensional array `a`. +""" function jl_array_del_beg(a::Array1D, dec::Csize_t) jl_array_del_at_impl(a, Csize_t(0), dec, a.nrows) return @@ -563,6 +570,11 @@ compile( () -> convert(LLVMType, Cvoid), () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) +""" + jl_array_del_end(a, dec) + +Removes `dec` elements from the end of one-dimensional array `a`. 
+""" function jl_array_del_end(a::Array1D, dec::Csize_t) n = a.nrows jl_array_del_at_impl(a, n, dec, n) @@ -576,6 +588,12 @@ compile( () -> convert(LLVMType, Cvoid), () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + +""" + jl_array_del_at(a, idx, dec) + +Removes `dec` elements from one-dimensional array `a`, starting at zero-based index `idx`. +""" function jl_array_del_at(a::Array1D, idx::Cssize_t, dec::Csize_t) jl_array_del_at_impl(a, Csize_t(idx), dec, a.nrows) return From 194265986811c8fa966091b553dc1722c99c3ae2 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 10 Jun 2019 12:47:32 +0200 Subject: [PATCH 130/146] Implement jl_new_array --- gc-benchmarks/array-features.jl | 5 +++- src/compiler/optim.jl | 47 +++++++++++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/gc-benchmarks/array-features.jl b/gc-benchmarks/array-features.jl index d4c1dc31..317e4aab 100644 --- a/gc-benchmarks/array-features.jl +++ b/gc-benchmarks/array-features.jl @@ -64,7 +64,10 @@ function manipulate_array() # Create a three-dimensional array (calls `jl_alloc_array_3d`). arr_3d = fill(2, (2, 2, 2)) - return iterative_sum(arr) + iterative_sum(arr_2d) + iterative_sum(arr_3d) + # Create a four-dimensional array (calls `jl_new_array`). + arr_4d = fill(2, (2, 2, 2, 2)) + + return iterative_sum(arr) + iterative_sum(arr_2d) + iterative_sum(arr_3d) + iterative_sum(arr_4d) end function kernel(destination) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index d212d69f..b58a2ec7 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -939,13 +939,49 @@ function new_array!(builder::LLVM.Builder, malloc, array_type::Type, dims::Tuple return obj_ptr end +# Generates code that extracts array dimensions from a tuple argument. +function extract_array_dims!(builder, ::Type{Array{T, N}}, dims_tuple) where {T, N} + # First cast the tuple value to a size_t pointer in address space zero. 
+ tuple_as_size_t = bitcast!( + builder, + addrspacecast!( + builder, + dims_tuple, + LLVM.PointerType(eltype(llvmtype(dims_tuple)))), + LLVM.PointerType(convert(LLVMType, Csize_t))) + + is_literal, ptr = to_literal_pointer(tuple_as_size_t) + + results = [] + if is_literal + # If the tuple is implemented as a literal pointer, then we want to load its elements + # ahead of time; the device won't be able to access host-allocated constants. + for i in 1:N + value = Base.unsafe_load(Base.unsafe_convert(Ptr{Csize_t}, ptr), i) + push!(results, LLVM.ConstantInt(convert(LLVMType, Csize_t), value)) + end + else + # Otherwise, generate code that loads fields from the tuple. + for i in 1:N + address = gep!( + builder, + tuple_as_size_t, + [LLVM.ConstantInt(convert(LLVMType, Int32), i)]) + + push!(results, load!(builder, address)) + end + end + return Tuple(results) +end + # Lowers function calls that pertain to array operations. function lower_array_calls!(fun::LLVM.Function, malloc) changed_any = false alloc_methods = [ :jl_alloc_array_1d, :jl_alloc_array_2d, - :jl_alloc_array_3d + :jl_alloc_array_3d, + :jl_new_array ] runtime_methods = [ :jl_array_grow_at, @@ -966,7 +1002,14 @@ function lower_array_calls!(fun::LLVM.Function, malloc) array_type = unsafe_pointer_to_objref(array_type_ptr) let builder = Builder(JuliaContext()) position!(builder, call) - new_array = new_array!(builder, malloc, array_type, Tuple(args[2:end])) + if name == :jl_new_array + # jl_new_array requires special treatment. All the other ones are + # pretty simple to handle. 
+ dim_args = extract_array_dims!(builder, array_type, args[2]) + else + dim_args = Tuple(args[2:end]) + end + new_array = new_array!(builder, malloc, array_type, dim_args) replace_uses!(call, new_array) unsafe_delete!(LLVM.parent(call), call) dispose(builder) From 8612466ead3455c5aa60db6acd4a4731339f8d73 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 10 Jun 2019 14:09:12 +0200 Subject: [PATCH 131/146] Implement jl_ptr_to_array{,_1d} --- gc-benchmarks/array-features.jl | 14 ++++++++++++- src/compiler/optim.jl | 35 +++++++++++++++++++++++++++++---- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/gc-benchmarks/array-features.jl b/gc-benchmarks/array-features.jl index 317e4aab..9f8cde52 100644 --- a/gc-benchmarks/array-features.jl +++ b/gc-benchmarks/array-features.jl @@ -67,7 +67,19 @@ function manipulate_array() # Create a four-dimensional array (calls `jl_new_array`). arr_4d = fill(2, (2, 2, 2, 2)) - return iterative_sum(arr) + iterative_sum(arr_2d) + iterative_sum(arr_3d) + iterative_sum(arr_4d) + # Create an alias for the Fibonacci array (this is dangerous, but we + # know what we're doing here; calls `jl_ptr_to_array_1d`). + alias = unsafe_wrap(Array, pointer(arr), length(arr)) + + # Create an alias for `arr_2d` (calls `jl_ptr_to_array`). + alias_2d = unsafe_wrap(Array, pointer(arr_2d), size(arr_2d)) + + return iterative_sum(arr) + + iterative_sum(arr_2d) + + iterative_sum(arr_3d) + + iterative_sum(arr_4d) + + iterative_sum(alias) + + iterative_sum(alias_2d) end function kernel(destination) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index b58a2ec7..bacaac0d 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -775,7 +775,7 @@ end # Emits instructions that create a new array. The array's element type # must be statically known. Its dimensions are represented as a tuple # of LLVM IR values. A pointer to the new array is returned. 
-function new_array!(builder::LLVM.Builder, malloc, array_type::Type, dims::Tuple) +function new_array!(builder::LLVM.Builder, malloc, array_type::Type, dims::Tuple; data_ptr::Union{Nothing,LLVM.Value} = nothing) # Since time immemorial, the structure of an array is (quoting from the # Julia source code here): # @@ -859,7 +859,9 @@ function new_array!(builder::LLVM.Builder, malloc, array_type::Type, dims::Tuple # Actually allocate the array's contents. We will just always # use a separate buffer. Inline data storage is wasteful and # harder to implement. - data_ptr = new_bytes!(builder, malloc, data_bytesize) + if data_ptr == nothing + data_ptr = new_bytes!(builder, malloc, data_bytesize) + end # The pointer to the array's data is the first field of the struct. push!(fields, data_ptr) @@ -966,7 +968,7 @@ function extract_array_dims!(builder, ::Type{Array{T, N}}, dims_tuple) where {T, address = gep!( builder, tuple_as_size_t, - [LLVM.ConstantInt(convert(LLVMType, Int32), i)]) + [LLVM.ConstantInt(convert(LLVMType, Int32), i - 1)]) push!(results, load!(builder, address)) end @@ -983,6 +985,10 @@ function lower_array_calls!(fun::LLVM.Function, malloc) :jl_alloc_array_3d, :jl_new_array ] + wrap_methods = [ + :jl_ptr_to_array, + :jl_ptr_to_array_1d + ] runtime_methods = [ :jl_array_grow_at, :jl_array_grow_beg, @@ -1014,8 +1020,28 @@ function lower_array_calls!(fun::LLVM.Function, malloc) unsafe_delete!(LLVM.parent(call), call) dispose(builder) end + changed_any = true + end + elseif name in wrap_methods + is_ptr, array_type_ptr = to_literal_pointer(args[1]) + if is_ptr + # We can lower array wrapping calls if we know the type + # of the array to create in advance. 
+ array_type = unsafe_pointer_to_objref(array_type_ptr) + let builder = Builder(JuliaContext()) + position!(builder, call) + if name == :jl_ptr_to_array + dim_args = extract_array_dims!(builder, array_type, args[3]) + else + dim_args = (args[3],) + end + new_array = new_array!(builder, malloc, array_type, dim_args; data_ptr=args[2]) + replace_uses!(call, new_array) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + changed_any = true end - changed_any = true elseif name in runtime_methods let builder = Builder(JuliaContext()) position!(builder, call) @@ -1024,6 +1050,7 @@ function lower_array_calls!(fun::LLVM.Function, malloc) unsafe_delete!(LLVM.parent(call), call) dispose(builder) end + changed_any = true end end return changed_any From 952a645cda8c532e6fa453804e0fe36ba6d16c42 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 11 Jun 2019 16:59:19 +0200 Subject: [PATCH 132/146] Compare GC strategies when running benchmarks --- gc-benchmarks/run-all.jl | 16 +++++++++++++- gc-benchmarks/utils.jl | 47 ++++++++++++++++++++++++++++++++-------- 2 files changed, 53 insertions(+), 10 deletions(-) diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 97e3582c..46449f85 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -19,8 +19,10 @@ results = run_benchmarks() # Print the results to the terminal. println(results) +gc_tags = [t for t in benchmark_tags if startswith(t, "gc")] + # Also write them to a CSV for further analysis. 
-open("results.csv", "w") do file +open("strategies.csv", "w") do file write(file, "benchmark,nogc,gc,gc-shared,bump,nogc-ratio,gc-ratio,gc-shared-ratio,bump-ratio\n") for key in sort([k for k in keys(results)]) runs = results[key] @@ -35,3 +37,15 @@ open("results.csv", "w") do file write(file, "$key,$nogc_time,$gc_time,$gc_shared_time,$bump_time,1,$gc_ratio,$gc_shared_ratio,$bump_ratio\n") end end + +open("gc-heap-sizes.csv", "w") do file + ratio_tags = [t * "-ratio" for t in gc_tags] + write(file, "benchmark,$(join(gc_tags, ',')),$(join(ratio_tags, ','))\n") + for key in sort([k for k in keys(results)]) + runs = results[key] + median_times = BenchmarkTools.median(runs) + times = [median_times[t].time / 1e6 for t in gc_tags] + normalized_times = [median_times[t].time / median_times["gc"].time for t in gc_tags] + write(file, "$key,$(join(times, ',')),$(join(normalized_times, ','))\n") + end +end diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index eca5b0cf..822954a3 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -67,19 +67,48 @@ end const MiB = 1 << 20 +benchmark_tags = [ + "gc", "gc-shared", + "gc-30mb", "gc-shared-30mb", + "gc-15mb", "gc-shared-15mb", + "gc-7.5mb", "gc-shared-7.5mb", + "gc-3.75mb", "gc-shared-3.75mb", + "nogc", "bump" +] + macro cuda_benchmark(name, ex) esc(quote - suite[$name] = BenchmarkTools.BenchmarkGroup(["gc", "gc-shared", "nogc", "bump"]) - register_cuda_benchmark($name, "gc") do - global gc_mode = "gc" - global gc_config = GCConfiguration(local_arena_count=8, local_arena_initial_size=MiB, global_arena_initial_size=2 * MiB) - $(ex) + local function register_gc(config, heap_size) + register_cuda_benchmark($name, config) do + global gc_mode = "gc" + global gc_config = GCConfiguration(local_arena_count=0, global_arena_initial_size=heap_size) + $(ex) + end end - register_cuda_benchmark($name, "gc-shared") do - global gc_mode = "gc" - global gc_config = GCConfiguration(local_arena_count=0, 
global_arena_initial_size=10 * MiB)
-        $(ex)
+    local function register_gc_shared(config, heap_size)
+        register_cuda_benchmark($name, config) do
+            global gc_mode = "gc"
+            local local_arena_initial_size = div(heap_size, 10)
+            local global_arena_initial_size = heap_size - 8 * local_arena_initial_size
+            global gc_config = GCConfiguration(
+                local_arena_count=8,
+                local_arena_initial_size=local_arena_initial_size,
+                global_arena_initial_size=global_arena_initial_size)
+            $(ex)
+        end
     end
+
+    suite[$name] = BenchmarkTools.BenchmarkGroup(benchmark_tags)
+    register_gc("gc", 60 * MiB)
+    register_gc_shared("gc-shared", 60 * MiB)
+    register_gc("gc-30mb", 30 * MiB)
+    register_gc_shared("gc-shared-30mb", 30 * MiB)
+    register_gc("gc-15mb", 15 * MiB)
+    register_gc_shared("gc-shared-15mb", 15 * MiB)
+    register_gc("gc-7.5mb", div(15 * MiB, 2))
+    register_gc_shared("gc-shared-7.5mb", div(15 * MiB, 2))
+    register_gc("gc-3.75mb", div(15 * MiB, 4))
+    register_gc_shared("gc-shared-3.75mb", div(15 * MiB, 4))
     register_cuda_benchmark($name, "nogc") do
         global gc_mode = "nogc"
         $(ex)

From be276cf62b9a9e8bcbd02c319a63cf7ee1bb96ce Mon Sep 17 00:00:00 2001
From: jonathanvdc
Date: Tue, 11 Jun 2019 17:14:57 +0200
Subject: [PATCH 133/146] Tweak array-features benchmark

---
 gc-benchmarks/array-features.jl | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/gc-benchmarks/array-features.jl b/gc-benchmarks/array-features.jl
index 9f8cde52..c27a876a 100644
--- a/gc-benchmarks/array-features.jl
+++ b/gc-benchmarks/array-features.jl
@@ -74,17 +74,26 @@ function manipulate_array()
     # Create an alias for `arr_2d` (calls `jl_ptr_to_array`).
     alias_2d = unsafe_wrap(Array, pointer(arr_2d), size(arr_2d))
 
+    # Create an array that is similar to `arr_3d` and fill it with constants.
+    # This does not call any new low-level functions, but it does illustrate
+    # that high-level functions such as `similar` and `fill!` are fully functional.
+ arr_3d_sim = similar(arr_3d) + fill!(arr_3d_sim, 10) + return iterative_sum(arr) + iterative_sum(arr_2d) + iterative_sum(arr_3d) + iterative_sum(arr_4d) + iterative_sum(alias) + - iterative_sum(alias_2d) + iterative_sum(alias_2d) + + iterative_sum(arr_3d_sim) end function kernel(destination) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - unsafe_store!(destination, manipulate_array(), i) + for j in 1:3 + unsafe_store!(destination, manipulate_array(), i) + end return end From 85766d526ed761cad5631c3ca3277cc4cad442c4 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 11 Jun 2019 17:32:29 +0200 Subject: [PATCH 134/146] Update optim.jl to use stock Julia --- src/compiler/optim.jl | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index bacaac0d..976b700a 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -19,7 +19,7 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) # # NOTE: we need to use multiple distinct pass managers to force pass ordering; # intrinsics should never get lowered before Julia has optimized them. - if VERSION < v"1.2.0-DEV.375" + if VERSION < v"1.3.0-DEV.390" # with older versions of Julia, intrinsics are lowered unconditionally so we need to # replace them with GPU-compatible counterparts before anything else. 
that breaks # certain optimizations though: https://github.com/JuliaGPU/CUDAnative.jl/issues/340 @@ -44,7 +44,8 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) initialize!(pm) ccall(:jl_add_optimization_passes, Cvoid, (LLVM.API.LLVMPassManagerRef, Cint, Cint), - LLVM.ref(pm), Base.JLOptions().opt_level, #=lower_intrinsics=# 1) + LLVM.ref(pm), Base.JLOptions().opt_level, #=lower_intrinsics=# 0) + ccall(:LLVMExtraAddLateLowerGCFramePass, Cvoid, (LLVM.API.LLVMPassManagerRef,), LLVM.ref(pm)) run!(pm, mod) end @@ -62,11 +63,6 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) aggressive_dce!(pm) # remove dead uses of ptls add!(pm, ModulePass("LowerPTLS", lower_ptls!)) - # the Julia GC lowering pass also has some clean-up that is required - if VERSION >= v"1.2.0-DEV.531" - late_lower_gc_frame!(pm) - end - run!(pm, mod) end replace_malloc!(mod, job.malloc) From 2e640f5361a7de0a81a51656856dd6b936675b35 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 11 Jun 2019 17:34:47 +0200 Subject: [PATCH 135/146] Fix misnomer in utils.jl --- gc-benchmarks/utils.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index 822954a3..6ceca63d 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -78,14 +78,14 @@ benchmark_tags = [ macro cuda_benchmark(name, ex) esc(quote - local function register_gc(config, heap_size) + local function register_gc_shared(config, heap_size) register_cuda_benchmark($name, config) do global gc_mode = "gc" global gc_config = GCConfiguration(local_arena_count=0, global_arena_initial_size=heap_size) $(ex) end end - local function register_gc_shared(config, heap_size) + local function register_gc(config, heap_size) register_cuda_benchmark($name, config) do global gc_mode = "gc" local local_arena_initial_size = div(heap_size, 10) From bb7b44026e3bc5dcdab07f290309b5739cfee951 Mon Sep 17 00:00:00 2001 From: jonathanvdc 
Date: Tue, 11 Jun 2019 17:43:09 +0200 Subject: [PATCH 136/146] Include mean in gc-heap-sizes.csv --- gc-benchmarks/run-all.jl | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 46449f85..eb17da9b 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -1,4 +1,4 @@ -using CUDAdrv, CUDAnative, Test +using CUDAdrv, CUDAnative, Test, Statistics include("utils.jl") @@ -41,11 +41,20 @@ end open("gc-heap-sizes.csv", "w") do file ratio_tags = [t * "-ratio" for t in gc_tags] write(file, "benchmark,$(join(gc_tags, ',')),$(join(ratio_tags, ','))\n") + all_times = [[] for t in gc_tags] + all_normalized_times = [[] for t in gc_tags] for key in sort([k for k in keys(results)]) runs = results[key] median_times = BenchmarkTools.median(runs) times = [median_times[t].time / 1e6 for t in gc_tags] + for (l, val) in zip(all_times, times) + push!(l, val) + end normalized_times = [median_times[t].time / median_times["gc"].time for t in gc_tags] + for (l, val) in zip(all_normalized_times, normalized_times) + push!(l, val) + end write(file, "$key,$(join(times, ',')),$(join(normalized_times, ','))\n") end + write(file, "mean,$(join(map(mean, all_times), ',')),$(join(map(mean, all_normalized_times), ','))\n") end From be1692c14de80f246ad0a9bbb06008a382b23153 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 11 Jun 2019 17:48:44 +0200 Subject: [PATCH 137/146] Remove experimental allocator implementations --- src/gc.jl | 627 ------------------------------------------------------ 1 file changed, 627 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index f329c8e9..1fce27c4 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -107,42 +107,6 @@ end # Gets a free list arena's lock. get_lock(arena::Ptr{FreeListArena}) = ReaderWriterLock(@get_field_pointer(arena, :lock_state)) -# A data structure that describes a ScatterAlloc superblock. Every -# superblock is prefixed by one of these. 
-struct ScatterAllocSuperblock - # The number of regions in the superblock. - region_count::UInt32 - - # The number of pages in a region managed by this superblock. - pages_per_region::UInt32 - - # The size of a page in the superblock, in bytes. This size - # does not include the page's header. - page_size::UInt32 - - # A pointer to the next superblock. - next::Ptr{ScatterAllocSuperblock} -end - -# A region in a ScatterAlloc superblock. -struct ScatterAllocRegion - # The number of pages in this region that are full. - full_page_count::Int64 -end - -# A page in a ScatterAlloc region. -struct ScatterAllocPage - # The size of a chunk in this page. - chunk_size::Int64 - - # The number of allocated blocks in this page. - allocated_chunk_count::Int64 - - # A bitmask that describes which chunks have been allocated - # and which chunks are still free. - occupancy::Int64 -end - const gc_align = Csize_t(16) # Aligns a pointer to an alignment boundary. @@ -170,17 +134,6 @@ function align_upward(offset::T, alignment::Csize_t = gc_align)::T where T <: In convert(T, Csize_t(align_upward(convert(Ptr{UInt8}, Csize_t(offset)), alignment))) end -# Gets the page size in a superblock. This size does not include -# the page header. -function page_size(superblock::Ptr{ScatterAllocSuperblock}) - unsafe_load(@get_field_pointer(superblock, :page_size)) -end - -# Gets the number of pages per region in a superblock. -function pages_per_region(superblock::Ptr{ScatterAllocSuperblock}) - unsafe_load(@get_field_pointer(superblock, :pages_per_region)) -end - # Gets the size of an aligned header, including padding to satisfy # alignment requirements. @generated function header_size(::Type{T}, ::Val{alignment} = Val(gc_align))::UInt32 where {T, alignment} @@ -188,107 +141,6 @@ end :($result) end -# Gets the total number of chunks in a particular page. 
-function chunk_count(page::Ptr{ScatterAllocPage}, superblock::Ptr{ScatterAllocSuperblock}) - chunk_size = unsafe_load(@get_field_pointer(page, :chunk_size)) - div(page_size(superblock), chunk_size) -end - -# Gets the address of a particular chunk in a page. `index` is zero-based. -function chunk_address(page::Ptr{ScatterAllocPage}, index::Integer)::Ptr{UInt8} - chunk_size = unsafe_load(@get_field_pointer(page, :chunk_size)) - Base.unsafe_convert(Ptr{UInt8}, page + header_size(ScatterAllocPage) + chunk_size * index) -end - -# Gets the address of a particular page in a region. `index` is zero-based. -function page_address(region::Ptr{ScatterAllocRegion}, superblock::Ptr{ScatterAllocSuperblock}, index::Integer)::Ptr{ScatterAllocPage} - Base.unsafe_convert( - Ptr{ScatterAllocPage}, - region + header_size(ScatterAllocRegion) + index * (header_size(ScatterAllocPage) + page_size(superblock))) -end - -# Gets the total size in bytes of a region, including overhead. -function region_bytesize(pages_per_region::Integer, page_size::Integer) - region_data_size = pages_per_region * (header_size(ScatterAllocPage) + page_size) - header_size(ScatterAllocRegion) + region_data_size -end - -# Gets the address of a particular region in a superblock. `index` is zero-based. -function region_address(superblock::Ptr{ScatterAllocSuperblock}, index::Integer)::Ptr{ScatterAllocRegion} - Base.unsafe_convert( - Ptr{ScatterAllocPage}, - superblock + header_size(ScatterAllocSuperblock) + index * region_bytesize(pages_per_region(superblock), page_size(superblock))) -end - -# A GC arena that uses the ScatterAlloc algorithm for allocations. -struct ScatterAllocArena - # A pointer to the first superblock managed by this arena. - first_superblock::Ptr{ScatterAllocSuperblock} -end - -# A "shelf" in a bodega arena. See `BodegaArena` for more info on -# how shelves work. -struct BodegaShelf - # The size of the chunks on this shelf. - chunk_size::Csize_t - - # The maximal number of chunks on this shelf. 
- capacity::Int64 - - # An index into the shelf that points to the first free - # chunk. This is a zero-based index. - chunk_finger::Int64 - - # A pointer to an array of pointers to chunks of memory. - # Every chunk in this array has a chunk size that is - # at least as large as `chunk_size`. - chunks::Ptr{Ptr{UInt8}} -end - -# A GC arena that uses a custom ("bodega") allocation algorithm for allocations. -# Essentially, this type of arena has a list of "shelves" that contain small, -# preallocated chunks of memory that threads can claim in a fast and lock-free -# manner. When the shelves run out of memory, threads may re-stock them from free -# list, amortizing the cost of lock acquisition across many different allocations. -struct BodegaArena - # The number of shelves in the arena. - shelf_count::Int - - # A pointer to an array of shelves. - shelves::Ptr{BodegaShelf} - - # A Boolean that tells if it is sensible to try and restock shelves in this - # arena. Restocking shelves becomes futile once the free list's capacity is - # exhausted. - can_restock::Bool - - # The free list this bodega uses for large allocations and for re-stocking - # the shelves. - free_list::FreeListArena -end - -# Gets a pointer to a bodega arena's free list. -function get_free_list(arena::Ptr{BodegaArena})::Ptr{FreeListArena} - @get_field_pointer(arena, :free_list) -end - -# Gets a bodega arena's lock. -get_lock(arena::Ptr{BodegaArena}) = get_lock(get_free_list(arena)) - -# Gets the first shelf containing chunks that are at least `bytesize` bytes -# in size. Returns null if there is no such shelf. -function get_shelf(arena::Ptr{BodegaArena}, bytesize::Csize_t)::Ptr{BodegaShelf} - bodega = unsafe_load(arena) - for i in 1:bodega.shelf_count - shelf = bodega.shelves + (i - 1) * sizeof(BodegaShelf) - chunk_size = unsafe_load(@get_field_pointer(shelf, :chunk_size)) - if chunk_size >= bytesize - return shelf - end - end - return C_NULL -end - # A reference to a Julia object. 
const ObjectRef = Ptr{Nothing} @@ -329,10 +181,6 @@ struct GCMasterRecord # The number of local arenas. local_arena_count::UInt32 - # A pointer to the tiny arena, which uses the ScatterAlloc - # algorithm to provision space for small objects. - tiny_arena::Ptr{ScatterAllocArena} - # A pointer to a list of local GC arena pointers. local_arenas::Ptr{Ptr{LocalArena}} @@ -569,181 +417,6 @@ function gc_add_to_free_list( unsafe_store!(list_ptr, entry) end -# Tries to allocate a chunk of memory from a ScatterAlloc page. -# Returns a null pointer if no chunk of memory can be found. -function gc_scatter_alloc_use_page( - page::Ptr{ScatterAllocPage}, - region::Ptr{ScatterAllocRegion}, - superblock::Ptr{ScatterAllocSuperblock})::Ptr{UInt8} - - alloc_chunk_ptr = @get_field_pointer(page, :allocated_chunk_count) - fill_level = atomic_add!(alloc_chunk_ptr, 1) - spots = chunk_count(page, superblock) - if fill_level < spots - if fill_level + 1 == spots - # The page is full now. Increment the region's counter. - full_page_ptr = @get_field_pointer(region, :full_page_count) - atomic_add!(full_page_ptr, 1) - end - - lane_id = (get_thread_id() - 1) % warpsize() - spot = lane_id % spots - occupancy_ptr = @get_field_pointer(page, :occupancy) - while true - # Check if our preferred spot is available. - mask = 1 << spot - old = atomic_or!(occupancy_ptr, mask) - - actual_fill = 0 - for i in 1:64 - if old & (1 << (i - 1)) != 0 - actual_fill += 1 - end - end - - # If the spot is available, then use it. - if old & mask == 0 - break - end - - # Otherwise, find a new spot. - spot = (spot + 1) % spots - end - return chunk_address(page, spot) - end - - # The page is full. 
- atomic_subtract!(alloc_chunk_ptr, 1) - return C_NULL -end - -function scatter_alloc_hash( - superblock::Ptr{ScatterAllocSuperblock}, - bytesize::Int64)::Int64 - - sb = unsafe_load(superblock) - page_count = sb.region_count * sb.pages_per_region - warp_id = get_warp_id() - 1 - - k_S = 38183 - k_mp = 17497 - - (bytesize * k_S + warp_id * k_mp) % page_count -end - -# Tries to allocate a chunk of memory from a ScatterAlloc superblock. -# Returns a null pointer if no sufficiently large chunk of -# memory can be found. -function gc_scatter_alloc_use_superblock( - superblock::Ptr{ScatterAllocSuperblock}, - bytesize::Csize_t)::Ptr{UInt8} - - if bytesize > page_size(superblock) - # This isn't going to work. The superblock's page size is just too small. - return C_NULL - end - - # Choose the allocation size in such a way that we never end up with more than - # 64 chunks. This is necessary because the chunk occupancy bitfield is only - # 64 bits wide. - alloc_size = Int64(div(page_size(superblock), 64)) - if alloc_size < Int64(bytesize) - alloc_size = Int64(bytesize) - end - - # Align the allocation size. - alloc_size = align_upward(alloc_size) - - # We are looking for a chunk that is `bytesize` bytes in size, - # but we're willing to accept a chunk that is twice as large. - waste_factor = 2 - max_size = alloc_size * waste_factor - - pages_per_region = unsafe_load(@get_field_pointer(superblock, :pages_per_region)) - region_count = unsafe_load(@get_field_pointer(superblock, :region_count)) - - # Guess a global page index. - global_page_id = scatter_alloc_hash(superblock, alloc_size) - - # Decompose that global page index into a region index and a - # local page index. - region_id = global_page_id % pages_per_region - page_id = div(global_page_id, pages_per_region) - - # Remember the initial values of the region and page ids. - init_region_id = region_id - init_page_id = page_id - - # Find the region and page corresponding to the current page ID. 
- region = region_address(superblock, region_id) - while true - page = page_address(region, superblock, page_id) - - # Skip regions until we find a region that is sufficiently empty. - while true - region_fill_level = unsafe_load(region).full_page_count / pages_per_region - if region_fill_level > 0.9 - region_id += 1 - if region_id >= region_count - region_id = 0 - end - region = region_address(superblock, region_id) - page_id = 0 - else - break - end - end - - # Try to set the chunk size to our preferred chunk size. - chunk_size_ptr = @get_field_pointer(page, :chunk_size) - chunk_size = atomic_compare_exchange!(chunk_size_ptr, 0, alloc_size) - if chunk_size == 0 || (chunk_size >= alloc_size && chunk_size <= max_size) - # If we managed to set the page's chunk size, then the page is definitely - # suitable for our purposes. Otherwise, the page might still be suitable - # if its chunk size is sufficiently large to accommodate the requested - # size yet small enough to not waste too much space. - result = gc_scatter_alloc_use_page(page, region, superblock) - if result != C_NULL - return result - end - end - - # Try the next page. - page_id += 1 - - if page_id >= pages_per_region - region_id += 1 - if region_id >= region_count - region_id = 0 - end - region = region_address(superblock, region_id) - page_id = 0 - end - - # We tried every page in the entire superblock and found nothing. - if region_id == init_region_id && page_id == init_page_id - return C_NULL - end - end -end - -# Tries to allocate a chunk of memory in a particular GC arena. -# Returns a null pointer if no sufficiently large chunk of -# memory can be found. -function gc_malloc_local(arena::Ptr{ScatterAllocArena}, bytesize::Csize_t)::Ptr{UInt8} - # Walk the list of superblocks until we find a valid candidate. 
- superblock = unsafe_load(arena).first_superblock - while superblock != C_NULL - result = gc_scatter_alloc_use_superblock(superblock, bytesize) - if result != C_NULL - return result - end - superblock = unsafe_load(@get_field_pointer(superblock, :next)) - end - - return C_NULL -end - # Tries to allocate a chunk of memory from a free list. # Returns a null pointer if no sufficiently large chunk of # memory can be found. @@ -823,101 +496,6 @@ function gc_malloc_local(arena::Ptr{FreeListArena}, bytesize::Csize_t; acquire_l return result_ptr end -# Atomically takes a chunk from a shelf. Returns null if the shelf -# is empty. -function gc_malloc_from_shelf(shelf::Ptr{BodegaShelf})::Ptr{UInt8} - capacity = unsafe_load(@get_field_pointer(shelf, :capacity)) - - # Atomically increment the chunk finger. - finger_ptr = @get_field_pointer(shelf, :chunk_finger) - finger = atomic_add!(finger_ptr, 1) - - if finger < capacity - # If the chunk finger was less than the capacity, then we actually - # managed to take a chunk from the shelf. We only need to retrieve - # its address. - chunk_array = unsafe_load(@get_field_pointer(shelf, :chunks)) - return unsafe_load(chunk_array, finger + 1) - else - # Otherwise, we've got nothing. Return null. - return C_NULL - end -end - -# Re-stocks a shelf. -function restock_shelf(arena::Ptr{BodegaArena}, shelf::Ptr{BodegaShelf}) - shelf_size = unsafe_load(@get_field_pointer(shelf, :chunk_size)) - capacity = unsafe_load(@get_field_pointer(shelf, :capacity)) - finger_ptr = @get_field_pointer(shelf, :chunk_finger) - finger = unsafe_load(finger_ptr) - - # The finger may exceed the capacity. This is harmless. Just - # reset the finger to the capacity. - if finger > capacity - finger = capacity - end - - # Actually re-stock the shelf. 
- free_list = get_free_list(arena) - chunk_array = unsafe_load(@get_field_pointer(shelf, :chunks)) - while finger > 0 - chunk = gc_malloc_from_free_list(free_list, shelf_size) - if chunk == C_NULL - # We exhausted the free list. Better break now. Also set - # the arena's `can_restock` flag to false so there will be - # no future attempts to re-stock shelves. - unsafe_store!(@get_field_pointer(arena, :can_restock), false) - break - end - - # Update the chunk array. - unsafe_store!(chunk_array, chunk, finger) - finger -= 1 - end - - # Update the finger. - unsafe_store!(finger_ptr, finger) -end - -# Tries to allocate a chunk of memory in a particular GC arena. -# Returns a null pointer if no sufficiently large chunk of -# memory can be found. -function gc_malloc_local(arena::Ptr{BodegaArena}, bytesize::Csize_t; acquire_lock=true)::Ptr{UInt8} - # The bodega arena might be empty (or approximately empty). If so, then we'll - # just return null early. There's no need to scrape the bottom of the barrel. - if !unsafe_load(@get_field_pointer(arena, :can_restock)) - return C_NULL - end - - # Find the right shelf for this allocation. - shelf = get_shelf(arena, bytesize) - free_list = get_free_list(arena) - if shelf == C_NULL - # The shelves' chunk sizes are all too small to accommodate this - # allocation. Use the free list directly. - return gc_malloc_local(free_list, bytesize) - end - - # Acquire a reader lock on the arena and try to take a chunk - # from the shelf. - lock = get_lock(free_list) - result_ptr = reader_locked(lock; acquire_lock=acquire_lock) do - gc_malloc_from_shelf(shelf) - end - - if result_ptr == C_NULL - # Looks like we need to re-stock the shelf. While we're at it, - # we might as well grab a chunk of memory for ourselves. 
- result_ptr = writer_locked(lock; acquire_lock=acquire_lock) do - restock_shelf(arena, shelf) - gc_malloc_from_free_list(free_list, bytesize) - end - end - - gc_protect(result_ptr) - return result_ptr -end - # Transfers a block of free memory from one arena to another and then # allocates a differently-sized block of memory from the destination # arena. @@ -944,28 +522,6 @@ function gc_transfer_and_malloc( end end -# Transfers a block of free memory from one arena to another and then -# allocates a differently-sized block of memory from the destination -# arena. -function gc_transfer_and_malloc( - from_arena::Ptr{FreeListArena}, - to_arena::Ptr{BodegaArena}, - transfer_bytesize::Csize_t, - alloc_bytesize::Csize_t)::Ptr{UInt8} - - result = gc_transfer_and_malloc( - from_arena, - get_free_list(to_arena), - transfer_bytesize, - alloc_bytesize) - - writer_locked(get_lock(to_arena)) do - unsafe_store!(@get_field_pointer(to_arena, :can_restock), true) - end - - return result -end - """ gc_malloc(bytesize::Csize_t)::Ptr{UInt8} @@ -976,16 +532,6 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} master_record = get_gc_master_record() function allocate() - # Try to allocate in the tiny arena first. The ScatterAlloc - # algorithm used by that arena is lock-free and works well - # for small objects. - if master_record.tiny_arena != C_NULL - local_ptr = gc_malloc_local(master_record.tiny_arena, bytesize) - if local_ptr != C_NULL - return local_ptr - end - end - # Try to allocate in the local arena second. If that doesn't # work, we'll move on to the global arena, which is bigger but # is shared by all threads. (We want to minimize contention @@ -1126,14 +672,6 @@ end # One megabyte. const MiB = 1 << 20 -# The point at which a tiny arena is deemed to be starving, i.e., -# it no longer contains enough memory to perform basic allocations. 
-# If a tiny arena's free byte count stays below the arena starvation -# threshold after a collection phase, the collector will allocate -# additional memory to the arena such that it is no longer starving. -# This arena starvation threshold is currently set to 2 MiB. -const tiny_arena_starvation_threshold = 0 # 2 * MiB - # A description of a region of memory that has been allocated to the GC heap. const GCHeapRegion = CUDAdrv.Mem.HostBuffer @@ -1244,14 +782,6 @@ function gc_init!( # Compute a pointer to the start of the tiny arena. arena_start_ptr = rootbuf_ptr + rootbuf_bytesize - # Set up the tiny object arena. - if tiny_arena_starvation_threshold > 0 - arena_for_ants = make_gc_arena!(ScatterAllocArena, arena_start_ptr, Csize_t(tiny_arena_starvation_threshold)) - arena_start_ptr += tiny_arena_starvation_threshold - else - arena_for_ants = Base.unsafe_convert(Ptr{ScatterAllocArena}, C_NULL) - end - # Set up local arenas. for i in 1:config.local_arena_count local_arena = make_gc_arena!(LocalArena, arena_start_ptr, Csize_t(config.local_arena_initial_size)) @@ -1267,7 +797,6 @@ function gc_init!( UInt32(thread_count), UInt32(config.root_buffer_capacity), UInt32(config.local_arena_count), - arena_for_ants, local_arenas_ptr, global_arena, safepoint_ptr, @@ -1302,89 +831,6 @@ function make_gc_arena!(::Type{FreeListArena}, start_ptr::Ptr{T}, size::Csize_t) arena end -# Takes a zero-filled region of memory and turns it into an arena -# managed by the GC, prefixed with an arena record. 
-function make_gc_arena!(::Type{BodegaArena}, start_ptr::Ptr{T}, size::Csize_t)::Ptr{BodegaArena} where T - current_ptr = start_ptr + sizeof(BodegaArena) - - # Set up some shelf chunk arrays - shelf_records = [] - for chunk_size in [32, 64] - capacity = 2048 - shelf_chunk_array = Base.unsafe_convert(Ptr{Ptr{UInt8}}, current_ptr) - current_ptr += capacity * sizeof(Ptr{UInt8}) - push!(shelf_records, BodegaShelf(Csize_t(chunk_size), capacity, capacity, shelf_chunk_array)) - end - - # Set up the shelves. - shelf_array = Base.unsafe_convert(Ptr{BodegaShelf}, current_ptr) - for record in shelf_records - shelf = Base.unsafe_convert(Ptr{BodegaShelf}, current_ptr) - current_ptr += sizeof(BodegaShelf) - unsafe_store!(shelf, record) - end - - # Set up a free list entry. - first_entry_ptr = make_gc_block!(current_ptr, Csize_t(start_ptr + size) - Csize_t(current_ptr)) - - # Set up the arena record. - arena = Base.unsafe_convert(Ptr{BodegaArena}, start_ptr) - unsafe_store!( - arena, - BodegaArena( - length(shelf_records), - shelf_array, - true, - FreeListArena(0, first_entry_ptr, C_NULL))) - - # Stock the shelves. - for record in shelf_records - restock_shelf(arena, get_shelf(arena, record.chunk_size)) - end - - arena -end - -# Takes a zero-filled region of memory and turns it into a ScatterAlloc -# superblock. -function make_gc_superblock!( - start_ptr::Ptr{T}, - size::Csize_t; - page_size::UInt32 = UInt32(2048), - pages_per_region::UInt32 = UInt32(16))::Ptr{ScatterAllocSuperblock} where T - - region_size = region_bytesize(pages_per_region, page_size) - - # Figure out how many regions we can allocate. - region_count = div(size - header_size(ScatterAllocSuperblock), region_size) - - # At this point, we'd normally allocate regions and pages. - # However, region and page headers are zero-initialized by default. - # So we don't actually need to do anything to set up the regions - # and pages. - - # Allocate the superblock header. 
- superblock = Base.unsafe_convert(Ptr{ScatterAllocSuperblock}, align_upward(start_ptr)) - unsafe_store!( - superblock, - ScatterAllocSuperblock(region_count, pages_per_region, page_size, C_NULL)) - - superblock -end - -# Takes a zero-filled region of memory and turns it into an arena -# managed by the GC, prefixed with an arena record. -function make_gc_arena!(::Type{ScatterAllocArena}, start_ptr::Ptr{T}, size::Csize_t)::Ptr{ScatterAllocArena} where T - superblock_ptr = align_upward(start_ptr + sizeof(ScatterAllocArena)) - superblock = make_gc_superblock!(superblock_ptr, Csize_t(start_ptr) + size - Csize_t(superblock_ptr)) - arena = Base.unsafe_convert(Ptr{ScatterAllocArena}, start_ptr) - unsafe_store!( - arena, - ScatterAllocArena(superblock)) - - arena -end - # Tells if a GC heap contains a particular pointer. function contains(heap::GCHeapDescription, pointer::Ptr{T}) where T for region in heap.regions @@ -1476,33 +922,6 @@ function iterate_allocated(fun::Function, arena::Ptr{FreeListArena}) iterate_allocation_records(fun, allocation_list_head) end -# Composes a set that contains all data addresses of chunks that -# are on the shelves. -function chunks_on_shelves(arena::Ptr{BodegaArena}) - arena_data = unsafe_load(arena) - chunks_on_shelves = Set{Ptr{UInt8}}() - for i in 1:arena_data.shelf_count - shelf = unsafe_load(arena_data.shelves, i) - for j in shelf.chunk_finger:(shelf.capacity - 1) - push!(chunks_on_shelves, unsafe_load(shelf.chunks, j)) - end - end - return chunks_on_shelves -end - -# Iterates through all active allocation records in a GC arena. -function iterate_allocated(fun::Function, arena::Ptr{BodegaArena}) - shelf_chunks = chunks_on_shelves(arena) - - # Now iterate through the allocation list, ignoring records that have - # been placed on the shelves. - iterate_allocated(get_free_list(arena)) do record - if !(data_pointer(record) in shelf_chunks) - fun(record) - end - end -end - # Iterates through all free allocation records in a GC arena. 
function iterate_free(fun::Function, arena::Ptr{FreeListArena}) free_list_head = unsafe_load(arena).free_list_head @@ -1544,22 +963,6 @@ function gc_free_garbage(arena::Ptr{FreeListArena}, live_blocks::Set{Ptr{FreeLis end end -# Frees all dead blocks in an arena. -function gc_free_garbage(arena::Ptr{BodegaArena}, live_blocks::Set{Ptr{FreeListRecord}}) - # Mark chunks on shelves as live. - all_live_blocks = Set{Ptr{FreeListRecord}}(live_blocks) - shelf_chunks = chunks_on_shelves(arena) - for chunk_ptr in shelf_chunks - push!(all_live_blocks, record_pointer(chunk_ptr)) - end - - # Free garbage in the free list sub-arena. - gc_free_garbage(get_free_list(arena), all_live_blocks) - - # Mark the arena as ready for restocking. - unsafe_store!(@get_field_pointer(arena, :can_restock), true) -end - # Compact a GC arena's free list. This function will # 1. merge adjancent free blocks, and # 2. reorder free blocks to put small blocks at the front @@ -1609,31 +1012,6 @@ function gc_compact(arena::Ptr{FreeListArena})::Csize_t return sum(map(record -> unsafe_load(record).size, records)) end -# Compact a GC arena's free list. This function will -# 1. merge adjancent free blocks, and -# 2. reorder free blocks to put small blocks at the front -# of the free list, -# 3. tally the total number of free bytes and return that number. -function gc_compact(arena::Ptr{BodegaArena})::Csize_t - # Compact the free list. - tally = gc_compact(get_free_list(arena)) - - # Add the size of the chunks on shelves to the tally. - shelf_count = unsafe_load(@get_field_pointer(arena, :shelf_count)) - for i in 1:shelf_count - shelf_array = unsafe_load(@get_field_pointer(arena, :shelves)) - shelf_data = unsafe_load(shelf_array, i) - - finger = shelf_data.chunk_finger - if finger > shelf_data.capacity - finger = shelf_data.capacity - end - tally += shelf_data.chunk_size * (shelf_data.capacity - finger) - end - - tally -end - # Expands a GC arena by assigning it an additional heap region. 
function gc_expand(arena::Ptr{FreeListArena}, region::GCHeapRegion) extra_record = make_gc_block!(pointer(region), Csize_t(sizeof(region))) @@ -1644,11 +1022,6 @@ function gc_expand(arena::Ptr{FreeListArena}, region::GCHeapRegion) unsafe_store!(last_free_list_ptr, extra_record) end -# Expands a GC arena by assigning it an additional heap region. -function gc_expand(arena::Ptr{BodegaArena}, region::GCHeapRegion) - gc_expand(get_free_list(arena), region) -end - """A report of the GC's actions.""" mutable struct GCReport """The total wall-clock time of a kernel execution.""" From 2c058c74b3f850c047223816fc0a6a0d35b13e75 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 11 Jun 2019 17:50:30 +0200 Subject: [PATCH 138/146] Remove binary tree example --- examples/binary-tree.jl | 176 ---------------------------------------- 1 file changed, 176 deletions(-) delete mode 100644 examples/binary-tree.jl diff --git a/examples/binary-tree.jl b/examples/binary-tree.jl deleted file mode 100644 index 812af535..00000000 --- a/examples/binary-tree.jl +++ /dev/null @@ -1,176 +0,0 @@ -using CUDAdrv, CUDAnative -using Random, Test -import Base: haskey, insert! - -# This example defines a kernel that constructs a binary search -# tree for a set of numbers and then proceeds to test membership -# in that tree for a sequence of other numbers. -# -# The main point of this example is to demonstrate that even -# naive, pointer-chasing programs can be compiled to GPU kernels. 
- -const use_gc = false - -"""A binary search tree node.""" -abstract type BinarySearchTreeNode{T} end - -"""An internal node of a binary search tree.""" -mutable struct InternalNode{T} <: BinarySearchTreeNode{T} - value::T - left::BinarySearchTreeNode{T} - right::BinarySearchTreeNode{T} -end - -InternalNode{T}(value::T) where T = InternalNode{T}(value, LeafNode{T}(), LeafNode{T}()) - -"""A leaf node of a binary search tree.""" -mutable struct LeafNode{T} <: BinarySearchTreeNode{T} end - -"""A binary search tree data structure.""" -mutable struct BinarySearchTree{T} - root::BinarySearchTreeNode{T} -end - -"""Creates an empty binary search tree.""" -BinarySearchTree{T}() where T = BinarySearchTree{T}(LeafNode{T}()) - -"""Tells if a binary search tree contains a particular element.""" -function haskey(tree::BinarySearchTree{T}, value::T)::Bool where T - walk = tree.root - while isa(walk, InternalNode{T}) - if walk.value == value - return true - elseif walk.value > value - walk = walk.right - else - walk = walk.left - end - end - return false -end - -"""Inserts an element into a binary search tree.""" -function insert!(tree::BinarySearchTree{T}, value::T) where T - if !isa(tree.root, InternalNode{T}) - tree.root = InternalNode{T}(value) - return - end - - walk = tree.root::InternalNode{T} - while true - if walk.value == value - return - elseif walk.value > value - right = walk.right - if isa(right, InternalNode{T}) - walk = right - else - walk.right = InternalNode{T}(value) - return - end - else - left = walk.left - if isa(left, InternalNode{T}) - walk = left - else - walk.left = InternalNode{T}(value) - return - end - end - end -end - -""" -Creates a binary search tree that contains elements copied from a device array. 
-""" -function BinarySearchTree{T}(elements::CUDAnative.DevicePtr{T}, size::Integer) where T - tree = BinarySearchTree{T}() - for i in 1:size - insert!(tree, unsafe_load(elements, i)) - end - tree -end - -""" -Creates a binary search tree that contains elements copied from an array. -""" -function BinarySearchTree{T}(elements::Array{T}) where T - tree = BinarySearchTree{T}() - for i in 1:length(elements) - insert!(tree, elements[i]) - end - tree -end - -# Gets a sequence of Fibonacci numbers. -function fibonacci(::Type{T}, count::Integer)::Array{T} where T - if count == 0 - return [] - elseif count == 1 - return [one(T)] - end - - results = [one(T), one(T)] - for i in 1:(count - 2) - push!(results, results[length(results) - 1] + results[length(results)]) - end - return results -end - -const number_count = 200 -const thread_count = 64 -const tests_per_thread = 2000 - -# Define a kernel that copies values using a temporary buffer. -function kernel(a::CUDAnative.DevicePtr{Int64}, b::CUDAnative.DevicePtr{Int64}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - tree = BinarySearchTree{Int64}(a, number_count) - - for j in 1:tests_per_thread - offset = (i - 1) * tests_per_thread - index = offset + j - unsafe_store!(b, haskey(tree, unsafe_load(b, index)), index) - end - - return -end - -ccall((:ha_init_bytes, "/media/jonathan/Quark/School/CUDAnative.jl/libhalloc"), Cvoid, (Csize_t,), Csize_t(256 * 1024 * 1024)) - -# Generate a sequence of 64-bit truncated Fibonacci numbers. -number_set = fibonacci(Int64, number_count) -# Randomize the sequence's order. -shuffle!(number_set) - -# Generate numbers for which we will test membership in the sequence. -test_sequence = Array(1:(thread_count * tests_per_thread)) - -# Allocate two arrays. 
-source_array = Mem.alloc(Int64, length(number_set)) -destination_array = Mem.alloc(Int64, length(test_sequence)) -source_pointer = Base.unsafe_convert(CuPtr{Int64}, source_array) -destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) - -# Fill the source and destination arrays. -Mem.upload!(source_array, number_set) -Mem.upload!(destination_array, test_sequence) - -if use_gc - # Run the kernel. - @cuda gc=true malloc="_Z8hamallocm" threads=thread_count kernel(source_pointer, destination_pointer) - - # Run it again. - Mem.upload!(destination_array, test_sequence) - stats = @cuda gc=true malloc="_Z8hamallocm" threads=thread_count kernel(source_pointer, destination_pointer) -else - # Run the kernel. - @cuda malloc="_Z8hamallocm" threads=thread_count kernel(source_pointer, destination_pointer) - - # Run it again and time it this time. - Mem.upload!(destination_array, test_sequence) - stats = CUDAdrv.@elapsed @cuda malloc="_Z8hamallocm" threads=thread_count kernel(source_pointer, destination_pointer) -end -println(stats) - -@test Mem.download(Int64, destination_array, length(test_sequence)) == ([Int64(x in number_set) for x in test_sequence]) From 2f4f77333f3c885a7fd940feb0ba071d7bb9713b Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 12 Jun 2019 13:45:23 +0200 Subject: [PATCH 139/146] Update GC benchmark runner --- gc-benchmarks/array-features.jl | 2 +- gc-benchmarks/run-all.jl | 46 +++++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/gc-benchmarks/array-features.jl b/gc-benchmarks/array-features.jl index c27a876a..045d52bc 100644 --- a/gc-benchmarks/array-features.jl +++ b/gc-benchmarks/array-features.jl @@ -91,7 +91,7 @@ end function kernel(destination) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - for j in 1:3 + for j in 1:2 unsafe_store!(destination, manipulate_array(), i) end return diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index eb17da9b..43fbac42 100644 --- 
a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -24,7 +24,7 @@ gc_tags = [t for t in benchmark_tags if startswith(t, "gc")] # Also write them to a CSV for further analysis. open("strategies.csv", "w") do file write(file, "benchmark,nogc,gc,gc-shared,bump,nogc-ratio,gc-ratio,gc-shared-ratio,bump-ratio\n") - for key in sort([k for k in keys(results)]) + for key in sort(collect(keys(results))) runs = results[key] median_times = BenchmarkTools.median(runs) gc_time = median_times["gc"].time / 1e6 @@ -43,7 +43,7 @@ open("gc-heap-sizes.csv", "w") do file write(file, "benchmark,$(join(gc_tags, ',')),$(join(ratio_tags, ','))\n") all_times = [[] for t in gc_tags] all_normalized_times = [[] for t in gc_tags] - for key in sort([k for k in keys(results)]) + for key in sort(collect(keys(results))) runs = results[key] median_times = BenchmarkTools.median(runs) times = [median_times[t].time / 1e6 for t in gc_tags] @@ -58,3 +58,45 @@ open("gc-heap-sizes.csv", "w") do file end write(file, "mean,$(join(map(mean, all_times), ',')),$(join(map(mean, all_normalized_times), ','))\n") end + +open("gc-heap-sizes-summary.csv", "w") do file + write(file, "heap,mean-opt,mean-shared\n") + shared = Dict() + sizes = Dict() + for tag in gc_tags + shared[tag] = false + sizes[tag] = 60.0 + for part in split(tag, "-") + if endswith(part, "mb") + sizes[tag] = parse(Float64, part[1:end - 2]) + elseif part == "shared" + shared[tag] = true + end + end + end + + all_normalized_times = [[] for t in gc_tags] + for key in sort(collect(keys(results))) + runs = results[key] + median_times = BenchmarkTools.median(runs) + normalized_times = [median_times[t].time / median_times["gc"].time for t in gc_tags] + for (l, val) in zip(all_normalized_times, normalized_times) + push!(l, val) + end + end + + unique_sizes = sort(unique(values(sizes))) + data = zeros(Float64, (2, length(unique_sizes))) + for (tag, vals) in zip(gc_tags, all_normalized_times) + if shared[tag] + shared_index = 2 + else + 
shared_index = 1 + end + size_index = indexin(sizes[tag], unique_sizes)[1] + data[shared_index, size_index] = mean(vals) + end + for i in 1:length(unique_sizes) + write(file, "$(unique_sizes[i]),$(data[1, i]),$(data[2, i])\n") + end +end From 350f0ed34527729bad5b467311c668f03559343e Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 14 Jun 2019 11:55:16 +0200 Subject: [PATCH 140/146] Tweak benchmarks --- gc-benchmarks/array-expansion.jl | 2 +- gc-benchmarks/run-all.jl | 18 ++++++-------- gc-benchmarks/utils.jl | 41 +++++++++++++++++++++++--------- 3 files changed, 38 insertions(+), 23 deletions(-) diff --git a/gc-benchmarks/array-expansion.jl b/gc-benchmarks/array-expansion.jl index 76abf14a..f7b43075 100644 --- a/gc-benchmarks/array-expansion.jl +++ b/gc-benchmarks/array-expansion.jl @@ -7,7 +7,7 @@ using CUDAdrv, CUDAnative const thread_count = 256 const array_length = 200 -const runs = 10 +const runs = 5 function iterative_sum(elements::Array{T})::T where T result = zero(T) diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 43fbac42..60822434 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -4,7 +4,6 @@ include("utils.jl") include("array-expansion.jl") include("array-features.jl") -include("array-reduction.jl") include("arrays.jl") include("binary-tree.jl") include("bitvector.jl") @@ -26,11 +25,10 @@ open("strategies.csv", "w") do file write(file, "benchmark,nogc,gc,gc-shared,bump,nogc-ratio,gc-ratio,gc-shared-ratio,bump-ratio\n") for key in sort(collect(keys(results))) runs = results[key] - median_times = BenchmarkTools.median(runs) - gc_time = median_times["gc"].time / 1e6 - gc_shared_time = median_times["gc-shared"].time / 1e6 - nogc_time = median_times["nogc"].time / 1e6 - bump_time = median_times["bump"].time / 1e6 + gc_time = runs["gc"] / 1e6 + gc_shared_time = runs["gc-shared"] / 1e6 + nogc_time = runs["nogc"] / 1e6 + bump_time = runs["bump"] / 1e6 gc_ratio = gc_time / nogc_time gc_shared_ratio = 
gc_shared_time / nogc_time bump_ratio = bump_time / nogc_time @@ -45,12 +43,11 @@ open("gc-heap-sizes.csv", "w") do file all_normalized_times = [[] for t in gc_tags] for key in sort(collect(keys(results))) runs = results[key] - median_times = BenchmarkTools.median(runs) - times = [median_times[t].time / 1e6 for t in gc_tags] + times = [runs[t] / 1e6 for t in gc_tags] for (l, val) in zip(all_times, times) push!(l, val) end - normalized_times = [median_times[t].time / median_times["gc"].time for t in gc_tags] + normalized_times = [runs[t] / runs["gc"] for t in gc_tags] for (l, val) in zip(all_normalized_times, normalized_times) push!(l, val) end @@ -78,8 +75,7 @@ open("gc-heap-sizes-summary.csv", "w") do file all_normalized_times = [[] for t in gc_tags] for key in sort(collect(keys(results))) runs = results[key] - median_times = BenchmarkTools.median(runs) - normalized_times = [median_times[t].time / median_times["gc"].time for t in gc_tags] + normalized_times = [runs[t] / runs["gc"] for t in gc_tags] for (l, val) in zip(all_normalized_times, normalized_times) push!(l, val) end diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index 6ceca63d..4598c743 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -1,4 +1,4 @@ -import BenchmarkTools +import BenchmarkTools, JSON function get_gc_mode() try @@ -59,25 +59,26 @@ macro cuda_sync(args...) 
end) end -suite = BenchmarkTools.BenchmarkGroup() +suites = Dict() function register_cuda_benchmark(f, name, config) - suite[name][config] = BenchmarkTools.@benchmarkable $f() setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $f()) teardown=(device_reset!()) evals=1 + suites[name][config] = BenchmarkTools.@benchmarkable $f() setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $f()) teardown=(device_reset!()) evals=1 seconds=90 end const MiB = 1 << 20 benchmark_tags = [ "gc", "gc-shared", + "gc-45mb", "gc-shared-45mb", "gc-30mb", "gc-shared-30mb", "gc-15mb", "gc-shared-15mb", - "gc-7.5mb", "gc-shared-7.5mb", - "gc-3.75mb", "gc-shared-3.75mb", + "gc-10mb", "gc-shared-10mb", "nogc", "bump" ] macro cuda_benchmark(name, ex) esc(quote + local suite = BenchmarkTools.BenchmarkGroup() local function register_gc_shared(config, heap_size) register_cuda_benchmark($name, config) do global gc_mode = "gc" @@ -98,17 +99,17 @@ macro cuda_benchmark(name, ex) end end - suite[$name] = BenchmarkTools.BenchmarkGroup(benchmark_tags) + suites[$name] = BenchmarkTools.BenchmarkGroup(benchmark_tags) register_gc("gc", 60 * MiB) register_gc_shared("gc-shared", 60 * MiB) + register_gc("gc-45mb", 45 * MiB) + register_gc_shared("gc-shared-45mb", 45 * MiB) register_gc("gc-30mb", 30 * MiB) register_gc_shared("gc-shared-30mb", 30 * MiB) register_gc("gc-15mb", 15 * MiB) register_gc_shared("gc-shared-15mb", 15 * MiB) - register_gc("gc-7.5mb", div(15 * MiB, 2)) - register_gc_shared("gc-shared-7.5mb", div(15 * MiB, 2)) - register_gc("gc-3.75mb", div(15 * MiB, 4)) - register_gc_shared("gc-shared-3.75mb", div(15 * MiB, 4)) + register_gc("gc-10mb", 10 * MiB) + register_gc_shared("gc-shared-10mb", 10 * MiB) register_cuda_benchmark($name, "nogc") do global gc_mode = "nogc" $(ex) @@ -121,7 +122,25 @@ macro cuda_benchmark(name, ex) end function run_benchmarks() - BenchmarkTools.run(suite) + cache_dir = mkpath("gc-benchmarks/results-cache") + results = Dict() + for (name, group) in pairs(suites) + cache_path = 
"$cache_dir/$(replace(name, " " => "-")).json" + if isfile(cache_path) + group_results = open(cache_path, "r") do file + JSON.parse(file) + end + else + runs = BenchmarkTools.run(group) + median_times = BenchmarkTools.median(runs) + group_results = Dict(k => r.time for (k, r) in pairs(median_times)) + open(cache_path, "w") do file + JSON.print(file, group_results) + end + end + results[name] = group_results + end + return results end module CUDArandom From 93a2f57a93c49f8a75d8ae073437fa6236ba7217 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 14 Jun 2019 16:56:03 +0200 Subject: [PATCH 141/146] Add a mean to 'strategies.csv' too --- strategies.csv | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 strategies.csv diff --git a/strategies.csv b/strategies.csv new file mode 100644 index 00000000..baa41e09 --- /dev/null +++ b/strategies.csv @@ -0,0 +1,13 @@ +benchmark,nogc,gc,gc-shared,bump,nogc-ratio,gc-ratio,gc-shared-ratio,bump-ratio +array expansion,517.36013,145.260881,395.999929,11.459003,1.0,0.28077324203548504,0.7654241330888795,0.02214898739877771 +array features,236.641882,134.679289,330.545038,18.5354245,1.0,0.5691270195357896,1.3968154546708682,0.07832689777205204 +arrays,5001.1374265,888.7860235,1356.95648,4.4352555,1.0,0.17771677674572697,0.2713295725107985,0.0008868493548084666 +binary tree,1993.06871,571.320489,950.1915835,33.504988,1.0,0.28665368440810046,0.4767480311805206,0.016810754105913386 +bitvector,3095.321124,690.0682245,3010.889644,25.915606,1.0,0.22293913841425392,0.9727228689309975,0.008372509656287282 +genetic algo,274.7332775,173.877077,936.568618,4.434724,1.0,0.6328941240108782,3.4090104647042625,0.01614192514410636 +linked list,3983.6005275,712.9961405,711.111524,4.4301,1.0,0.1789828411704366,0.17850974742346326,0.0011120843993813333 +matrix,52.3009975,152.4938,157.114092,36.3442955,1.0,2.9156958239658812,3.004036242329795,0.6949063543195328 +ssa 
opt,682.361637,238.392158,1425.6750165,4.306012,1.0,0.3493633655140551,2.0893246911827785,0.006310454407916839 +static arrays,454.761664,131.938243,180.566625,6.793038,1.0,0.29012613297148987,0.3970577102119144,0.014937578379517936 +stream queries,5138.209838,578.285759,4061.109831,4.434645,1.0,0.11254615463993824,0.7903744609583226,0.0008630719919617265 +mean,1948.1361103636364,401.64528040909096,1228.7934891818181,14.053917409090907,1.0,0.5469834821283668,1.2501230342902363,0.07825613335729599 From 27822446c87d4a0efa475393fd73a44c4a2a6f76 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 14 Jun 2019 17:07:07 +0200 Subject: [PATCH 142/146] Remove strategies.csv from root dir --- gc-benchmarks/run-all.jl | 14 +++++++++++++- strategies.csv | 13 ------------- 2 files changed, 13 insertions(+), 14 deletions(-) delete mode 100644 strategies.csv diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 60822434..185af2d7 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -23,6 +23,17 @@ gc_tags = [t for t in benchmark_tags if startswith(t, "gc")] # Also write them to a CSV for further analysis. 
open("strategies.csv", "w") do file write(file, "benchmark,nogc,gc,gc-shared,bump,nogc-ratio,gc-ratio,gc-shared-ratio,bump-ratio\n") + all_results = [] + function write_line(key, results) + if length(all_results) == 0 + all_results = [Float64[] for _ in results] + end + write(file, "$key,$(join(results, ','))\n") + for (l, val) in zip(all_results, results) + push!(l, val) + end + end + for key in sort(collect(keys(results))) runs = results[key] gc_time = runs["gc"] / 1e6 @@ -32,8 +43,9 @@ open("strategies.csv", "w") do file gc_ratio = gc_time / nogc_time gc_shared_ratio = gc_shared_time / nogc_time bump_ratio = bump_time / nogc_time - write(file, "$key,$nogc_time,$gc_time,$gc_shared_time,$bump_time,1,$gc_ratio,$gc_shared_ratio,$bump_ratio\n") + write_line(key, [nogc_time, gc_time, gc_shared_time, bump_time, 1.0, gc_ratio, gc_shared_ratio, bump_ratio]) end + write_line("mean", mean.(all_results)) end open("gc-heap-sizes.csv", "w") do file diff --git a/strategies.csv b/strategies.csv deleted file mode 100644 index baa41e09..00000000 --- a/strategies.csv +++ /dev/null @@ -1,13 +0,0 @@ -benchmark,nogc,gc,gc-shared,bump,nogc-ratio,gc-ratio,gc-shared-ratio,bump-ratio -array expansion,517.36013,145.260881,395.999929,11.459003,1.0,0.28077324203548504,0.7654241330888795,0.02214898739877771 -array features,236.641882,134.679289,330.545038,18.5354245,1.0,0.5691270195357896,1.3968154546708682,0.07832689777205204 -arrays,5001.1374265,888.7860235,1356.95648,4.4352555,1.0,0.17771677674572697,0.2713295725107985,0.0008868493548084666 -binary tree,1993.06871,571.320489,950.1915835,33.504988,1.0,0.28665368440810046,0.4767480311805206,0.016810754105913386 -bitvector,3095.321124,690.0682245,3010.889644,25.915606,1.0,0.22293913841425392,0.9727228689309975,0.008372509656287282 -genetic algo,274.7332775,173.877077,936.568618,4.434724,1.0,0.6328941240108782,3.4090104647042625,0.01614192514410636 -linked 
list,3983.6005275,712.9961405,711.111524,4.4301,1.0,0.1789828411704366,0.17850974742346326,0.0011120843993813333 -matrix,52.3009975,152.4938,157.114092,36.3442955,1.0,2.9156958239658812,3.004036242329795,0.6949063543195328 -ssa opt,682.361637,238.392158,1425.6750165,4.306012,1.0,0.3493633655140551,2.0893246911827785,0.006310454407916839 -static arrays,454.761664,131.938243,180.566625,6.793038,1.0,0.29012613297148987,0.3970577102119144,0.014937578379517936 -stream queries,5138.209838,578.285759,4061.109831,4.434645,1.0,0.11254615463993824,0.7903744609583226,0.0008630719919617265 -mean,1948.1361103636364,401.64528040909096,1228.7934891818181,14.053917409090907,1.0,0.5469834821283668,1.2501230342902363,0.07825613335729599 From 7380683bcfefa8d73fac230b3df80f3381f18146 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 17 Jun 2019 10:14:15 +0200 Subject: [PATCH 143/146] Include array reduction benchmark in GC benchmark suite --- gc-benchmarks/run-all.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 185af2d7..6d4b3c4d 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -4,6 +4,7 @@ include("utils.jl") include("array-expansion.jl") include("array-features.jl") +include("array-reduction.jl") include("arrays.jl") include("binary-tree.jl") include("bitvector.jl") From c6390edda89caad1b774eaad4af3a7f3b00530a4 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Sat, 22 Jun 2019 16:15:05 +0200 Subject: [PATCH 144/146] Insert a root buffer overflow check --- src/gc.jl | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 1fce27c4..0564097b 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -264,11 +264,21 @@ Registers a GC frame with the garbage collector. 
@inline function push_gc_frame(gc_frame::GCFrame, size::UInt32) master_record = get_gc_master_record() + threadid = get_thread_id() + next_rootbuf_start = master_record.root_buffers + threadid * master_record.root_buffer_capacity * sizeof(Ptr{ObjectRef}) + new_rootbuf_finger = gc_frame + size * sizeof(ObjectRef) + + # Check that we have enough room to push the GC frame. + if new_rootbuf_finger >= next_rootbuf_start + @cuprintf("Root buffer overflow in thread %ld.\n", threadid) + return + end + # Update the root buffer tip. unsafe_store!( master_record.root_buffer_fingers, - gc_frame + size * sizeof(ObjectRef), - get_thread_id()) + new_rootbuf_finger, + threadid) return end From a91baefa9118ea07d22788037714b84904ed897d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 5 Jul 2019 12:23:51 +0200 Subject: [PATCH 145/146] Update benchmarks with pinned memory bump allocator --- gc-benchmarks/run-all.jl | 6 ++++-- gc-benchmarks/utils.jl | 14 +++++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 6d4b3c4d..359d80bc 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -23,7 +23,7 @@ gc_tags = [t for t in benchmark_tags if startswith(t, "gc")] # Also write them to a CSV for further analysis. 
open("strategies.csv", "w") do file - write(file, "benchmark,nogc,gc,gc-shared,bump,nogc-ratio,gc-ratio,gc-shared-ratio,bump-ratio\n") + write(file, "benchmark,nogc,gc,gc-shared,bump,bump-pinned,nogc-ratio,gc-ratio,gc-shared-ratio,bump-ratio,bump-pinned-ratio\n") all_results = [] function write_line(key, results) if length(all_results) == 0 @@ -41,10 +41,12 @@ open("strategies.csv", "w") do file gc_shared_time = runs["gc-shared"] / 1e6 nogc_time = runs["nogc"] / 1e6 bump_time = runs["bump"] / 1e6 + bump_pinned_time = runs["bump-pinned"] / 1e6 gc_ratio = gc_time / nogc_time gc_shared_ratio = gc_shared_time / nogc_time bump_ratio = bump_time / nogc_time - write_line(key, [nogc_time, gc_time, gc_shared_time, bump_time, 1.0, gc_ratio, gc_shared_ratio, bump_ratio]) + bump_pinned_ratio = bump_pinned_time / nogc_time + write_line(key, [nogc_time, gc_time, gc_shared_time, bump_time, bump_pinned_time, 1.0, gc_ratio, gc_shared_ratio, bump_ratio, bump_pinned_ratio]) end write_line("mean", mean.(all_results)) end diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index 4598c743..89c30271 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -44,9 +44,13 @@ macro cuda_sync(args...) local mode = get_gc_mode() if mode == "gc" CUDAnative.@cuda gc=true gc_config=gc_config $(args...) 
- elseif mode == "bump" + elseif startswith(mode, "bump") local capacity = 60 * MiB - local buf = Mem.alloc(Mem.DeviceBuffer, capacity) + if mode == "bump" + local buf = Mem.alloc(Mem.DeviceBuffer, capacity) + else + local buf = Mem.alloc(Mem.HostBuffer, capacity) + end local start_address = pointer(buf) local function init(kernel) CUDAnative.Runtime.bump_alloc_init!(kernel, start_address, capacity) @@ -73,7 +77,7 @@ benchmark_tags = [ "gc-30mb", "gc-shared-30mb", "gc-15mb", "gc-shared-15mb", "gc-10mb", "gc-shared-10mb", - "nogc", "bump" + "nogc", "bump", "bump-pinned" ] macro cuda_benchmark(name, ex) @@ -118,6 +122,10 @@ macro cuda_benchmark(name, ex) global gc_mode = "bump" $(ex) end + register_cuda_benchmark($name, "bump-pinned") do + global gc_mode = "bump-pinned" + $(ex) + end end) end From 4b76aec1a559157e5d55a1838e61d66dfe8ebb8e Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Sat, 6 Jul 2019 19:30:42 +0200 Subject: [PATCH 146/146] Write breakdown-computing code --- gc-benchmarks/run-breakdown.jl | 108 +++++++++++++++++++++++++++++++++ gc-benchmarks/utils-common.jl | 66 ++++++++++++++++++++ gc-benchmarks/utils.jl | 71 +--------------------- 3 files changed, 176 insertions(+), 69 deletions(-) create mode 100644 gc-benchmarks/run-breakdown.jl create mode 100644 gc-benchmarks/utils-common.jl diff --git a/gc-benchmarks/run-breakdown.jl b/gc-benchmarks/run-breakdown.jl new file mode 100644 index 00000000..1d1bd5b9 --- /dev/null +++ b/gc-benchmarks/run-breakdown.jl @@ -0,0 +1,108 @@ +using CUDAdrv, CUDAnative, Test, Statistics, JSON + +include("utils-common.jl") + +const benchmarks = Dict() +global benchmark_results = Dict() +global current_benchmark = nothing + +macro cuda_sync(args...) 
+ esc(quote + local heap_size = 10 * MiB + local local_arena_initial_size = div(heap_size, 10) + local global_arena_initial_size = heap_size - 8 * local_arena_initial_size + local gc_config = GCConfiguration( + local_arena_count=8, + local_arena_initial_size=local_arena_initial_size, + global_arena_initial_size=global_arena_initial_size) + local result = CUDAnative.@cuda gc=true gc_config=gc_config $(args...) + push!(benchmark_results[current_benchmark], result) + end) +end + +macro cuda_benchmark(name, ex) + esc(quote + benchmarks[$name] = (() -> $(ex)) + end) +end + +include("array-expansion.jl") +include("array-features.jl") +include("array-reduction.jl") +include("arrays.jl") +include("binary-tree.jl") +include("bitvector.jl") +include("linked-list.jl") +include("matrix.jl") +include("ssa-opt.jl") +include("static-arrays.jl") +include("stream-queries.jl") +include("genetic-algorithm.jl") + +function run_benchmarks() + cache_dir = mkpath("gc-benchmarks/breakdown-cache") + global benchmark_results = Dict() + results = Dict() + for (k, v) in pairs(benchmarks) + println(k) + cache_path = "$cache_dir/$(replace(k, " " => "-")).json" + if isfile(cache_path) + results[k] = open(cache_path, "r") do file + JSON.parse(file) + end + else + # Perform a dry run to ensure that compilations are cached. + global current_benchmark = k + benchmark_results[k] = [] + v() + + # Run the benchmarks for real. + benchmark_results[k] = [] + v() + while sum(map(x -> x.elapsed_time, benchmark_results[k])) < 90 + v() + end + + results[k] = [ + Dict( + "elapsed-time" => r.elapsed_time, + "collection-count" => r.collection_count, + "collection-poll-time" => r.collection_poll_time, + "collection-time" => r.collection_time) + for (k, r) in pairs(benchmark_results[k])] + + open(cache_path, "w") do file + JSON.print(file, results[k]) + end + end + end + return results +end + +results = run_benchmarks() +# Write results to a CSV file for further analysis. 
+open("breakdown.csv", "w") do file + write(file, "benchmark,collection-poll-ratio,collection-ratio,other-ratio\n") + all_results = [] + function write_line(key, results) + if length(all_results) == 0 + all_results = [Float64[] for _ in results] + end + write(file, "$key,$(join(results, ','))\n") + for (l, val) in zip(all_results, results) + push!(l, val) + end + end + + for key in sort(collect(keys(results))) + runs = results[key] + total_time = mean(getindex.(runs, "elapsed-time")) + poll_time = mean(getindex.(runs, "collection-poll-time")) + collection_time = mean(getindex.(runs, "collection-time")) + poll_ratio = poll_time / total_time + collection_ratio = collection_time / total_time + other_ratio = 1.0 - poll_ratio - collection_ratio + write_line(key, [poll_time, collection_ratio, other_ratio]) + end + write_line("mean", mean.(all_results)) +end diff --git a/gc-benchmarks/utils-common.jl b/gc-benchmarks/utils-common.jl new file mode 100644 index 00000000..334ae3c3 --- /dev/null +++ b/gc-benchmarks/utils-common.jl @@ -0,0 +1,66 @@ +module CUDArandom + +# A linear congruential pseudo-random number generator. +mutable struct LinearCongruentialGenerator + modulus::Int + a::Int + c::Int + state::Int +end + +LinearCongruentialGenerator(seed::Int) = LinearCongruentialGenerator(1 << 32, 1664525, 1013904223, seed) + +# Requests a pseudo-random number. +function next(generator::LinearCongruentialGenerator)::Int + generator.state = (generator.a * generator.state + generator.c) % generator.modulus + generator.state +end + +# Requests a pseudo-random number that is at least as great as `lower` +# and less than `upper`. 
+function next(generator::LinearCongruentialGenerator, lower::Int, upper::Int)::Int + lower + next(generator) % (upper - lower) +end + +end + +function upload!(destination, source) + Mem.copy!(destination, pointer(source), sizeof(source)) +end + +function download(::Type{T}, source, dims) where T + result = Array{T}(undef, dims) + Mem.copy!(pointer(result), source, sizeof(result)) + result +end + +const MiB = 1 << 20 +const CU_LIMIT_MALLOC_HEAP_SIZE = 0x02 +const BENCHMARK_HEAP_SIZE = 64 * MiB + +function set_malloc_heap_size(size::Integer) + CUDAdrv.@apicall( + :cuCtxSetLimit, + (Cint, Csize_t), + CU_LIMIT_MALLOC_HEAP_SIZE, + Csize_t(size)) +end + +""" + @sync ex +Run expression `ex` and synchronize the GPU afterwards. This is a CPU-friendly +synchronization, i.e. it performs a blocking synchronization without increasing CPU load. As +such, this operation is preferred over implicit synchronization (e.g. when performing a +memory copy) for high-performance applications. +It is also useful for timing code that executes asynchronously. 
+""" +macro sync(ex) + # Copied from https://github.com/JuliaGPU/CuArrays.jl/blob/8e45a27f2b12796f47683340845f98f017865676/src/utils.jl#L68-L86 + quote + local e = CuEvent(CUDAdrv.EVENT_BLOCKING_SYNC | CUDAdrv.EVENT_DISABLE_TIMING) + local ret = $(esc(ex)) + CUDAdrv.record(e) + CUDAdrv.synchronize(e) + ret + end +end diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index 89c30271..4fe2b540 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -1,5 +1,7 @@ import BenchmarkTools, JSON +include("utils-common.jl") + function get_gc_mode() try return gc_mode @@ -8,37 +10,6 @@ function get_gc_mode() end end -const MiB = 1 << 20 -const CU_LIMIT_MALLOC_HEAP_SIZE = 0x02 -const BENCHMARK_HEAP_SIZE = 64 * MiB - -function set_malloc_heap_size(size::Integer) - CUDAdrv.@apicall( - :cuCtxSetLimit, - (Cint, Csize_t), - CU_LIMIT_MALLOC_HEAP_SIZE, - Csize_t(size)) -end - -""" - @sync ex -Run expression `ex` and synchronize the GPU afterwards. This is a CPU-friendly -synchronization, i.e. it performs a blocking synchronization without increasing CPU load. As -such, this operation is preferred over implicit synchronization (e.g. when performing a -memory copy) for high-performance applications. -It is also useful for timing code that executes asynchronously. -""" -macro sync(ex) - # Copied from https://github.com/JuliaGPU/CuArrays.jl/blob/8e45a27f2b12796f47683340845f98f017865676/src/utils.jl#L68-L86 - quote - local e = CuEvent(CUDAdrv.EVENT_BLOCKING_SYNC | CUDAdrv.EVENT_DISABLE_TIMING) - local ret = $(esc(ex)) - CUDAdrv.record(e) - CUDAdrv.synchronize(e) - ret - end -end - macro cuda_sync(args...) 
esc(quote local mode = get_gc_mode() @@ -69,8 +40,6 @@ function register_cuda_benchmark(f, name, config) suites[name][config] = BenchmarkTools.@benchmarkable $f() setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $f()) teardown=(device_reset!()) evals=1 seconds=90 end -const MiB = 1 << 20 - benchmark_tags = [ "gc", "gc-shared", "gc-45mb", "gc-shared-45mb", @@ -150,39 +119,3 @@ function run_benchmarks() end return results end - -module CUDArandom - -# A linear congruential pseudo-random number generator. -mutable struct LinearCongruentialGenerator - modulus::Int - a::Int - c::Int - state::Int -end - -LinearCongruentialGenerator(seed::Int) = LinearCongruentialGenerator(1 << 32, 1664525, 1013904223, seed) - -# Requests a pseudo-random number. -function next(generator::LinearCongruentialGenerator)::Int - generator.state = (generator.a * generator.state + generator.c) % generator.modulus - generator.state -end - -# Requests a pseudo-random number that is at least as great as `lower` -# and less than `upper`. -function next(generator::LinearCongruentialGenerator, lower::Int, upper::Int)::Int - lower + next(generator) % (upper - lower) -end - -end - -function upload!(destination, source) - Mem.copy!(destination, pointer(source), sizeof(source)) -end - -function download(::Type{T}, source, dims) where T - result = Array{T}(undef, dims) - Mem.copy!(pointer(result), source, sizeof(result)) - result -end